In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
#Reading the training and testing data
df1 = pd.read_csv('training.csv')
df2 = pd.read_csv('test.csv')

In [3]:
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [4]:
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",IRRELEVANT
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",IRRELEVANT
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",FOREX MARKETS
3,9504,"research,jess,hit,anticip,comput,comput,comput...",IRRELEVANT
4,9505,"provid,provid,luxembourg,court,court,case,opin...",IRRELEVANT


In [5]:
#One Hot Encoding to quantitatively represent the topics
encoding = {'topic' : {'IRRELEVANT' : 0, 'ARTS' : 1, 'BIOGRAPHIES' : 2, 'DEFENCE' : 3, 'DOMESTIC MARKETS' : 4, 'FOREX MARKETS' : 5, 'HEALTH' : 6, 'MONEY MARKETS' : 7,'SCIENCE AND TECHNOLOGY' : 8, 'SHARE LISTINGS' : 9, 'SPORTS' :10,'BIOGRAPHIES PERSONALITIES PEOPLE':2,'ARTS CULTURE ENTERTAINMENT':1}}
encoding

{'topic': {'IRRELEVANT': 0,
  'ARTS': 1,
  'BIOGRAPHIES': 2,
  'DEFENCE': 3,
  'DOMESTIC MARKETS': 4,
  'FOREX MARKETS': 5,
  'HEALTH': 6,
  'MONEY MARKETS': 7,
  'SCIENCE AND TECHNOLOGY': 8,
  'SHARE LISTINGS': 9,
  'SPORTS': 10,
  'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
  'ARTS CULTURE ENTERTAINMENT': 1}}

In [6]:
#Replacing topics with the relevant numbers for training data
df1 = df1.replace(encoding)
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",5
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",7
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",10
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",5
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",0


In [7]:
#Replacing topics with the relevant numbers for testing data
df2 = df2.replace(encoding)
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",0
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",0
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",5
3,9504,"research,jess,hit,anticip,comput,comput,comput...",0
4,9505,"provid,provid,luxembourg,court,court,case,opin...",0


In [8]:
data = pd.concat([df1['article_words'],df2['article_words']])

In [9]:
len(data)

10000

In [10]:
# Create bag of words
count = TfidfVectorizer()
bag_of_words = count.fit(data)

In [11]:
#Training and Testing split - X and Y
x_train = df1['article_words']
y_train = df1['topic'].to_list()
x_test = df2['article_words']
y_test = df2['topic'].to_list()

In [12]:
#Transforming the testing and training
x_train = bag_of_words.transform(x_train)
x_test = bag_of_words.transform(x_test)

In [13]:
#Define the clasifier and fit the data
from sklearn.svm import LinearSVC
classifier = LinearSVC(dual=False)
model = classifier.fit(x_train,y_train)

In [14]:
#Predicting the data
y_predict = model.predict(x_test)

In [15]:
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.9501052631578948.

Accuracy Score for testing data : 0.772.



In [16]:
#Classification Report and Metrics Report
print(precision_score(y_test, y_predict,average = 'micro'))
print(recall_score(y_test, y_predict,average='micro'))
print(f1_score(y_test, y_predict, average='micro'))
print(f1_score(y_test, y_predict, average='macro'))
print(classification_report(y_test, y_predict))

0.772
0.772
0.772
0.5725405976822174
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       266
           1       0.33      0.67      0.44         3
           2       1.00      0.20      0.33        15
           3       0.90      0.69      0.78        13
           4       0.67      1.00      0.80         2
           5       0.44      0.33      0.38        48
           6       0.69      0.64      0.67        14
           7       0.55      0.64      0.59        69
           8       0.00      0.00      0.00         3
           9       0.50      0.43      0.46         7
          10       0.95      0.97      0.96        60

    accuracy                           0.77       500
   macro avg       0.63      0.59      0.57       500
weighted avg       0.77      0.77      0.76       500

