In [1]:
#Importing the required models 
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
#Reading the training and testing data
df1 = pd.read_csv('training.csv')
df2 = pd.read_csv('test.csv')

In [3]:
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [4]:
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",IRRELEVANT
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",IRRELEVANT
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",FOREX MARKETS
3,9504,"research,jess,hit,anticip,comput,comput,comput...",IRRELEVANT
4,9505,"provid,provid,luxembourg,court,court,case,opin...",IRRELEVANT


In [3]:
#One Hot Encoding to quantitatively represent the topics
encoding = {'topic' : {'IRRELEVANT' : 0, 'ARTS' : 1, 'BIOGRAPHIES' : 2, 'DEFENCE' : 3, 'DOMESTIC MARKETS' : 4, 'FOREX MARKETS' : 5, 'HEALTH' : 6, 'MONEY MARKETS' : 7,'SCIENCE AND TECHNOLOGY' : 8, 'SHARE LISTINGS' : 9, 'SPORTS' :10,'BIOGRAPHIES PERSONALITIES PEOPLE':2,'ARTS CULTURE ENTERTAINMENT':1}}
encoding

{'topic': {'IRRELEVANT': 0,
  'ARTS': 1,
  'BIOGRAPHIES': 2,
  'DEFENCE': 3,
  'DOMESTIC MARKETS': 4,
  'FOREX MARKETS': 5,
  'HEALTH': 6,
  'MONEY MARKETS': 7,
  'SCIENCE AND TECHNOLOGY': 8,
  'SHARE LISTINGS': 9,
  'SPORTS': 10,
  'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
  'ARTS CULTURE ENTERTAINMENT': 1}}

In [4]:
#Replacing topics with the relevant numbers for training data
df1 = df1.replace(encoding)
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",5
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",7
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",10
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",5
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",0


In [5]:
#Replacing topics with the relevant numbers for testing data
df2 = df2.replace(encoding)
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",0
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",0
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",5
3,9504,"research,jess,hit,anticip,comput,comput,comput...",0
4,9505,"provid,provid,luxembourg,court,court,case,opin...",0


In [6]:
#concatenating the article_words of training and test data for vectorization
data = pd.concat([df1['article_words'],df2['article_words']])

In [9]:
len(data)

10000

In [22]:
# Create bag of words
ngram_range = (1,2)
min_df = 10    #When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
max_df = 1.   #max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
max_features = 1000
#TFID Vectorizer with the paramters
count= TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
#Fitting the article_words of training data into bag of words 
bag_of_words = count.fit(df1['article_words'])

In [23]:
#Training and Testing split - X and Y
x_train = df1['article_words']
y_train = df1['topic'].to_list()
x_test = df2['article_words']
y_test = df2['topic'].to_list()

In [24]:
#Transforming the testing and training
x_train = bag_of_words.transform(x_train)
x_test = bag_of_words.transform(x_test)

In [29]:
#Define the clasifier and fit the data
from sklearn.svm import LinearSVC
classifier = LinearSVC(dual=False,random_state=0, tol=1e-5)
model = classifier.fit(x_train,y_train)

In [30]:
#Predicting the data
y_predict = model.predict(x_test)

In [31]:
#calculating the accuracy of training and test data before cross validation
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8678947368421053.

Accuracy Score for testing data : 0.766.



In [32]:
#Classification Report and Metrics Report bef0re hyper parameter tuning
print(precision_score(y_test, y_predict,average = 'micro'))
print(recall_score(y_test, y_predict,average='micro'))
print(f1_score(y_test, y_predict, average='micro'))
print(f1_score(y_test, y_predict, average='macro'))
print(classification_report(y_test, y_predict))

0.766
0.766
0.766
0.5574626223521341
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       266
           1       0.33      0.67      0.44         3
           2       1.00      0.20      0.33        15
           3       1.00      0.54      0.70        13
           4       0.33      0.50      0.40         2
           5       0.50      0.35      0.41        48
           6       0.88      0.50      0.64        14
           7       0.51      0.62      0.56        69
           8       0.33      0.33      0.33         3
           9       0.50      0.43      0.46         7
          10       0.97      0.98      0.98        60

    accuracy                           0.77       500
   macro avg       0.65      0.55      0.56       500
weighted avg       0.78      0.77      0.76       500



In [21]:
#Performing cross validation using GridSearchCV() and printing out the best estimator
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train,y_train)
print(grid.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.8s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.8s remaining:    0.0s


[CV] .................... C=0.1, gamma=1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.6s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 22.8min finished


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [25]:
#Re-training the classifier after hyper paramter tuning
from sklearn.svm import SVC 
classifier = SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model = classifier.fit(x_train,y_train)

In [26]:
#Predicting the data
y_predict = model.predict(x_test)

In [27]:
#Calculating the accuracy score for the training and test data after hyper paramter tuning
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8493684210526316.

Accuracy Score for testing data : 0.766.



In [28]:
#Classification report after hyper parameter tuning
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       266
           1       1.00      0.33      0.50         3
           2       1.00      0.27      0.42        15
           3       1.00      0.62      0.76        13
           4       0.33      0.50      0.40         2
           5       0.47      0.33      0.39        48
           6       0.80      0.57      0.67        14
           7       0.52      0.64      0.57        69
           8       0.00      0.00      0.00         3
           9       0.50      0.43      0.46         7
          10       0.95      0.98      0.97        60

    accuracy                           0.77       500
   macro avg       0.67      0.51      0.55       500
weighted avg       0.77      0.77      0.76       500

