In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC 
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
classification_report,cohen_kappa_score,make_scorer,roc_auc_score

In [2]:
#Reading the training and testing data
df1 = pd.read_csv('training.csv')
df2 = pd.read_csv('test.csv')

In [3]:
#One Hot Encoding to quantitatively represent the topics
encoding = {'topic' : 
            {'IRRELEVANT' : 0,
             'ARTS CULTURE ENTERTAINMENT':1,
             'BIOGRAPHIES PERSONALITIES PEOPLE':2,
             'DEFENCE' : 3, 
             'DOMESTIC MARKETS' : 4, 
             'FOREX MARKETS' : 5, 
             'HEALTH' : 6, 
             'MONEY MARKETS' : 7,
             'SCIENCE AND TECHNOLOGY' : 8, 
             'SHARE LISTINGS' : 9, 
             'SPORTS' :10,
             }}
encoding

{'topic': {'IRRELEVANT': 0,
  'ARTS CULTURE ENTERTAINMENT': 1,
  'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
  'DEFENCE': 3,
  'DOMESTIC MARKETS': 4,
  'FOREX MARKETS': 5,
  'HEALTH': 6,
  'MONEY MARKETS': 7,
  'SCIENCE AND TECHNOLOGY': 8,
  'SHARE LISTINGS': 9,
  'SPORTS': 10}}

In [50]:
#Replacing topics with the relevant numbers for training data
#df1 = df1.replace(encoding)
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [51]:
#Replacing topics with the relevant numbers for testing data
#y_test_name = df2['topic']
#df2 = df2.replace(encoding)
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",IRRELEVANT
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",IRRELEVANT
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",FOREX MARKETS
3,9504,"research,jess,hit,anticip,comput,comput,comput...",IRRELEVANT
4,9505,"provid,provid,luxembourg,court,court,case,opin...",IRRELEVANT


In [4]:
# Create bag of words
ngram_range = (1,2)
min_df = 10    #When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
max_df = 1.   #max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
max_features = 1500
#count = TfidfVectorizer()
count= TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True,
                        use_idf=True)
bag_of_words = count.fit(df1['article_words'])

In [5]:
#Training and Testing split - X and Y
x_train = df1['article_words']
y_train = df1['topic'].to_list()
x_test = df2['article_words']
y_test = df2['topic'].to_list()

In [6]:
#Transforming the testing and training
x_train = bag_of_words.transform(x_train)
x_test = bag_of_words.transform(x_test)

In [7]:

classifier = SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model = classifier.fit(x_train,y_train)


## Cross Validation on selecting metrics

In [36]:
classifiers = [
    SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False),
    RFC(n_estimators=200, #the number of trees in the forest 9
         criterion='gini', #a way to decide the attribute
         max_depth=80, #max depth of the tree
         min_samples_split=9, #The minimum number of samples required to split an internal node
         min_samples_leaf=1, #The minimum number of samples required to be at a leaf node
         max_features='sqrt', #The number of features to consider when looking for the best split:
         max_leaf_nodes=None,
         bootstrap=False,
         warm_start=False), #When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.
    MultinomialNB(alpha=1.0, class_prior=None, fit_prior='True'),
    KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='distance')
]

def cohen(y_true,y_predict): return cohen_kappa_score(y_true,y_predict)
def accuracy(y_true,y_predict): return accuracy_score(y_true,y_predict)
def precision(y_true,y_predict): return precision_score(y_true,y_predict,average='macro')
def recall(y_true,y_predict): return recall_score(y_true,y_predict,average='macro')
#def roc_auc(y_true,y_predict): return roc_auc_score(y_true,y_predict,average='macro')
scoring = {'cohen':make_scorer(cohen),'accuracy':make_scorer(accuracy),
           'precision':make_scorer(precision),'recall':make_scorer(recall)}
classifier_accuracy_list = []
for i, classifier in enumerate(classifiers):
        # split the dataset into 5 folds; then test the classifier against each fold one by one
        accuracies = cross_validate(classifier, x_train, y_train, cv=5, scoring=scoring)
        #classifier_accuracy_list.append((accuracies.mean(), type(classifier).__name__))
        classifier_accuracy_list.append((accuracies, type(classifier).__name__))        

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [50]:
for e in classifier_accuracy_list:
    print(f"classifier {e[1]} \n cohen_mean:{np.mean(e[0]['test_cohen'])},accuracy_mean: {np.mean(e[0]['test_accuracy'])},\
                                 precision_mean:{np.mean(e[0]['test_precision'])},recall_mean:{np.mean(e[0]['test_recall'])}")

classifier SVC 
 cohen_mean:0.6715813798798351,accuracy_mean: 0.7766303887840691,                                 precision_mean:0.7150691485941267,recall_mean:0.557924061423529
classifier RandomForestClassifier 
 cohen_mean:0.6213879156962081,accuracy_mean: 0.7510590966940207,                                 precision_mean:0.732372305476205,recall_mean:0.4034624129358716
classifier MultinomialNB 
 cohen_mean:0.6152752074072937,accuracy_mean: 0.7410603270410656,                                 precision_mean:0.5870507002421852,recall_mean:0.40425414539714444
classifier KNeighborsClassifier 
 cohen_mean:0.6450623282008181,accuracy_mean: 0.7536841582707059,                                 precision_mean:0.6887429887373889,recall_mean:0.5476950469432813


[({'fit_time': array([15.2449069 , 15.34216809, 15.0947392 , 15.13932395, 15.12666702]),
   'score_time': array([16.63713312, 16.85792518, 17.07731414, 17.06618714, 16.90793872]),
   'test_cohen': array([0.66805066, 0.68037011, 0.66861481, 0.67099632, 0.66987501]),
   'test_accuracy': array([0.77573529, 0.78098739, 0.77327722, 0.77689873, 0.7762533 ]),
   'test_precision': array([0.6731202 , 0.72841684, 0.69689268, 0.72329267, 0.75362335]),
   'test_recall': array([0.53541098, 0.58114242, 0.56374538, 0.55980345, 0.54951807])},
  'SVC'),
 ({'fit_time': array([23.03527713, 22.7660141 , 23.00905919, 22.60920095, 22.61260986]),
   'score_time': array([0.88352299, 0.91984105, 0.86381316, 0.83330607, 0.83615112]),
   'test_cohen': array([0.62695393, 0.61357241, 0.61211948, 0.6274649 , 0.62682885]),
   'test_accuracy': array([0.75472689, 0.74369748, 0.74539716, 0.75685654, 0.75461741]),
   'test_precision': array([0.70966642, 0.70601845, 0.74182318, 0.74271743, 0.76163605]),
   'test_recall':

In [21]:
#Predicting the data
y_predict = model.predict(x_test)

In [28]:
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8671578947368421.

Accuracy Score for testing data : 0.758.



In [63]:
#Classification Report and Metrics Report
print(precision_score(y_test, y_predict,average = 'micro'))
print(recall_score(y_test, y_predict,average='micro'))
print(f1_score(y_test, y_predict, average='micro'))
print(f1_score(y_test, y_predict, average='macro'))
print(classification_report(y_test, y_predict))

0.764
0.764
0.764
0.5104398689123325
                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      0.67      0.57         3
BIOGRAPHIES PERSONALITIES PEOPLE       0.75      0.20      0.32        15
                         DEFENCE       0.78      0.54      0.64        13
                DOMESTIC MARKETS       0.00      0.00      0.00         2
                   FOREX MARKETS       0.52      0.35      0.42        48
                          HEALTH       0.83      0.71      0.77        14
                      IRRELEVANT       0.85      0.88      0.87       266
                   MONEY MARKETS       0.49      0.67      0.57        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.60      0.43      0.50         7
                          SPORTS       0.95      0.98      0.97        60

                        accuracy                           0.76       500

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [21]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train,y_train)
print(grid.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.8s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.8s remaining:    0.0s


[CV] .................... C=0.1, gamma=1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.6s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 22.8min finished


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [64]:
from sklearn.svm import SVC 
classifier = SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model = classifier.fit(x_train,y_train)

In [65]:
#Predicting the data
y_predict = model.predict(x_test)

In [27]:
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8493684210526316.

Accuracy Score for testing data : 0.766.



In [66]:
print(classification_report(y_test, y_predict))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      0.67      0.57         3
BIOGRAPHIES PERSONALITIES PEOPLE       0.75      0.20      0.32        15
                         DEFENCE       0.78      0.54      0.64        13
                DOMESTIC MARKETS       0.00      0.00      0.00         2
                   FOREX MARKETS       0.52      0.35      0.42        48
                          HEALTH       0.83      0.71      0.77        14
                      IRRELEVANT       0.85      0.88      0.87       266
                   MONEY MARKETS       0.49      0.67      0.56        69
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         3
                  SHARE LISTINGS       0.60      0.43      0.50         7
                          SPORTS       0.95      0.98      0.97        60

                        accuracy                           0.76       500
                       macro avg    

## Cosine similarity

In [108]:
#ds = pd.read_csv("/home/nikita/Downloads/sample-data.csv")
def recommendation(test,y_predict,train_data,topic_dict,topic_list):
    test_data = test.copy(deep=True)
    test_data['topic'] = y_predict
    #topic_list = [i for i in range(11)]
    tf = TfidfVectorizer()
    model = tf.fit(train_data['article_words'])
    article_list = []
    for t in topic_list:
        if not test_data[test_data['topic']==t].empty and t != 'IRRELEVANT':        
            tfidf_train = model.transform(train_data[train_data['topic']==t]['article_words']).toarray()
            tfidf_test = model.transform(test_data[test_data['topic']==t]['article_words']).toarray()
            test_article_number = test_data[test_data['topic']==t]['article_number'].tolist()
            cosine = cosine_similarity(tfidf_test,tfidf_train)
            cosine = np.sort(cosine)
            suggested_list = np.argsort(cosine[:,-1])[-10:].tolist()
            article_number = [test_article_number[i] for i in suggested_list]
            article_list += article_number
            article_number = ",".join([str(test_article_number[i]) for i in suggested_list])         
            print(f"For topic {t} recommending article {article_number}")
    y_rec_true = [(test[test['article_number']==i]['topic']).tolist()[0] for i in article_list]
    y_rec_predict = [(test_data[test_data['article_number']==i]['topic']).tolist()[0] for i in article_list]
    return y_rec_true,y_rec_predict

In [113]:
topic_dict = {encoding['topic'][k] : k for k in encoding['topic']}
topic_dict
y_rec_true,y_rec_predict = recommendation(df2,y_predict,df1,topic_dict,encoding['topic'])
d = {'y_rec_true':y_rec_true,'y_rec_predict':y_rec_predict}
# Dataframe of the recommendation topic and the true topic
df_rec = pd.DataFrame(d)

For topic ARTS CULTURE ENTERTAINMENT recommending article 9703,9830,9933,9952
For topic BIOGRAPHIES PERSONALITIES PEOPLE recommending article 9896,9526,9988,9940
For topic DEFENCE recommending article 9607,9770,9616,9670,9559,9759,9987,9576,9773
For topic DOMESTIC MARKETS recommending article 9640,9989
For topic FOREX MARKETS recommending article 9671,9565,9530,9529,9977,9551,9986,9682,9588,9798
For topic HEALTH recommending article 9982,9947,9810,9661,9873,9621,9929,9807,9833,9926
For topic MONEY MARKETS recommending article 9961,9808,9678,9766,9725,9555,9761,9853,9634,9586
For topic SHARE LISTINGS recommending article 9562,9666,9715,9601,9518
For topic SPORTS recommending article 9787,9992,9791,9754,9630,9752,9608,9513,9520,9800


In [111]:
print(classification_report(y_rec_true, y_rec_predict))

                                  precision    recall  f1-score   support

      ARTS CULTURE ENTERTAINMENT       0.50      1.00      0.67         2
BIOGRAPHIES PERSONALITIES PEOPLE       0.75      0.60      0.67         5
                         DEFENCE       0.78      1.00      0.88         7
                DOMESTIC MARKETS       0.00      0.00      0.00         0
                   FOREX MARKETS       0.60      0.60      0.60        10
                          HEALTH       0.80      1.00      0.89         8
                      IRRELEVANT       0.00      0.00      0.00        10
                   MONEY MARKETS       0.50      0.56      0.53         9
          SCIENCE AND TECHNOLOGY       0.00      0.00      0.00         1
                  SHARE LISTINGS       0.60      1.00      0.75         3
                          SPORTS       0.90      1.00      0.95         9

                        accuracy                           0.67        64
                       macro avg    

In [56]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train,y_train)
print(grid.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  52.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.2s remaining:    0.0s


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  52.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  51.5s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ................... C=0.1, gamma=1, kernel=sigmoid, total=  32.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ................... C=0.1, gamma=1, kernel=sigmoid, total=  32.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] .

[CV] ..................... C=1, gamma=0.01, kernel=poly, total=  31.4s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total=  41.7s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total=  49.3s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total= 1.7min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total= 8.8min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total= 1.0min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  44.9s
[CV] C=1, gamma=0.001, kernel=poly ...................................
[CV] .

[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  33.8s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  33.3s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  34.2s
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] ................. C=100, gamma=0.1, kernel=sigmoid, total=  31.2s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 132.3min finished


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
