In [121]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [133]:
#Reading the training and testing data
df1 = pd.read_csv('training.csv')
df2 = pd.read_csv('test.csv')

In [134]:
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",FOREX MARKETS
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",MONEY MARKETS
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",SPORTS
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",FOREX MARKETS
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",IRRELEVANT


In [135]:
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",IRRELEVANT
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",IRRELEVANT
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",FOREX MARKETS
3,9504,"research,jess,hit,anticip,comput,comput,comput...",IRRELEVANT
4,9505,"provid,provid,luxembourg,court,court,case,opin...",IRRELEVANT


In [136]:
#One Hot Encoding to quantitatively represent the topics
encoding = {'topic' : 
            {'IRRELEVANT' : 0,
             'ARTS CULTURE ENTERTAINMENT':1,
             'BIOGRAPHIES PERSONALITIES PEOPLE':2,
             'DEFENCE' : 3, 
             'DOMESTIC MARKETS' : 4, 
             'FOREX MARKETS' : 5, 
             'HEALTH' : 6, 
             'MONEY MARKETS' : 7,
             'SCIENCE AND TECHNOLOGY' : 8, 
             'SHARE LISTINGS' : 9, 
             'SPORTS' :10,
             }}
encoding

{'topic': {'IRRELEVANT': 0,
  'ARTS CULTURE ENTERTAINMENT': 1,
  'BIOGRAPHIES PERSONALITIES PEOPLE': 2,
  'DEFENCE': 3,
  'DOMESTIC MARKETS': 4,
  'FOREX MARKETS': 5,
  'HEALTH': 6,
  'MONEY MARKETS': 7,
  'SCIENCE AND TECHNOLOGY': 8,
  'SHARE LISTINGS': 9,
  'SPORTS': 10}}

In [137]:
#Replacing topics with the relevant numbers for training data
df1 = df1.replace(encoding)
df1.head()

Unnamed: 0,article_number,article_words,topic
0,1,"open,absent,cent,cent,cent,stock,inflow,rate,k...",5
1,2,"morn,stead,end,end,day,day,day,patch,patch,pat...",7
2,3,"socc,socc,world,world,recent,law,fifa,fifa,fif...",10
3,4,"open,forint,forint,forint,forint,cent,cent,ste...",5
4,5,"morn,complet,weekend,minut,minut,minut,arrow,d...",0


In [138]:
#Replacing topics with the relevant numbers for testing data
df2 = df2.replace(encoding)
df2.head()

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",0
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",0
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",5
3,9504,"research,jess,hit,anticip,comput,comput,comput...",0
4,9505,"provid,provid,luxembourg,court,court,case,opin...",0


In [139]:
df2.head(10)

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",0
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",0
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",5
3,9504,"research,jess,hit,anticip,comput,comput,comput...",0
4,9505,"provid,provid,luxembourg,court,court,case,opin...",0
5,9506,"option,cent,cent,cent,narongchai,narongchai,re...",5
6,9507,"open,world,world,minist,minist,half,art,infras...",0
7,9508,"open,open,open,open,open,open,nick,world,ameri...",10
8,9509,"cent,figur,jan,jan,calcul,period,fall,end,mill...",0
9,9510,"ing,gmt,stock,stock,stock,pakistan,blu,blu,blu...",0


In [129]:
df2['topic'] = y_predict

Unnamed: 0,article_number,article_words,topic
0,9501,"world,complet,pharmaceut,tianjin,tianjin,chin,...",0
1,9502,"copy,sunday,weekend,ec,friday,eu,includ,limit,...",0
2,9503,"heavy,heavy,gabriel,morn,morn,equit,cent,cent,...",5
3,9504,"research,jess,hit,anticip,comput,comput,comput...",0
4,9505,"provid,provid,luxembourg,court,court,case,opin...",0
5,9506,"option,cent,cent,cent,narongchai,narongchai,re...",5
6,9507,"open,world,world,minist,minist,half,art,infras...",0
7,9508,"open,open,open,open,open,open,nick,world,ameri...",10
8,9509,"cent,figur,jan,jan,calcul,period,fall,end,mill...",0
9,9510,"ing,gmt,stock,stock,stock,pakistan,blu,blu,blu...",0


In [59]:
# Create bag of words
ngram_range = (1,2)
min_df = 0.    #When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
max_df = 1.   #max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
max_features = 1500
#count = TfidfVectorizer()
count= TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True,
                        use_idf=True)
bag_of_words = count.fit(df1['article_words'])

In [60]:
#Training and Testing split - X and Y
x_train = df1['article_words']
y_train = df1['topic'].to_list()
x_test = df2['article_words']
y_test = df2['topic'].to_list()

In [61]:
#Transforming the testing and training
x_train = bag_of_words.transform(x_train)
x_test = bag_of_words.transform(x_test)

In [62]:
from sklearn.svm import SVC 
classifier = SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model = classifier.fit(x_train,y_train)

In [63]:
#Predicting the data
y_predict = model.predict(x_test)

In [64]:
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8671578947368421.

Accuracy Score for testing data : 0.758.



In [65]:
#Classification Report and Metrics Report
print(precision_score(y_test, y_predict,average = 'micro'))
print(recall_score(y_test, y_predict,average='micro'))
print(f1_score(y_test, y_predict, average='micro'))
print(f1_score(y_test, y_predict, average='macro'))
print(classification_report(y_test, y_predict))

0.758
0.758
0.7579999999999999
0.50630474960344
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       266
           1       0.50      0.67      0.57         3
           2       0.75      0.20      0.32        15
           3       0.78      0.54      0.64        13
           4       0.00      0.00      0.00         2
           5       0.50      0.31      0.38        48
           6       0.83      0.71      0.77        14
           7       0.49      0.67      0.56        69
           8       0.00      0.00      0.00         3
           9       0.60      0.43      0.50         7
          10       0.95      0.98      0.97        60

    accuracy                           0.76       500
   macro avg       0.57      0.49      0.51       500
weighted avg       0.75      0.76      0.75       500



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [21]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['linear']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train,y_train)
print(grid.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.8s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.8s remaining:    0.0s


[CV] .................... C=0.1, gamma=1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=  31.0s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV] .................. C=0.1, gamma=0.1, kernel=linear, total=  30.7s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.8s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=  30.6s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 22.8min finished


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [25]:
from sklearn.svm import SVC 
classifier = SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model = classifier.fit(x_train,y_train)

In [26]:
#Predicting the data
y_predict = model.predict(x_test)

In [27]:
train_accuracy_score = accuracy_score(y_train,model.predict(x_train))
test_accuracy_score = accuracy_score(y_test,model.predict(x_test))
print(f'Accuracy Score for training data : {train_accuracy_score}.\n')
print(f'Accuracy Score for testing data : {test_accuracy_score}.\n')

Accuracy Score for training data : 0.8493684210526316.

Accuracy Score for testing data : 0.766.



In [28]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       266
           1       1.00      0.33      0.50         3
           2       1.00      0.27      0.42        15
           3       1.00      0.62      0.76        13
           4       0.33      0.50      0.40         2
           5       0.47      0.33      0.39        48
           6       0.80      0.57      0.67        14
           7       0.52      0.64      0.57        69
           8       0.00      0.00      0.00         3
           9       0.50      0.43      0.46         7
          10       0.95      0.98      0.97        60

    accuracy                           0.77       500
   macro avg       0.67      0.51      0.55       500
weighted avg       0.77      0.77      0.76       500



## Cosine similarity

In [170]:
#ds = pd.read_csv("/home/nikita/Downloads/sample-data.csv")
def recommendation(test,y_predict,train_data,topic_dict):
    test_data = test.copy(deep=True)
    test_data['topic'] = y_predict
    topic_list = [i for i in range(11)]
    tf = TfidfVectorizer()
    model = tf.fit(train_data['article_words'])
    for t in topic_list:
        if not test_data[test_data['topic']==t].empty:        
            tfidf_train = model.transform(train_data[train_data['topic']==t]['article_words']).toarray()
            tfidf_test = model.transform(test_data[test_data['topic']==t]['article_words']).toarray()
            test_article_number = test_data[test_data['topic']==t]['article_number'].tolist()
            cosine = cosine_similarity(tfidf_test,tfidf_train)
            cosine = np.sort(cosine)
            suggested_list = np.argsort(cosine[:,-1])[-10:].tolist()
            article_number = ",".join([str(test_article_number[i]) for i in suggested_list])         
            print(f"For topic {topic_dict[t]} recommending article {article_number}")
            


In [169]:
topic_dict = {encoding['topic'][k] : k for k in encoding['topic']}
topic_dict
recommendation(df2,y_predict,df1,topic_dict)

For topic IRRELEVANT recommending article 9874,9951,9713,9655,9907,9955,9719,9642,9553,9716
For topic ARTS CULTURE ENTERTAINMENT recommending article 9703,9830,9933,9952
For topic BIOGRAPHIES PERSONALITIES PEOPLE recommending article 9896,9526,9988,9940
For topic DEFENCE recommending article 9607,9770,9616,9670,9559,9759,9987,9576,9773
For topic DOMESTIC MARKETS recommending article 9640,9989
For topic FOREX MARKETS recommending article 9671,9565,9530,9529,9977,9551,9986,9682,9588,9798
For topic HEALTH recommending article 9982,9947,9810,9661,9873,9621,9929,9807,9833,9926
For topic MONEY MARKETS recommending article 9961,9808,9678,9766,9725,9555,9761,9853,9634,9586
For topic SHARE LISTINGS recommending article 9562,9666,9715,9601,9518
For topic SPORTS recommending article 9787,9992,9791,9754,9630,9752,9608,9513,9520,9800


In [54]:
# from sklearn import svm
# from sklearn.model_selection import GridSearchCV
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     print(grid_search.best_params_)

In [56]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_train,y_train)
print(grid.best_estimator_)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  52.2s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   52.2s remaining:    0.0s


[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  52.3s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....................... C=0.1, gamma=1, kernel=rbf, total=  51.5s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total= 1.4min
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ................... C=0.1, gamma=1, kernel=sigmoid, total=  32.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] ................... C=0.1, gamma=1, kernel=sigmoid, total=  32.0s
[CV] C=0.1, gamma=1, kernel=sigmoid ..................................
[CV] .

[CV] ..................... C=1, gamma=0.01, kernel=poly, total=  31.4s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total=  41.7s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total=  49.3s
[CV] C=1, gamma=0.01, kernel=sigmoid .................................
[CV] .................. C=1, gamma=0.01, kernel=sigmoid, total= 1.7min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total= 8.8min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total= 1.0min
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ..................... C=1, gamma=0.001, kernel=rbf, total=  44.9s
[CV] C=1, gamma=0.001, kernel=poly ...................................
[CV] .

[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  33.8s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  33.3s
[CV] C=100, gamma=0.1, kernel=rbf ....................................
[CV] ..................... C=100, gamma=0.1, kernel=rbf, total=  34.2s
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=poly ...................................
[CV] .................... C=100, gamma=0.1, kernel=poly, total= 1.4min
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] ................. C=100, gamma=0.1, kernel=sigmoid, total=  31.2s
[CV] C=100, gamma=0.1, kernel=sigmoid ................................
[CV] .

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 132.3min finished


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='sigmoid',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
