# Libraries to import

In [1]:
import nltk
import numpy as np
import scipy
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn_deltatfidf import DeltaTfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
import random
from math import ceil, floor
from nltk.classify import ClassifierI
from statistics import mode, median, mean
from os import listdir
from os.path import isfile, join
import json
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import pickle
import gensim

# Reading input training file

In [2]:
# Load the training articles. The JSON document exposes three mappings
# ('body', 'title', 'tags'); all three are indexed below with the keys of 'tags'.
T_Data_Path = 'train_articles.json'
# JSON text is UTF-8; naming the encoding avoids depending on the platform default.
with open(T_Data_Path, encoding='utf-8') as File:
    Raw_T_Data = json.load(File)

Documents_D = Raw_T_Data['body']
Titles_D = Raw_T_Data['title']
Tags_D = Raw_T_Data['tags']

# Every distinct tag in order of first appearance. The companion set keeps the
# membership test O(1) instead of rescanning the growing list for each tag.
Tags_Vector = []
Seen_Tags = set()
for Index in Tags_D:
    for T in Tags_D[Index]:
        if T not in Seen_Tags:
            Seen_Tags.add(T)
            Tags_Vector.append(T)

# Parallel lists: position i of each list refers to the same article.
Documents = [Documents_D[Index] for Index in Tags_D]
Titles = [Titles_D[Index] for Index in Tags_D]
Tags = [Tags_D[Index] for Index in Tags_D]
len(Documents), len(Titles), len(Tags)

(3426, 3426, 3426)

In [3]:
# Alias -> canonical tag lookup, written as (canonical, aliases) groups so that
# each family of near-duplicate tags can be read on a single line.
# The groups and aliases are listed in the order the flat mapping is built in.
Tag_Convertor = {
    Alias: Canonical
    for Canonical, Aliases in (
        ('Database', ('Mongodb', 'Database Development', 'Sql', 'MySQL', 'Postgresql', 'NoSQL')),
        ('Family', ('Parenting', 'Childhood', 'Children')),
        ('Graphic Design', ('Logo Design',)),
        ('Deep Learning', ('Neural Networks',)),
        ('Entrepreneurship', ('Startup Lessons', 'Entrepreneur')),
        ('Python', ('Jupyter',)),
        ('Information Security', ('Application Security', 'Ransomware')),
        ('ML', ('Machine Learning',)),
        ('Android', ('AndroidDev', 'Android Architecture', 'Android Studio', 'Android Apps')),
        ('Web Development', ('Web',)),
        ('AI', ('Artificial Intelligence',)),
        ('Software', ('Open Source Software', 'Software Architecture', 'Software Development',
                      'Software Engineering', 'Software Testing')),
        ('Career', ('Career Change', 'Career Advice', 'Careers')),
        ('UX', ('Ux Trends',)),
        ('Writing', ('Storytelling',)),
        ('Movies', ('Film',)),
        ('Startup', ('Startups',)),
        ('UI', ('UI Design',)),
        ('Comedy', ('Humor', 'Satire', 'Humour', 'Jokes')),
        ('Life', ('Life Lessons',)),
        ('Christmas', ('Christmas Costume',)),
        ('Design', ('Design Digest',)),
    )
    for Alias in Aliases
}

In [4]:
# Display the alias -> canonical tag mapping for inspection.
Tag_Convertor

{'Android Apps': 'Android',
 'Android Architecture': 'Android',
 'Android Studio': 'Android',
 'AndroidDev': 'Android',
 'Application Security': 'Information Security',
 'Artificial Intelligence': 'AI',
 'Career Advice': 'Career',
 'Career Change': 'Career',
 'Careers': 'Career',
 'Childhood': 'Family',
 'Children': 'Family',
 'Christmas Costume': 'Christmas',
 'Database Development': 'Database',
 'Design Digest': 'Design',
 'Entrepreneur': 'Entrepreneurship',
 'Film': 'Movies',
 'Humor': 'Comedy',
 'Humour': 'Comedy',
 'Jokes': 'Comedy',
 'Jupyter': 'Python',
 'Life Lessons': 'Life',
 'Logo Design': 'Graphic Design',
 'Machine Learning': 'ML',
 'Mongodb': 'Database',
 'MySQL': 'Database',
 'Neural Networks': 'Deep Learning',
 'NoSQL': 'Database',
 'Open Source Software': 'Software',
 'Parenting': 'Family',
 'Postgresql': 'Database',
 'Ransomware': 'Information Security',
 'Satire': 'Comedy',
 'Software Architecture': 'Software',
 'Software Development': 'Software',
 'Software Engineer

# Filter documents that have no tags

In [5]:
# Drop every article whose tag list is empty, keeping the three parallel
# sequences aligned.
#
# Plain list filtering is used on purpose: handing the ragged list of tag lists
# to np.delete forces a conversion to ndarray, which recent NumPy releases
# reject for inhomogeneous shapes (and which would be flattened if every
# article happened to carry the same number of tags).
NoTags = [i for i, TL in enumerate(Tags) if len(TL) == 0]
Untagged = set(NoTags)

Documents = [Doc for i, Doc in enumerate(Documents) if i not in Untagged]
Titles = [Title for i, Title in enumerate(Titles) if i not in Untagged]
Tags = [TL for i, TL in enumerate(Tags) if i not in Untagged]

len(Documents), len(Titles), len(Tags)

(3218, 3218, 3218)

# Explore class (tag) frequencies

In [6]:
import operator

# Tally how many times each tag occurs across all tag lists.
Tag_Counts = {}
for Tag_List in Tags:
    for Tag in Tag_List:
        Tag_Counts[Tag] = Tag_Counts.get(Tag, 0) + 1

# (tag, count) pairs, most frequent first. Sorting ascending and then reversing
# keeps the tie order of the ascending sort mirrored, so do not swap it for
# reverse=True.
TGS = sorted(Tag_Counts.items(), key=operator.itemgetter(1))[::-1]
len(TGS)

2998

# Use tag convertor to reduce the effect of rare tags

In [7]:
# Replace every alias found in Tag_Convertor with its canonical tag and make
# sure each canonical tag appears at most once per article.
#
# Each list is rebuilt instead of being edited while it is iterated: deleting
# an element during enumeration shifts the remaining items left and makes the
# loop skip the one that follows, which left some aliases unconverted.
# Tags are kept in order of first appearance; a tag that was already listed
# twice for an article is collapsed to one entry as well.
for TL in Tags:
    Canonical_Tags = []
    for T in TL:
        T = Tag_Convertor.get(T, T)
        if T not in Canonical_Tags:
            Canonical_Tags.append(T)
    # Slice assignment updates the existing list object in place.
    TL[:] = Canonical_Tags


# Revisualize: recount tag frequencies after the conversion.
TGS = {}
for TL in Tags:
    for Tag in TL:
        if Tag in TGS:
            TGS[Tag] += 1
        else:
            TGS[Tag] = 1
import operator
# (tag, count) pairs, most frequent first.
TGS = list(reversed(sorted(TGS.items(), key=operator.itemgetter(1))))
len(TGS)

2966

# Label binarizer
## To convert tags to binary encoding

In [8]:
# Learn the tag vocabulary from the training tag lists (fit returns the fitted
# binarizer itself), then encode every tag list as one row of the indicator
# matrix Y.
Label_Binarizer = preprocessing.MultiLabelBinarizer().fit(Tags)
Y = Label_Binarizer.transform(Tags)

In [9]:
# Switch between evaluation mode (hold out 10% of the articles) and submission
# mode (train on everything). The evaluation cells further down read Y_Test and
# data derived from X_Test, which are only created when this flag is on.
Test_Train = False
if Test_Train:
    # A fixed random_state keeps the held-out split, and therefore every
    # reported metric, reproducible across kernel restarts.
    X_Train, X_Test, Y_Train, Y_Test = train_test_split(
        Documents, Y, test_size=0.1, random_state=42)
else:
    X_Train = Documents
    Y_Train = Y

# TF_IDF vectorizer with english stop words and nltk tokenization

In [10]:
# TF-IDF features for the article bodies, tokenized with NLTK and filtered with
# NLTK's English stop-word list.
TFiDFV = TfidfVectorizer(tokenizer=nltk.word_tokenize, stop_words=nltk.corpus.stopwords.words('english'))
# fit_transform learns the vocabulary/IDF weights and vectorizes the training
# documents in a single pass instead of tokenizing the corpus twice.
V_Train = TFiDFV.fit_transform(X_Train)
# The held-out documents exist only in evaluation mode; vectorize them with the
# vocabulary learned from the training split so later cells can use V_Test.
if Test_Train:
    V_Test = TFiDFV.transform(X_Test)
V_Train.shape

(3218, 86650)

# One-Vs-Rest classifier with SVM

In [11]:
# One linear SVM per tag (one-vs-rest) on the document TF-IDF features, with
# balanced class weights and a tight stopping tolerance.
# Author's note: warnings emitted by this fit mean that some classes were never
# predicted as true anywhere in the training set (classes with very low frequency).
Doc_Base_SVM = LinearSVC(class_weight='balanced', tol=1e-5)
M_LSVC = OneVsRestClassifier(estimator=Doc_Base_SVM, n_jobs=2)
M_LSVC.fit(V_Train, Y_Train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=1e-05,
     verbose=0),
          n_jobs=2)

# Test Predictions

In [79]:
# Spot-check: predict the tags of row 276 of the held-out feature matrix and
# decode the resulting indicator row back into tag names.
Sample_Row = V_Test[276]
Predictions = M_LSVC.predict(Sample_Row)
Label_Binarizer.inverse_transform(Predictions)

[('Design', 'UX', 'Writing')]

In [None]:
# Decode the true label row 276 of Y_Test back into tag names. Wrapping the row
# in a list turns it into a one-row matrix, which is what inverse_transform is given.
Label_Binarizer.inverse_transform(np.array([Y_Test[276]]))

In [None]:
# Predict tags for the whole held-out set and write one decoded line per article.
predictions = M_LSVC.predict(V_Test)
Failed = 0
# The context manager closes (and flushes) the file even if decoding fails midway.
with open('TestFileTok_Stop_Con.txt', 'w') as Saved:
    # Decode all rows in one call; each result is the tuple of tags for one article.
    for Predicted_Tags in Label_Binarizer.inverse_transform(predictions):
        # The one-element list wrapper reproduces the established line format,
        # e.g. "[('Design', 'UX')]" or "[()]" when nothing was predicted.
        Saved.write(str([Predicted_Tags]) + '\n')
        if len(Predicted_Tags) == 0:
            Failed += 1

In [57]:
# Number of predictions that decoded to an empty tag tuple in the last export loop.
Failed

0

# Title part

In [12]:
# Separate TF-IDF space for the article titles (same tokenizer and stop words
# as the body vectorizer); learn it and vectorize the titles in one step.
Ti_TFiDFV = TfidfVectorizer(
    tokenizer=nltk.word_tokenize,
    stop_words=nltk.corpus.stopwords.words('english'),
)
Ti_V_Train = Ti_TFiDFV.fit_transform(Titles)

In [13]:
# Title-only counterpart of the document classifier: one balanced linear SVM
# per tag, trained on the title TF-IDF features against the full label matrix Y.
Title_Base_SVM = LinearSVC(class_weight='balanced', tol=1e-5)
Ti_M_LSVC = OneVsRestClassifier(estimator=Title_Base_SVM, n_jobs=2)
Ti_M_LSVC.fit(Ti_V_Train, Y)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=1e-05,
     verbose=0),
          n_jobs=2)

In [20]:
# Load the test articles and export tag predictions made from the titles alone.
Test_Data_Path = 'test_articles.json'
# JSON text is UTF-8; naming the encoding avoids depending on the platform default.
with open(Test_Data_Path, encoding='utf-8') as File:
    Raw_Test_Data = json.load(File)

Test_Documents_D = Raw_Test_Data['body']
Test_Titles_D = Raw_Test_Data['title']

Test_Documents = list(Test_Documents_D.values())
Test_Titles = list(Test_Titles_D.values())

# Title features live in the vocabulary of Ti_TFiDFV, so they must be scored by
# the classifier trained on that space (Ti_M_LSVC). M_LSVC was fit on the
# document-body features and does not share this feature space.
V_Ti = Ti_TFiDFV.transform(Test_Titles)
Predictions = Ti_M_LSVC.predict(V_Ti)

Failed = 0
# The context manager closes (and flushes) the file even if decoding fails midway.
with open('Submission_Titles.txt', 'w') as Saved:
    for Predicted_Tags in Label_Binarizer.inverse_transform(Predictions):
        # One line per article, formatted as a one-element list holding the tag
        # tuple ("[()]" when no tag was predicted).
        Saved.write(str([Predicted_Tags]) + '\n')
        if len(Predicted_Tags) == 0:
            Failed += 1
# Number of titles for which no tag at all was predicted.
Failed

145

# Statistics

In [80]:
# Per-class decision scores of the document classifier for the held-out features.
# NOTE(review): this requires V_Test (the TF-IDF transform of the held-out split),
# so Test_Train must be enabled before this cell can run — confirm on a fresh kernel.
Y_Score = M_LSVC.decision_function(V_Test)

In [None]:
# Precision/recall curve and average precision for every class, plus a
# micro-average over all (article, class) decisions.
#
# The class count is read from the label matrix itself: a hard-coded number
# goes stale whenever the tag vocabulary changes (e.g. after editing
# Tag_Convertor) and then either skips classes or indexes past the last column.
n_classes = Y_Test.shape[1]
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    # Column i holds the true labels / decision scores of class i.
    precision[i], recall[i], _ = precision_recall_curve(Y_Test[:, i],
                                                        Y_Score[:, i])
    average_precision[i] = average_precision_score(Y_Test[:, i], Y_Score[:, i])

# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(Y_Test.ravel(),
    Y_Score.ravel())
average_precision["micro"] = average_precision_score(Y_Test, Y_Score,
                                                     average="micro")
print('Average precision score, micro-averaged over all classes: {0:0.2f}'
      .format(average_precision["micro"]))

In [None]:
# Step plot of the micro-averaged precision/recall curve, drawn through the
# explicit figure/axes interface.
PR_Figure, PR_Axes = plt.subplots()
Micro_Recall = recall['micro']
Micro_Precision = precision['micro']

PR_Axes.step(Micro_Recall, Micro_Precision, where='post', color='b', alpha=0.2)
PR_Axes.fill_between(Micro_Recall, Micro_Precision, step='post', color='b', alpha=0.2)

PR_Axes.set_xlabel('Recall')
PR_Axes.set_ylabel('Precision')
PR_Axes.set_ylim([0.0, 1.05])
PR_Axes.set_xlim([0.0, 1.0])
PR_Axes.set_title(
    'Average precision score, micro-averaged over all classes: AP={0:0.2f}'
    .format(average_precision["micro"]))

# Save the Model

In [19]:
# Persist the fitted classifier to disk with pickle.
# NOTE(review): the file name says "Titles", but the object written is M_LSVC,
# which is fit on the document-body features (V_Train); the title model in this
# notebook is Ti_M_LSVC — confirm which of the two is meant to be saved here.
with open('M_LSVC_Titles_Balanced_.pkl', 'wb') as f:
    pickle.dump(M_LSVC, f)

# Read test data for submission

In [15]:
# Load the test articles and vectorize their bodies for the document classifier.
Test_Data_Path = 'test_articles.json'
# JSON text is UTF-8; naming the encoding avoids depending on the platform default.
with open(Test_Data_Path, encoding='utf-8') as File:
    Raw_Test_Data = json.load(File)

Test_Documents_D = Raw_Test_Data['body']
Test_Titles_D = Raw_Test_Data['title']

Test_Documents = list(Test_Documents_D.values())
Test_Titles = list(Test_Titles_D.values())

# Reuse the vocabulary and IDF weights learned from the training documents.
V_T = TFiDFV.transform(Test_Documents)

In [16]:
# Predict tags for the test articles from their bodies and export one line per article.
Predictions = M_LSVC.predict(V_T)
Failed = 0
# The context manager closes (and flushes) the file even if decoding fails midway.
with open('Submission.txt', 'w') as Saved:
    # Decode all rows in one call; each result is the tuple of tags for one article.
    for Predicted_Tags in Label_Binarizer.inverse_transform(Predictions):
        # The one-element list wrapper reproduces the established line format
        # ("[()]" when no tag was predicted).
        Saved.write(str([Predicted_Tags]) + '\n')
        if len(Predicted_Tags) == 0:
            Failed += 1
# Number of articles for which no tag at all was predicted.
Failed

300

# Read the model and make predictions

In [17]:
# Restore a previously pickled classifier and export its held-out predictions.
# SECURITY: unpickling executes code embedded in the file; only load model files
# that were produced by this project.
with open('M_LSVC_Tok_Stop.pkl', 'rb') as Model_File:
    Predictor = pickle.load(Model_File)

predictions = Predictor.predict(V_Test)
# Both files are handled by context managers so the handles are released even
# when prediction or decoding raises.
with open('TestFileTok_Stop.txt', 'w') as Saved:
    for Predicted_Tags in Label_Binarizer.inverse_transform(predictions):
        # One line per article, formatted as a one-element list holding the tag tuple.
        Saved.write(str([Predicted_Tags]) + '\n')

# Final JSON Submission

In [24]:
# Final submission: for every test article keep the tags on which the document
# model and the title model agree; if they agree on nothing, fall back to the
# title model's tags. The result maps article id -> list of tags.
Data_P = 'test_articles.json'
# JSON text is UTF-8; naming the encoding avoids depending on the platform default.
with open(Data_P, encoding='utf-8') as File:
    Data = json.load(File)

Docs = Data['body']
Titls = Data['title']

JSON_D = {}

Indices = list(Docs)
if Indices:
    # Vectorize and classify all articles in one batch instead of one call per
    # article; every row is transformed and scored independently, so the
    # per-article results are unchanged.
    Doc_O = M_LSVC.predict(TFiDFV.transform([Docs[index] for index in Indices]))
    Titl_O = Ti_M_LSVC.predict(Ti_TFiDFV.transform([Titls[index] for index in Indices]))

    # Element-wise product of the two indicator matrices: a tag survives only
    # when both models predicted it (assumes dense arrays from predict, as the
    # per-article product relied on).
    Output = Doc_O * Titl_O

    Agreed_Tags = Label_Binarizer.inverse_transform(Output)
    Title_Tags = Label_Binarizer.inverse_transform(Titl_O)

    for index, Agreed, Title_Only in zip(Indices, Agreed_Tags, Title_Tags):
        Tgs = Agreed if len(Agreed) > 0 else Title_Only
        JSON_D[index] = list(Tgs)

        print(index)

# Close the file deterministically so the JSON is fully flushed to disk.
with open('Submission.json', 'w') as File:
    json.dump(JSON_D, File)

334
3979
1211
1329
60
3788
833
1587
1586
2490
1176
4238
1185
524
727
2169
510
2929
2100
1820
682
641
12
3532
234
1501
3668
2335
793
1270
2916
3450
4254
2476
3229
2481
4240
1460
4037
2623
816
2951
55
3821
3665
4493
3470
2586
3713
807
1815
4083
392
3417
3823
2222
4490
2746
1478
1419
3616
2084
3804
1374
3822
3565
3179
3533
458
1496
464
3817
2134
4145
653
3215
4436
3643
3352
2834
1850
2215
238
908
3173
4282
771
4360
831
946
2418
2310
939
964
1893
3784
3023
3570
3802
3824
3161
3897
417
2707
847
1000
3481
1651
1394
3953
3487
4336
4028
2282
2862
2446
1653
3926
482
845
310
2235
2800
3242
1017
4410
4412
860
3971
1174
3393
2049
2039
2936
4479
1873
1109
195
339
1175
862
3554
2011
223
82
3900
4175
4187
765
4071
1003
4301
3765
4514
461
1101
2665
973
429
1099
1567
2362
2499
2123
4101
1676
2230
2645
1152
4422
1722
3281
2983
3066
2942
1640
3221
638
811
294
315
718
3245
75
1519
4287
3629
657
2232
3247
2564
3626
4161
3077
3021
3209
702
1779
3401
4291
2817
4373
1532
2568
110
2935
465
2163
4292
80
2876
34

# Gensim Part

In [None]:
# Topic model over the training documents.
# gensim.corpora.textcorpus is a module, not a corpus constructor, and LdaModel
# expects a bag-of-words corpus plus an id -> token mapping. Build both from
# the raw document strings first.
LDA_Tokens = [gensim.utils.simple_preprocess(Doc) for Doc in Documents]
LDA_Dictionary = gensim.corpora.Dictionary(LDA_Tokens)
LDA_Corpus = [LDA_Dictionary.doc2bow(Tokens) for Tokens in LDA_Tokens]
lda = gensim.models.ldamodel.LdaModel(corpus=LDA_Corpus, id2word=LDA_Dictionary, num_topics=20)