In [8]:
import pandas as pd
import numpy as np
from rt1 import * #Overriding is done using this.
import time
import operator
from sklearn.model_selection import *
import warnings
from sklearn.feature_extraction.text import *
from sklearn.naive_bayes import *

In [9]:
warnings.filterwarnings('ignore')

# Loading Data

We create an SGML Parser class that overrides Python's built in HTMLParser.

In [10]:
a = []
for i in range(10):
    filename = "reut2-00{}.sgm".format(i)
    parser = ReutersParser()
    doc = parser.parse(open(filename, 'rb'))
    a.append(list(doc))
for i in range(10):
    filename = "reut2-01{}.sgm".format(i)
    parser = ReutersParser()
    doc = parser.parse(open(filename, 'rb'))
    a.append(list(doc))
for i in range(2):
    filename = "reut2-02{}.sgm".format(i)
    parser = ReutersParser()
    doc = parser.parse(open(filename, 'rb'))
    a.append(list(doc))

In [11]:
f = open("all-topics-strings.lc.txt",'r')
topics = f.read().split('\n')
for i in range(135):
    topics[i] = "".join(topics[i].split())

# Removing Countries

In [12]:
ref_docs = []
for i in a: 
    for d in i:
            top = []
            for t in d[0]:
                if t in topics:
                    top.append(t)
            ref_docs.append((top,d[1]))

# Creating the Data Frame

There are 135 topics in total. Therefore, we use 135 models, where each model predicts whether the label (topic) can be assigned to the article. So our dataframe will have a column for the article and 135 extra columns which records '1' if the text is classified by the label denoting the column.

In [13]:
labels = []
text = []
for i in ref_docs:
        labels.append(i[0])
        text.append(i[1])

In [14]:
df = pd.DataFrame()
df['Label'] = labels
df['Text'] = text
df.shape

(21578, 2)

In [15]:
df = df[df.Text != '']

In [16]:
df.shape

(19043, 2)

In [17]:
df = df[df['Label'].map(lambda d: len(d)) > 0]

In [18]:
df = df.reset_index(drop=True)

In [19]:
df.shape

(10377, 2)

In [20]:
x=df['Text']
y=df['Label']
bin = np.zeros([10377,135])
for i in range(len(y)):
    for j in y[i]:
        bin[i][topics.index(j)] = 1

In [21]:
bindf = pd.DataFrame(bin)

In [22]:
dfmain = pd.concat([df,bindf], axis = 1)

In [23]:
dfmain

Unnamed: 0,Label,Text,0,1,2,3,4,5,6,7,...,125,126,127,128,129,130,131,132,133,134
0,[cocoa],Showers continued throughout the week in the B...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[grain, wheat, corn, barley, oat, sorghum]",The U.S. Agriculture Department reported the f...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,"[veg-oil, linseed, lin-oil, soy-oil, sun-oil, ...",Argentine grain board figures show crop regist...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,[earn],Champion Products Inc said its board of direct...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[acq],Computer Terminal Systems Inc said it has comp...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,[earn],"Shr 34 cts vs 1.19 dlrs Net 807,000 vs 2,858,0...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"[earn, acq]","Ohio Mattress Co said its first quarter, endin...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,[earn],Oper shr loss two cts vs profit seven cts Oper...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,[earn],Shr one dlr vs 73 cts Net 12.6 mln vs 15.8 mln...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,[earn],Dean Foods Co expects earnings for the fourth ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train-Test Split

In [24]:
x=dfmain.drop('Label',axis = 1)
y=dfmain['Label']
testy = y
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=56)
print('Shape of x: {}'.format(x.shape))
print('Shapes of xtrain and ytrain: {},{}'.format(xtrain.shape,ytrain.shape))
print('Shapes of xtest and ytest: {},{}'.format(xtest.shape,ytest.shape))

Shape of x: (10377, 136)
Shapes of xtrain and ytrain: (7782, 136),(7782,)
Shapes of xtest and ytest: (2595, 136),(2595,)


# TFIDF

In [25]:
def tfidf(xtrain, xtest):
    tfidf_vect = TfidfVectorizer(analyzer='word', max_features=1500, stop_words = stop_words)
    tfidf_vect.fit(xtrain['Text'])
    xtrain_tfidf =  tfidf_vect.transform(xtrain['Text'])
    xtest_tfidf =  tfidf_vect.transform(xtest['Text'])
    return (xtrain_tfidf, xtest_tfidf)

In [26]:
(xtrain_tfidf, xtest_tfidf) = tfidf(xtrain, xtest)

# Training

In [32]:
def train(xtrain_tfidf, xtrain):
    t1 = time.time()
    model = {}
    for i in range(135):
        model[i] = MultinomialNB(alpha = 0)
        model[i].fit(xtrain_tfidf,xtrain[i])
    t2 = time.time()
    print('Time taken = {}'.format(t2-t1))
    return (model)

In [33]:
model = train(xtrain_tfidf, xtrain)

Time taken = 1.8194422721862793


# Testing

In [29]:
def test(xtest_tfidf, model):
    t1 = time.time()
    final = {}
    for i in range(135):
            final[i] = model[i].predict(xtest_tfidf)
    t2 = time.time()
    print('Time taken = {}'.format(t2-t1))
    return(final)

In [34]:
final = test(xtest_tfidf, model)

Time taken = 0.36673569679260254


### Accuracy (Jaccard Measure)

In [75]:
vali = []
for i,r in xtest.iterrows():
            vali.append((testy[i], i))

In [36]:
def ansacc(final, vali):
    ansans = []
    for j in range(len(vali)):    
        ans = []
        for i in final:
            if final[i][j] == 1:
                ans.append(topics[i])
        ansans.append(ans)
    a = 0
    for i in range(len(vali)):
        a += len(set(ansans[i]) & set(vali[i]))/len(set(ansans[i]) | set(vali[i]))
    a = a/len(vali)
    return(ansans, a) 

In [None]:
def ansacc1(final, vali, model1, xtest):
    ansans = []
    for j in range(len(vali)):    
        ans = []
        for i in final:
            if final[i][j] == 1:
                ans.append(topics[i])
        if ans == []:
            k = vali[j][1]
            ans.append(model1.predict(xtest['Text'][k]))
        ansans.append(ans)
    a = 0
    for i in range(len(vali)):
        a += len(set(ansans[i]) & set(vali[i][0]))/len(set(ansans[i]) | set(vali[i][0]))
    a = a/len(vali)
    return(ansans, a) 

In [77]:
(ansans,g) =  ansacc(final, vali)

In [78]:
g

0.7041300425115453

# K-Fold

In [37]:
def kfold(dfmain, k):
    x=dfmain.drop('Label',axis = 1)
    y=dfmain['Label']
    testy = y
    kf=KFold(k,shuffle=True)
    kf.get_n_splits(x)
    print(kf)
    for train_index, test_index in kf.split(x):
        print("TRAIN:", train_index, "TEST:", test_index)
        xtrain, xtest = x.iloc[train_index], x.iloc[test_index]
        ytrain, ytest = y.iloc[train_index], y.iloc[test_index]
        vali = []
        for i,r in xtest.iterrows():
            vali.append(testy[i])
        (xtrain_tfidf, xtest_tfidf) = tfidf(xtrain, xtest)
        model = train(xtrain_tfidf, xtrain)
        final = test(xtest_tfidf, model)
        (ansans,g) =  ansacc(final,vali)
        print(g)


In [38]:
kfold(dfmain, 2)

KFold(n_splits=2, random_state=None, shuffle=True)
TRAIN: [    1     4     5 ... 10371 10374 10375] TEST: [    0     2     3 ... 10372 10373 10376]
Time taken = 1.2084605693817139
Time taken = 0.6273882389068604
0.6766552177614041
TRAIN: [    0     2     3 ... 10372 10373 10376] TEST: [    1     4     5 ... 10371 10374 10375]
Time taken = 1.1946885585784912
Time taken = 0.629401683807373
0.6901009350026318


# Dealing with empty lists (Experimental)

In [80]:
labels = []
for i,r in dfmain.iterrows():
    labels.append(r['Label'][0])
df1 = dfmain[['Text','Label']]
df1 = pd.concat([df1,pd.DataFrame(labels)],axis = 1)
df1 = df1.drop('Label', axis = 1)

In [171]:
t = sorted(list(set(ytrain)))

In [173]:
len(t)

78

In [81]:
x=df1.drop(0,axis = 1)
y=df1[0]
testy = y
xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=56)
print('Shape of x: {}'.format(x.shape))
print('Shapes of xtrain and ytrain: {},{}'.format(xtrain.shape,ytrain.shape))
print('Shapes of xtest and ytest: {},{}'.format(xtest.shape,ytest.shape))

Shape of x: (10377, 1)
Shapes of xtrain and ytrain: (7782, 1),(7782,)
Shapes of xtest and ytest: (2595, 1),(2595,)


In [82]:
(xtrain_tfidf, xtest_tfidf) = tfidf(xtrain, xtest)

In [83]:
model1 = MultinomialNB(alpha = 0)
model1.fit(xtrain_tfidf, ytrain)

MultinomialNB(alpha=0, class_prior=None, fit_prior=True)

In [109]:
s = []
for i in range(len(ansans)):
    if len(ansans[i]) == 0:
        s.append(i)


In [168]:
len(set(list(ytrain)))

78

In [112]:
tfidf_vect = TfidfVectorizer(analyzer='word', max_features=1500)
tfidf_vect.fit(xtrain['Text'])
n_tfidf =  tfidf_vect.transform(xtest.iloc[s]['Text'])

In [128]:
s1 = model1.predict_proba(n_tfidf)

In [213]:
s2 = []
for i in range(len(s1)):
    s3 = []
    for j in range(78):
        if s1[i][j] > 1/4:
            s3.append(t[j])
    s2.append(s3)

In [209]:
for i in range(len(s)):
    ansans[s[i]] = s2[i]

In [210]:
ansans

[['money-fx'],
 ['trade'],
 ['ship'],
 ['bop'],
 ['crude', 'trade'],
 ['acq'],
 ['earn'],
 ['acq'],
 ['crude'],
 ['grain'],
 ['earn'],
 ['earn'],
 ['crude', 'trade'],
 ['earn'],
 ['crude'],
 ['earn'],
 ['acq'],
 ['grain', 'wheat'],
 ['earn'],
 ['acq'],
 ['crude'],
 ['money-fx', 'trade'],
 ['acq'],
 ['earn'],
 ['trade'],
 ['acq'],
 ['earn'],
 ['earn'],
 ['earn'],
 ['trade'],
 ['cpi'],
 ['grain', 'wheat'],
 ['acq'],
 ['interest', 'money-fx'],
 ['crude'],
 ['earn'],
 ['earn'],
 ['grain'],
 ['crude'],
 ['acq'],
 ['acq'],
 ['crude'],
 ['acq'],
 ['earn'],
 ['dlr', 'money-fx', 'trade'],
 ['grain'],
 ['earn'],
 ['acq'],
 ['crude'],
 ['earn'],
 ['earn'],
 ['earn'],
 ['acq'],
 ['acq', 'crude'],
 ['earn'],
 ['earn'],
 ['interest', 'money-fx'],
 ['acq'],
 ['crude'],
 ['trade'],
 ['crude'],
 ['crude'],
 ['earn'],
 ['earn'],
 ['grain'],
 ['interest', 'money-fx'],
 ['earn'],
 ['earn'],
 ['lead', 'zinc'],
 ['acq'],
 ['bop', 'cpi', 'gnp', 'jobs'],
 ['crude'],
 ['acq'],
 ['grain', 'trade'],
 ['acq'],
 [

In [211]:
a = 0
for i in range(len(vali)):
        a += len(set(ansans[i]) & set(vali[i][0]))/len(set(ansans[i]) | set(vali[i][0]))
a = a/len(vali)

In [212]:
a

0.753450775300486

In [2]:
from stop_words import get_stop_words

stop_words = get_stop_words('en')
stop_words = get_stop_words('english')

In [6]:
stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 "let's",
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ought',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'same',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'should',
 "s