In [3]:
import numpy as np
import resources.text_normalizer as tn
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

data_df = pd.read_csv('data/cleaned_all_data.csv')

In [4]:
data_df.head()

Unnamed: 0,category,text,clean text,category label
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...,4
1,business,worldcom boss left books alone former worldc...,worldcom boss left book alone former worldcom ...,0
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary farrell gamble leicester say not r...,3
3,sport,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership side n...,3
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...,1


In [7]:
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names = train_test_split(
                                         np.array(data_df['clean text']),
                                         np.array(data_df['category label']),
                                         np.array(data_df['category']),
                                         test_size=0.2, random_state=42)
train_corpus.shape, test_corpus.shape

((9415,), (2354,))

In [8]:
from collections import Counter
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))
(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
             columns=['category', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

Unnamed: 0,category,Train Count,Test Count
3,tech,2728,687
1,entertainment,2127,522
0,politics,1967,504
4,business,1804,432
2,sport,789,209


### TD_IDF Model

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., norm="l2",
                     use_idf=True, smooth_idf=True)

tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)


print('TF-IDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)

TF-IDF model:> Train features shape: (9415, 37423)  Test features shape: (2354, 37423)


In [10]:
tv_matrix = tv_train_features.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,aa,aaa,aaas,aac,aadc,aadhaar,aadhar,aaditya,aadmi,aag,...,zubair,zuckerberg,zuckerbergs,zuluaga,zurich,zuton,zutons,zvonareva,zvyagintsev,zynga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### ML algorithms on TF-IDF model

In [12]:
import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

mnb = MultinomialNB(alpha=1)
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm_sgd = SGDClassifier(loss='hinge', penalty="l2", max_iter=5, random_state=42)
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)

scores_df = pd.DataFrame(columns = ['train_accuracy', 'test_accuracy', 'fit_time'])

models = [mnb, lr, svm, svm_sgd, rfc, gbc]
names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Linear SVC',
         'SGD Classifier', 'Random Forest Classifier', 'Gradient Boosting Classifier']

for model, name in zip(models, names):
    temp_list = []
    print(name)

    model.fit(tv_train_features, train_label_names)
    scores = cross_validate(model, tv_train_features, train_label_names,
                            scoring=('accuracy'),
                            return_train_score=True, cv=10)

    for score in ['accuracy']:
        mean_score = scores['train_score'].mean()
        print('train {} mean : {}'.format(score, mean_score))
        temp_list.append(mean_score)

    test_score = model.score(tv_test_features, test_label_names)
    temp_list.append(test_score)
    print('test accuracy mean: {}'.format(test_score))

    temp_list.append(scores['fit_time'].mean())
    print('average fit time: {} \n'.format(scores['fit_time'].mean()))
    scores_df.loc[name] = temp_list


Multinomial Naive Bayes
train accuracy mean : 0.9740130967876665
test accuracy mean: 0.9570943075615973
average fit time: 0.02502882480621338 

Logistic Regression
train accuracy mean : 0.990983640965244
test accuracy mean: 0.9728122344944775
average fit time: 3.24036021232605 

Linear SVC
train accuracy mean : 0.9993863248462729
test accuracy mean: 0.9864061172472387
average fit time: 0.18519136905670167 

SGD Classifier
train accuracy mean : 0.9968843959519781
test accuracy mean: 0.9830076465590484
average fit time: 0.0542572021484375 

Random Forest Classifier
train accuracy mean : 0.9986310024587395
test accuracy mean: 0.9307561597281223
average fit time: 1.0121280670166015 

Gradient Boosting Classifier
train accuracy mean : 0.836018277497325
test accuracy mean: 0.822429906542056
average fit time: 11.64024715423584 



In [14]:
scores_df

Unnamed: 0,train_accuracy,test_accuracy,fit_time
Multinomial Naive Bayes,0.974013,0.957094,0.025029
Logistic Regression,0.990984,0.972812,3.24036
Linear SVC,0.999386,0.986406,0.185191
SGD Classifier,0.996884,0.983008,0.054257
Random Forest Classifier,0.998631,0.930756,1.012128
Gradient Boosting Classifier,0.836018,0.82243,11.640247


### try TD-IDF with n-grams

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0., max_df=1., norm="l2",
                     use_idf=True, smooth_idf=True, ngram_range = (1,2))

tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)


print('TF-IDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)

TF-IDF model:> Train features shape: (9415, 556637)  Test features shape: (2354, 556637)


In [16]:
tv_matrix = tv_train_features.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,aa,aa gaya,aa grade,aa motoring,aaa,aaa background,aaa battery,aaa champion,aaa championship,aaa championships,...,zutons estelle,zvonareva,zvonareva lose,zvonareva russia,zvonareva struggle,zvonareva wimbledon,zvyagintsev,zvyagintsev return,zynga,zynga game
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
vocab[50000:55000]

['benz also',
 'benz audi',
 'benz bmw',
 'benz cla',
 'benz concept',
 'benz michael',
 'benz research',
 'benz sale',
 'benz say',
 'benz strategy',
 'benz title',
 'benzs',
 'benzs woe',
 'beoplay',
 'beoplay notable',
 'beosound',
 'beosound edge',
 'bequest',
 'bequest ruth',
 'bera',
 'bera write',
 'berate',
 'berate team',
 'berates',
 'berates whingee',
 'berbatov',
 'berbatov weak',
 'berbatovs',
 'berbatovs weak',
 'berdych',
 'berdych olympic',
 'bereft',
 'bereft idea',
 'bereft pre',
 'beresford',
 'beresford say',
 'beresheet',
 'beresheet attempt',
 'beresheet capture',
 'beresheet eventually',
 'beresheet fire',
 'beresheet first',
 'beresheet gravity',
 'beresheet hebrew',
 'beresheet much',
 'beresheet not',
 'beresheet use',
 'beresheets',
 'beresheets israeli',
 'beresheets orbit',
 'bergamasco',
 'bergamasco canale',
 'bergamasco david',
 'bergamasco flanker',
 'bergamasco head',
 'bergamasco important',
 'bergamasco parisse',
 'bergamasco pozzebon',
 'bergamasco 

In [18]:
for model, name in zip(models, names):
    temp_list = []
    print(name)

    model.fit(tv_train_features, train_label_names)
    scores = cross_validate(model, tv_train_features, train_label_names,
                            scoring=('accuracy'),
                            return_train_score=True, cv=10)

    for score in ['accuracy']:
        mean_score = scores['train_score'].mean()
        print('train {} mean : {}'.format(score, mean_score))
        temp_list.append(mean_score)

    test_score = model.score(tv_test_features, test_label_names)
    temp_list.append(test_score)
    print('test accuracy mean: {}'.format(test_score))

    temp_list.append(scores['fit_time'].mean())
    print('average fit time: {} \n'.format(scores['fit_time'].mean()))
    scores_df.loc[name] = temp_list

Multinomial Naive Bayes
train accuracy mean : 0.9860270253835777
test accuracy mean: 0.9418011894647409
average fit time: 0.10768909454345703 

Logistic Regression
train accuracy mean : 0.9948309393335689
test accuracy mean: 0.9732370433305013
average fit time: 44.57018132209778 

Linear SVC
train accuracy mean : 0.9997875730210343
test accuracy mean: 0.983857264231096
average fit time: 1.0458128690719604 

SGD Classifier
train accuracy mean : 0.9991856958842538
test accuracy mean: 0.9825828377230247
average fit time: 0.19658045768737792 

Random Forest Classifier
train accuracy mean : 0.9980291364639893
test accuracy mean: 0.9061172472387425
average fit time: 7.006092596054077 

Gradient Boosting Classifier
train accuracy mean : 0.8386734580496027
test accuracy mean: 0.8241291418861513
average fit time: 152.92056527137757 



In [19]:
scores_df

Unnamed: 0,train_accuracy,test_accuracy,fit_time
Multinomial Naive Bayes,0.986027,0.941801,0.107689
Logistic Regression,0.994831,0.973237,44.570181
Linear SVC,0.999788,0.983857,1.045813
SGD Classifier,0.999186,0.982583,0.19658
Random Forest Classifier,0.998029,0.906117,7.006093
Gradient Boosting Classifier,0.838673,0.824129,152.920565
