# Importing packages

In [5]:
import pandas as pd
from sklearn import preprocessing

In [6]:
# Load the headline/description dataset. The first CSV column appears to be a
# saved row index (it shows up as 'Unnamed: 0' when read as data), so it is
# used as the frame's index instead of being carried along as a data column.
df = pd.read_csv('ds_cat_head_descr.csv', index_col=0)

# Encoding categories

In [7]:
# Fit a label encoder on the category names, then store the resulting integer
# ids in a new 'category_enc' column. The fitted encoder stays available as `leCat`.
leCat = preprocessing.LabelEncoder().fit(df['category'])
df['category_enc'] = leCat.transform(df['category'])

In [8]:
# Preview the first rows to check the new 'category_enc' column next to 'category'.
df.head()

Unnamed: 0,category,head_descr,category_enc
0,CRIME,"there were 2 mass shootings in teas last week,...",6
1,ENTERTAINMENT,hugh grant marries for the first time at age 5...,10
2,ENTERTAINMENT,jim carrey blasts 'castrato' adam schiff and d...,10
3,ENTERTAINMENT,julianna margulies uses donald trump poop bags...,10
4,ENTERTAINMENT,morgan freeman 'devastated' that seual harassm...,10


# Splitting dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Features are the raw text column,
# targets are the encoded category ids; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    df['head_descr'],
    df['category_enc'],
    test_size=0.20,
    random_state=20,
)

In [10]:
# Sanity check: number of training labels after the split.
y_train.shape

(119185,)

# Extracting features from text files

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the vocabulary from the training texts and convert them into a
# document-term count matrix (one row per training document).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

# Displayed as (n_documents, n_vocabulary_terms).
X_train_counts.shape

(119185, 70631)

# TF-IDF

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF; the matrix keeps the same shape as
# the count matrix it was derived from.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

X_train_tfidf.shape

(119185, 70631)

# Machine Learning
## Training Naive Bayes (NB) classifier on training data.

In [27]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes model on the TF-IDF features.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

In [28]:
# Building a pipeline: the vectorizer, the TF-IDF transformer and the classifier
# are chained so that one fit/predict call runs all of the steps above.
# The step names 'vect', 'tfidf' and 'clf' are arbitrary labels, but they are
# used later to address each step's parameters.
# 'text_clf' is the model used going forward.
from sklearn.pipeline import Pipeline

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(X_train, y_train)

In [38]:
# Performance of the NB classifier on the held-out test set.
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

predicted = text_clf.predict(X_test)

# Use the percent format spec instead of round(...) * 100: multiplying a rounded
# float by 100 can leak binary floating-point noise into the printed text
# (e.g. 0.55 * 100 -> 55.00000000000001).
print("Accuracy NB: {:.0%}".format(accuracy_score(y_test, predicted)))
print(classification_report(y_test, predicted))

Accuracy NB: 38.0%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       180
           1       0.00      0.00      0.00       268
           2       0.00      0.00      0.00       647
           3       1.00      0.00      0.00       849
           4       0.00      0.00      0.00       183
           5       0.00      0.00      0.00       760
           6       0.00      0.00      0.00       396
           7       0.00      0.00      0.00       139
           8       0.00      0.00      0.00       306
           9       0.00      0.00      0.00       172
          10       0.54      0.67      0.60      2415
          11       0.00      0.00      0.00       115
          12       0.00      0.00      0.00       177
          13       0.68      0.24      0.36       851
          14       0.00      0.00      0.00       183
          15       0.00      0.00      0.00       342
          16       1.00      0.00      0.00      1017
        

# Training Support Vector Machines - SVM and calculating its performance

In [39]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with SGD on the same
# count -> TF-IDF features as the NB pipeline.
# NOTE(review): max_iter=5 is a very small iteration cap; confirm it is intentional.
svm_step = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=5,
    random_state=42,
)
text_clf_svm = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', svm_step),
])

text_clf_svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

In [41]:
# Report test-set accuracy and per-class metrics for the SVM pipeline.
# The percent format spec avoids the float noise that round(...) * 100 produces
# (e.g. "55.00000000000001%").
print("Accuracy SVM: {:.0%}".format(accuracy_score(y_test, predicted_svm)))
print(classification_report(y_test, predicted_svm))

Accuracy SVM: 55.00000000000001%
              precision    recall  f1-score   support

           0       0.29      0.12      0.17       180
           1       0.43      0.18      0.26       268
           2       0.52      0.23      0.32       647
           3       0.54      0.31      0.39       849
           4       0.43      0.19      0.27       183
           5       0.44      0.15      0.22       760
           6       0.54      0.35      0.43       396
           7       0.51      0.17      0.26       139
           8       0.61      0.66      0.63       306
           9       0.53      0.18      0.27       172
          10       0.59      0.68      0.64      2415
          11       0.28      0.10      0.14       115
          12       0.33      0.10      0.16       177
          13       0.57      0.65      0.61       851
          14       0.20      0.09      0.13       183
          15       0.37      0.14      0.20       342
          16       0.38      0.11      0.17     

# Grid Search

In [42]:
from sklearn.model_selection import GridSearchCV

# Search space for tuning the NB pipeline. Each key is '<step name>__<parameter>',
# where the step name is the arbitrary label given to that step in the pipeline:
#   vect__ngram_range : unigrams only vs. unigrams + bigrams
#   tfidf__use_idf    : TF-IDF with / without the IDF re-weighting
#   clf__alpha        : candidate alpha values for the classifier
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],
    tfidf__use_idf=(True, False),
    clf__alpha=(1e-2, 1e-3),
)

In [43]:
# Run the grid search over the NB pipeline: every parameter combination in
# `parameters` is evaluated, and n_jobs=-1 asks for the work to be spread
# across the machine's available cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
).fit(X_train, y_train)

In [44]:
# Show the best mean cross-validated score and the parameter combination that
# achieved it. A notebook cell only displays its last bare expression, so the
# score is printed explicitly instead of being silently discarded.
print("Best mean CV score: {:.4f}".format(gs_clf.best_score_))
gs_clf.best_params_

# NOTE: figures quoted from the original tutorial (~90.6% accuracy, alpha=0.01)
# do not describe this dataset; rely on the values this cell reports.

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [50]:
# Evaluate the grid-searched NB pipeline on the held-out test set.
predicted_cv_clf = gs_clf.predict(X_test)

# The percent format spec avoids the float noise that round(...) * 100 produces
# (e.g. "56.00000000000001%").
print("Accuracy CV clf: {:.0%}".format(accuracy_score(y_test, predicted_cv_clf)))
print(classification_report(y_test, predicted_cv_clf))

Accuracy CV clf: 56.00000000000001%
              precision    recall  f1-score   support

           0       0.36      0.14      0.21       180
           1       0.54      0.16      0.24       268
           2       0.51      0.28      0.36       647
           3       0.49      0.37      0.42       849
           4       0.57      0.17      0.27       183
           5       0.54      0.30      0.38       760
           6       0.54      0.52      0.53       396
           7       0.56      0.17      0.26       139
           8       0.68      0.42      0.52       306
           9       0.43      0.14      0.21       172
          10       0.60      0.73      0.66      2415
          11       0.59      0.14      0.23       115
          12       0.21      0.08      0.12       177
          13       0.60      0.62      0.61       851
          14       0.37      0.11      0.17       183
          15       0.44      0.29      0.35       342
          16       0.37      0.19      0.25  

## Similarly doing grid search for SVM

In [46]:
from sklearn.model_selection import GridSearchCV

# Same search space as the NB grid search; the classifier parameter is
# addressed through the 'clf-svm' step name used in the SVM pipeline.
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),
                  'clf-svm__alpha': (1e-2, 1e-3)}

gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

# Only the last bare expression of a cell is displayed, so print the best mean
# cross-validated score explicitly; the best parameters remain the cell output.
print("Best mean CV score: {:.4f}".format(gs_clf_svm.best_score_))
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [51]:
# Evaluate the grid-searched SVM pipeline on the held-out test set.
predicted_clf_svm = gs_clf_svm.predict(X_test)

# The percent format spec avoids the float noise that round(...) * 100 produces
# (e.g. "56.00000000000001%").
print("Accuracy clf SVM: {:.0%}".format(accuracy_score(y_test, predicted_clf_svm)))
print(classification_report(y_test, predicted_clf_svm))

Accuracy clf SVM: 56.00000000000001%
              precision    recall  f1-score   support

           0       0.41      0.13      0.19       180
           1       0.58      0.18      0.28       268
           2       0.56      0.27      0.36       647
           3       0.52      0.31      0.39       849
           4       0.46      0.17      0.25       183
           5       0.57      0.17      0.26       760
           6       0.54      0.40      0.46       396
           7       0.91      0.14      0.25       139
           8       0.67      0.67      0.67       306
           9       0.51      0.21      0.30       172
          10       0.63      0.70      0.66      2415
          11       0.52      0.10      0.17       115
          12       0.21      0.05      0.07       177
          13       0.59      0.65      0.62       851
          14       0.44      0.10      0.16       183
          15       0.50      0.19      0.27       342
          16       0.48      0.13      0.20 

# NLTK
## Removing stop words

In [47]:
from sklearn.pipeline import Pipeline

# NB pipeline variant whose vectorizer drops English stop words.
# NOTE(review): this rebinds `text_clf`, and the new pipeline is only
# constructed here; it is not fitted in this cell.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [48]:
# Stemming Code

import nltk
# Uncomment on a fresh environment to fetch NLTK data interactively:
#nltk.download()

from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer shared by the stemming vectorizer defined below.
# NOTE(review): ignore_stopwords=True presumably needs the NLTK stop-word corpus
# to be installed locally; confirm, since the download call above is commented out.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it emits.

    Relies on the module-level ``stemmer`` object being defined before the
    analyzer is called.
    """

    def build_analyzer(self):
        """Return a callable that maps a document to its list of stemmed tokens."""
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            # Run the stock analyzer first, then stem each token it yields.
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens
    
# Pipeline: stemmed, stop-word-filtered counts -> TF-IDF -> multinomial NB
# (fit_prior=False is passed to the classifier).
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

text_mnb_stemmed = Pipeline(steps=[
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])
text_mnb_stemmed = text_mnb_stemmed.fit(X_train, y_train)

predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)

# Fraction of test rows whose predicted label matches the true label.
np.mean(predicted_mnb_stemmed == y_test)

0.494345068295466

In [49]:
# Report test-set accuracy and per-class metrics for the stemmed NB pipeline.
# The percent format spec avoids the float noise that round(...) * 100 can
# produce (e.g. "55.00000000000001%").
print("Accuracy MNB: {:.0%}".format(accuracy_score(y_test, predicted_mnb_stemmed)))
print(classification_report(y_test, predicted_mnb_stemmed))

Accuracy MNB: 49.0%
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       180
           1       1.00      0.02      0.04       268
           2       0.68      0.06      0.11       647
           3       0.52      0.18      0.27       849
           4       0.00      0.00      0.00       183
           5       0.74      0.15      0.24       760
           6       0.71      0.16      0.27       396
           7       1.00      0.07      0.13       139
           8       0.88      0.15      0.25       306
           9       0.00      0.00      0.00       172
          10       0.51      0.80      0.62      2415
          11       0.00      0.00      0.00       115
          12       0.00      0.00      0.00       177
          13       0.59      0.63      0.61       851
          14       1.00      0.01      0.02       183
          15       0.45      0.04      0.07       342
          16       0.81      0.07      0.12      1017
       

  _warn_prf(average, modifier, msg_start, len(result))
