# Analysis with Grid Search

This notebook uses grid search to find the best parameters for the count vectorizer, the feature-selection step, and the classifier.

In [117]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import compress
from nltk import pos_tag
from nltk.corpus import wordnet, stopwords
from nltk.stem import snowball, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union, FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_classif
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the excerpt sheet, then keep only the rows that have no missing values.
# The frame is printed by shape before and after the filter.
file_name = "Isla Vista - All Excerpts - 1_2_2019.xlsx"
loaded = pd.read_excel(file_name, sheet_name='Dedoose Excerpts Export')
print(loaded.shape)
data = loaded.dropna(axis=0)
print(data.shape)

(8131, 53)
(8127, 53)


In [3]:
# Raw documents: the 'Excerpt' column as a plain Python list.
excerpts = data['Excerpt'].tolist()
def stem_tokenizer(doc):
    """Return `doc` as a space-joined string of lower-cased English stems.

    The document is split with `word_tokenize`, each token is stemmed with
    the English Snowball stemmer, and only stems made up entirely of
    alphabetic characters are kept.
    """
    stemmer = snowball.SnowballStemmer("english")
    kept = []
    for word in word_tokenize(doc):
        stem = stemmer.stem(word)
        # The alphabetic check is applied to the stem, not to the raw token.
        if stem.isalpha():
            kept.append(stem.lower())
    return ' '.join(kept)
# Show one excerpt next to its stemmed form as a sanity check.
sample_excerpt = excerpts[3]
print("original: " + str(sample_excerpt))
print(stem_tokenizer(sample_excerpt))

original: A 22-year-old student last Friday killed six people and wounded 13 more in Isla Vista before turning his gun on himself. Commenters 
blamed the killer�s crimes on everything from misogynistic �pickup artist philosophy� to easy access to guns and no-fault divorce. Even 
�nerd culture� has come under scrutiny. 

Is American culture to blame for mass murder? 
a student last friday kill six peopl and wound more in isla vista befor turn his gun on himself comment blame the crime on everyth from misogynist artist to easi access to gun and divorc even has come under scrutini is american cultur to blame for mass murder


In [None]:
# stem + count
# Stem every excerpt, then count 1- to 3-gram occurrences over the stemmed text.
docs = [stem_tokenizer(doc) for doc in excerpts]
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7,
                             stop_words=stopwords.words('english'), ngram_range=(1, 3))
stem_count_X = vectorizer.fit_transform(docs).toarray()

# Newer scikit-learn releases expose the vocabulary via get_feature_names_out()
# and no longer provide get_feature_names(); fall back to the old name only
# when the new one is missing so the cell runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense count matrix as a frame whose columns are the n-gram strings.
data_df = pd.DataFrame(stem_count_X)
data_df.columns = feature_names

In [None]:
# Target labels and a chi-squared ranking of the stemmed n-gram counts.
y = data.ACCOUNT
X_indices = np.arange(stem_count_X.shape[-1])
#selector = SelectPercentile(f_classif, percentile=0.1)
selector = SelectKBest(chi2, k=100)
X_new = selector.fit_transform(data_df, y)
# Names of the columns the selector kept, in column order.
best_feats = [
    name
    for name, keep in zip(data_df.columns, selector.get_support())
    if keep
]

In [99]:
# X is the list of raw docs
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps raw documents to stemmed text.

    Each document becomes a single space-joined string of lower-cased
    English Snowball stems; stems that are not purely alphabetic are dropped.
    """

    @staticmethod
    def stem_tokenizer(doc):
        """Return `doc` as a space-joined string of alphabetic stems.

        Declared as a static method because it uses no instance state; this
        keeps `TextCleaner.stem_tokenizer(doc)` working and also makes
        `self.stem_tokenizer(doc)` valid.
        """
        tokens = word_tokenize(doc)
        stemmer = snowball.SnowballStemmer("english")
        stemmed_tokens = [stemmer.stem(word) for word in tokens]
        list_tokens = [tok.lower() for tok in stemmed_tokens if tok.isalpha()]
        return ' '.join(list_tokens)

    def fit(self, X, y=None):
        """Nothing to learn; return self so the object can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a list with one cleaned string per document in `X`."""
        # Call the class's own tokenizer instead of a same-named global, so
        # the transformer does not depend on another cell having been run.
        return [self.stem_tokenizer(doc) for doc in X]
    

In [89]:
# Exploratory pass: POS-tag every excerpt. Each iteration overwrites the
# previous values, so only the last excerpt's results remain afterwards.
for excerpt in excerpts:
    doc = excerpt
    # Keep alphanumeric tokens, plus tokens found in string.punctuation
    # (a substring test, since string.punctuation is a single string).
    tokens = [
        tok
        for tok in word_tokenize(doc)
        if tok.isalnum() or tok in string.punctuation
    ]
    tags = pos_tag(tokens)
    list_tags = [pair[1] for pair in tags]

In [100]:
class Pos(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces each document by its POS-tag sequence."""

    def pos_tagger(self, doc):
        """Return the part-of-speech tags of `doc` as one space-joined string.

        Tokens are kept when they are alphanumeric or occur inside
        `string.punctuation` (a substring test against that string).
        """
        kept = []
        for tok in word_tokenize(doc):
            if tok.isalnum() or tok in string.punctuation:
                kept.append(tok)
        # pos_tag yields (token, tag) pairs; only the tags are emitted.
        return ' '.join(tag for _, tag in pos_tag(kept))

    def fit(self, X, y=None):
        """No fitting required; returns self."""
        return self

    def transform(self, X):
        """Return a list holding one tag string per document in `X`."""
        return list(map(self.pos_tagger, X))

In [51]:
class WordCounter(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each document to its token count.

    Inherits the same scikit-learn base classes as the other transformers in
    this notebook, which supply get_params/set_params and fit_transform.

    Parameters
    ----------
    as_column : bool, default=False
        When False, `transform` returns a flat list of ints (the original
        behaviour). When True, it returns an array of shape (n_docs, 1), so
        the counts form a single feature column that can be stacked next to
        other feature matrices.
    """

    def __init__(self, as_column=False):
        self.as_column = as_column

    def count(self, doc):
        """Return the number of tokens `word_tokenize` finds in `doc`."""
        return len(word_tokenize(doc))

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return the token count of every document in `X`."""
        counts = [self.count(doc) for doc in X]
        if self.as_column:
            # One row per document, one column holding the count.
            return np.array(counts).reshape(-1, 1)
        return counts

In [101]:
# Word n-gram branch: clean/stem the raw text, then count 1- to 3-grams.
ngram_counts = Pipeline(steps=[
    ('prepro', TextCleaner()),
    ('vector', CountVectorizer(
        max_features=1500,
        min_df=5,
        max_df=0.7,
        stop_words=stopwords.words('english'),
        ngram_range=(1, 3),
    )),
])

In [102]:
# POS branch: turn each document into its tag sequence, then count tag
# n-grams of length 1 to 5.
# NOTE(review): the vectorizer runs with its default tokenization and
# lower-casing - confirm that punctuation tags survive it as intended.
pos_counts = Pipeline(steps=[
    ('pos', Pos()),
    ('pos_vector', CountVectorizer(
        max_features=1500,
        max_df=0.9,
        ngram_range=(1, 5),
    )),
])

In [52]:
# Single-step pipeline that yields the token count of each document.
num_words = Pipeline(steps=[
    ('word_count', WordCounter()),
])

In [None]:
# Fit each feature branch on the full corpus on its own, in the order
# n-grams, POS tags, word counts, to inspect what each one produces.
text_output, pos_output, num_words_output = (
    branch.fit_transform(excerpts)
    for branch in (ngram_counts, pos_counts, num_words)
)

In [103]:
# Combined feature space: word n-gram counts side by side with POS n-gram counts.
feats = FeatureUnion(transformer_list=[
    ('ngram_counts', ngram_counts),
    #('num_words', num_words),
    ('pos_counts', pos_counts),
])
                      
#feature_processing = Pipeline([('feats', feats)])
#feature_processing.fit_transform(excerpts)

In [106]:
# Hold out a test split of the raw excerpts and their ACCOUNT labels.
# The seed is fixed (same value as the classifier's random_state) so that the
# split, and therefore every metric reported below, is reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    excerpts, data['ACCOUNT'].values, random_state=42)

In [None]:
# Full model: combined features -> 100 best by ANOVA F-score -> random forest.
pipeline = Pipeline(steps=[
    ('features', feats),
    ("selectk", SelectKBest(score_func=f_classif, k=100)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Pipeline.fit returns the fitted pipeline itself.
pipe_output = pipeline.fit(x_train, y_train)

In [107]:
# Predict the held-out excerpts; the cell displays the fraction that match.
preds = pipeline.predict(x_test)
(preds == y_test).mean()

0.9655511811023622

In [109]:
# Confusion matrix, per-class report and overall accuracy, one after another.
print(
    confusion_matrix(y_test, preds),
    classification_report(y_test, preds),
    accuracy_score(y_test, preds),
    sep="\n",
)

[[1567   18]
 [  52  395]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1585
           1       0.96      0.88      0.92       447

    accuracy                           0.97      2032
   macro avg       0.96      0.94      0.95      2032
weighted avg       0.97      0.97      0.97      2032

0.9655511811023622


In [114]:
# Same feature union, but keep 1000 features (SelectKBest's default scoring
# function) and classify with multinomial naive Bayes.
pipeline = Pipeline([
    ('features',feats),
    ("selectk", SelectKBest(k=1000)),
    ('classifier', MultinomialNB()),
])

pipe_output = pipeline.fit(x_train, y_train)

# Evaluate on the held-out split. The bare `np.mean(preds == y_test)` that sat
# here was neither displayed nor stored, so it has been dropped;
# accuracy_score below reports the same quantity.
preds = pipeline.predict(x_test)

print(confusion_matrix(y_test, preds))  
print(classification_report(y_test, preds))  
print(accuracy_score(y_test, preds))

[[1174  411]
 [  67  380]]
              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1585
           1       0.48      0.85      0.61       447

    accuracy                           0.76      2032
   macro avg       0.71      0.80      0.72      2032
weighted avg       0.84      0.76      0.78      2032

0.764763779527559


In [115]:
# List every tunable "<step>__<param>" name, including nested steps, to find
# the keys usable in the grid-search parameter dictionary.
pipeline.get_params(deep=True).keys()

dict_keys(['memory', 'steps', 'verbose', 'features', 'selectk', 'classifier', 'features__n_jobs', 'features__transformer_list', 'features__transformer_weights', 'features__verbose', 'features__ngram_counts', 'features__pos_counts', 'features__ngram_counts__memory', 'features__ngram_counts__steps', 'features__ngram_counts__verbose', 'features__ngram_counts__prepro', 'features__ngram_counts__vector', 'features__ngram_counts__vector__analyzer', 'features__ngram_counts__vector__binary', 'features__ngram_counts__vector__decode_error', 'features__ngram_counts__vector__dtype', 'features__ngram_counts__vector__encoding', 'features__ngram_counts__vector__input', 'features__ngram_counts__vector__lowercase', 'features__ngram_counts__vector__max_df', 'features__ngram_counts__vector__max_features', 'features__ngram_counts__vector__min_df', 'features__ngram_counts__vector__ngram_range', 'features__ngram_counts__vector__preprocessor', 'features__ngram_counts__vector__stop_words', 'features__ngram_cou

In [124]:
# Search space. Each key is a "<step>__<param>" path into `pipeline`;
# the commented-out entries are candidate settings that are not searched.
parameters = {
    # 'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    # (1, 1) = unigrams only; (1, 3) = unigrams through trigrams
    'features__ngram_counts__vector__ngram_range': [(1, 1), (1, 3)],
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'selectk__score_func': [f_classif, chi2],
    'selectk__k': [100, 1000],
    #'clf__max_iter': (20,),
    #'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

# 5-fold cross-validated search over the grid, run sequentially.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=1,
    verbose=1,
)

In [None]:
from time import time

# Run the search on the complete corpus and time it.
started = time()
grid_search.fit(excerpts, data.ACCOUNT)
elapsed = time() - started
print("done in %0.3fs" % elapsed)
print()

# Report the best cross-validated score and the winning value of each
# searched parameter, in alphabetical key order.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    chosen = best_parameters[param_name]
    print("\t%s: %r" % (param_name, chosen))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
