In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize
from nltk.probability import FreqDist
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

import pandas as pd
import numpy as np
import scipy 
import sklearn
import random
from pprint import pprint
import string

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.svm import SVC, NuSVC, LinearSVC

from sklearn import metrics






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def getData(path = "BookReviews.csv", random_state = None):
    """Load the review CSV, shuffle its rows and drop the stray "Unnamed" columns.

    Parameters
    ----------
    path : str or path-like, default "BookReviews.csv"
        Location of the CSV file to read.
    random_state : int, numpy random state or None, default None
        Seed forwarded to ``DataFrame.sample``. None leaves the shuffle
        unseeded (pandas' default); pass an int to get the same row order,
        and therefore the same downstream train/test split, on every run.

    Returns
    -------
    pandas.DataFrame
        Every row of the file in shuffled order, re-indexed 0..n-1, without
        any column whose name starts with "Unnamed".
    """
    # Without the "encoding" parameter read_csv threw a "Unicode Decode Error".
    # Help found at:
    # https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python
    data = pd.read_csv(path, encoding = 'latin1')

    # df.sample() returns a random sample of the rows; frac = 1 samples the
    # entire df, i.e. shuffles it. reset_index(drop = True) then renumbers the
    # rows and prevents the old index from being stored as a column.
    # Help found at:
    # https://stackoverflow.com/questions/29576430/shuffle-dataframe-rows
    data = data.sample(frac = 1, random_state = random_state).reset_index(drop = True)

    # The file had several (about 4) columns titled "Unnamed..."; keep the rest.
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]

    return data
    
    

In [3]:
def addPOS(df, ps):
    """Lower-case every review and convert it to a list of (stem, POS tag) pairs.

    Parameters
    ----------
    df : pandas.Series
        The raw review texts (despite the name, a single column, not a frame).
    ps : stemmer
        Object with a ``stem(word)`` method; it is handed on to ``tag_txt``.

    Returns
    -------
    pandas.Series
        One list of ``(stem, pos_tag)`` tuples per review, carrying the same
        index as ``df`` so it can be assigned straight back as a new column.
    """
    taggedTxt = []

    # Iterate over the values themselves. The earlier `df.iloc[i] for i in
    # df.index` mixed index labels with positions and was only correct for a
    # 0..n-1 index.
    for raw in df:
        # A cell with no text comes back from pandas as a float NaN (this is
        # where the unexpected floats came from). Treat it as an empty review
        # instead of tagging the literal word "nan".
        txt = "" if pd.isna(raw) else str(raw)
        taggedTxt.append(tag_txt(txt.lower(), ps))

    return pd.Series(taggedTxt, index = df.index)

In [4]:
# This is just nltk's English stop words with a couple of things deleted, like
# "didn't", "couldn't" and their variations. Built once as a frozenset so each
# membership test is O(1) instead of a scan over the whole list on every word.
_CUST_STOP_WORDS = frozenset([
    'to', 'further', 'ma', 'a', 'no', 'or', 'ours', 'once', 'before', 'out', "doesn't",
    've', 'm', "you've", "needn't", 'you', 'not', 'so', 'off', 'under', 'most', 'which',
    'more', 'ourselves', 'about', 'down', 'isn', 'they', 'his', "she's", 'only', 'how',
    'had', 'again', 'by', 'after', 'shan', 'their', 'some', "hasn't", 'mustn', 'yours',
    'is', 'who', 'we', 'because', "you'll", 'it', 'has', 'both', 'here', "don't", 'than',
    'through', 'any', 'did', 'its', 'own', 'being', 'all', 'yourself', 'needn', 'd', 'o',
    "weren't", 'itself', 'what', 're', 'my', 'there', 'ain', 'i', "isn't", "aren't", 'if',
    'll', 'wasn', 'of', 'your', 'an', 'over', 'wouldn', 'y', "mightn't", 'between', 'mightn',
    "hadn't", 's', 'on', 'while', 'from', 'have', "shan't", 'then', "mustn't", 'will', 'below',
    'where', 'been', 'same', 'don', 'myself', 'until', 'other', 'doesn', 'but', 'above', 'can',
    'for', 'and', 'against', "you'd", 'him', 'does', 'into', 'are', 'these', 'few', 'himself',
    'aren', "wasn't", 'at', 'too', "should've", 'should', 'those', "that'll", 'me', 'hasn', 'shouldn',
    'themselves', 'weren', 'our', 'as', 'be', "it's", 'the', 'was', 'up', 'hadn', 'am',
    'this', 'yourselves', 'that', "you're", 'having', 'each', 'do', 'she', 'them', 'very',
    'nor', 'he', 'whom', 'now', 'won', 'during', 'her', 'hers', 'were', 'just', 'with',
    'why', "wouldn't", 'when', 'herself', "won't", "shouldn't", 'such', 'doing', 'in', 'theirs',
])

# POS tags that are thrown away: proper nouns, pronouns and wh-words.
_TRASH_POS_TAGS = frozenset(['NNP', 'NNPS', 'PRP', 'PRP$', 'WP$', 'WP', 'WDT'])

_PUNCTUATION_CHARS = frozenset(string.punctuation)


def tag_txt(txt, ps):
    """Turn a string of raw text into a flat list of (stem, POS tag) pairs.

    Processing, sentence by sentence:
      1. split ``txt`` into sentences, then each sentence into word tokens;
      2. drop tokens made up solely of punctuation characters;
      3. POS-tag what is left;
      4. drop stop words (exact, case-sensitive match against the custom list)
         and every token whose tag is in the trash-tag set;
      5. stem the surviving words with ``ps``.

    Parameters
    ----------
    txt : str
        The raw text. It is not lower-cased here.
    ps : stemmer
        Object with a ``stem(word)`` method.

    Returns
    -------
    list of (str, str)
        ``(stemmed_word, pos_tag)`` tuples in reading order; empty for empty text.
    """
    taggedLst = []

    for sentence in PunktSentenceTokenizer().tokenize(txt):  # tokenized into sentences
        # `tok not in string.punctuation` was a *substring* test, so multi-
        # character punctuation tokens such as "...", "``", "''" or "--" were
        # kept. Test every character instead.
        tokens = [tok for tok in word_tokenize(sentence)
                  if not all(ch in _PUNCTUATION_CHARS for ch in tok)]

        # nltk.pos_tag creates a list of (word, posTag) pairs
        for word, pos in nltk.pos_tag(tokens):
            if word in _CUST_STOP_WORDS or pos in _TRASH_POS_TAGS:
                continue  # stop word, proper noun, pronoun, etc.
            taggedLst.append((ps.stem(word), pos))

    return taggedLst

In [5]:
def getFeats(series):
    """Flatten each list of (stem, tag) pairs into one space-separated string of stems.

    Parameters
    ----------
    series : pandas.Series
        Each element is an iterable of tuples whose first item is the stem.

    Returns
    -------
    pandas.Series
        One string per element, with every stem followed by a single space
        (so a non-empty result ends in a space and an empty list gives "").
        The result carries the same index as ``series``.
    """
    # Iterate over the values directly. The earlier `series.iloc[i] for i in
    # series.index` mixed labels with positions and only worked on a 0..n-1 index.
    valList = ["".join(str(w[0]) + " " for w in sample) for sample in series]

    # Keep the caller's index so that assigning the result back as a column
    # lines up row for row.
    return pd.Series(valList, index = series.index)


In [6]:

# One stemmer instance is shared by every review that gets processed below.
ps = PorterStemmer()

# Shuffled review table, as produced by getData().
reviews = getData()


In [7]:
# addPOS() takes the 'text' Series and returns a Series of processed, tagged
# words, which is stored as the new "pos_stem" column of the df.
reviews["pos_stem"] = addPOS(reviews["text"], ps)



In [8]:
# getFeats() joins the stems of each tagged review into one string, stored as
# the new "stemTxtCol" column.
reviews["stemTxtCol"] = getFeats(reviews["pos_stem"])

In [9]:
# Stemmed review strings in row order. Taken straight from the column; the
# earlier `.iloc[i] for i in .index` mixed index labels with positions and was
# only correct for a 0..n-1 index.
txtList = reviews.loc[:, "stemTxtCol"].tolist()

# The first three quarters of the rows are used for training, the rest for testing.
piv = (len(txtList) // 4) * 3

trainTxt = txtList[:piv]
testTxt = txtList[piv:]

# Target vector: 1.0 where the "label" column equals "i", 0.0 everywhere else.
# One vectorised comparison replaces a Python loop that built a row Series per review.
targArr = (reviews.loc[:, "label"] == "i").values.astype(float)

trTarg = targArr[:piv]
tsTarg = targArr[piv:]
        

    


# Decision Trees, SGD, Multinomial Naive Bayes

## Results have been promising  ^.^



In [None]:
# Hyper-parameter space sampled by the randomized search below.
dtParams =  {'vect__ngram_range': [(1, i+2) for i in range(4)],
            'clf__max_depth': [i+2 for i in range(15)]+[None],
            "clf__min_samples_leaf": [(i+1) for i in range(10)], 
            "clf__max_features": [(i+5) for i in range(10)]+[None]}

# Bag of n-grams -> tf-idf weighting -> decision tree. The tree is seeded so
# that repeated runs on the same data build the same tree.
dt_clf = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', DecisionTreeClassifier(random_state = 42))])

randSearch = RandomizedSearchCV(estimator = dt_clf, 
                   param_distributions = dtParams,
                   n_iter = 100, cv = 3, verbose=2,
                   random_state=42, n_jobs = -1)
# Fit the random search model
randSearch.fit(trainTxt, trTarg)

# With the default refit=True the search has already retrained the winning
# pipeline on the whole training set, so best_estimator_ is ready to predict;
# fitting it a second time only repeated that work.
best_random = randSearch.best_estimator_

predicted = best_random.predict(testTxt)


print("Best score from randSearch:")
print(randSearch.best_score_)
print()
print("Best params:")
print(randSearch.best_params_)
print()

print("classification report")
print(metrics.classification_report(tsTarg, predicted))
print()
print("confusion mtx")
print(metrics.confusion_matrix(tsTarg, predicted))


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   36.2s finished


Best score from randSearch:
0.782997762864

Best params:
{'vect__ngram_range': (1, 2), 'clf__min_samples_leaf': 3, 'clf__max_features': None, 'clf__max_depth': 2}

classification report
             precision    recall  f1-score   support

        0.0       0.85      0.99      0.91       112
        1.0       0.95      0.47      0.63        38

avg / total       0.87      0.86      0.84       150


confusion mtx
[[111   1]
 [ 20  18]]


In [None]:
# Hyper-parameter space sampled by the randomized search below.
sgdParams = {'vect__max_features': [i+5 for i in range(15)]+[None],
             'vect__ngram_range': [(1, i+2) for i in range(4)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3)}

# Bag of n-grams -> tf-idf weighting -> linear model trained with SGD. The
# classifier is seeded because SGD shuffles the training data between epochs.
sgd_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(random_state = 42))])

randSearch = RandomizedSearchCV(estimator = sgd_clf, 
                   param_distributions = sgdParams,
                   n_iter = 100, cv = 3, verbose=2,
                   random_state=42, n_jobs = -1)
# Fit the random search model
randSearch.fit(trainTxt, trTarg)

# With the default refit=True the search has already retrained the winning
# pipeline on the whole training set, so best_estimator_ is ready to predict;
# fitting it a second time only repeated that work.
best_random = randSearch.best_estimator_

predicted = best_random.predict(testTxt)
print("Best score from randSearch:")
print(randSearch.best_score_)
print()
print("Best params:")
print(randSearch.best_params_)
print()

print("classification report")
print(metrics.classification_report(tsTarg, predicted))
print()
print("confusion mtx")
print(metrics.confusion_matrix(tsTarg, predicted))


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   16.1s


In [None]:
# Hyper-parameter space sampled by the randomized search below.
# alpha runs over 0.1, 0.2, ..., 1.0. The earlier np.arange(0, 1.1, 0.1) also
# contained alpha = 0, i.e. no smoothing at all, which scikit-learn either
# warns about and clips or turns into log(0) terms, depending on the version.
mnbParams = {"vect__ngram_range": [(1, 1), (1, 2), (1,3), (1,4)],
            'vect__max_features': [i+15 for i in range(10)],
            "tfidf__use_idf": (True, False),
            "clf__alpha": np.linspace(0.1, 1.0, 10)}

# Bag of n-grams -> tf-idf weighting -> multinomial naive Bayes.
mnb_clf = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

randSearch = RandomizedSearchCV(estimator = mnb_clf, 
                   param_distributions = mnbParams,
                   n_iter = 100, cv = 3, verbose=2,
                   random_state=42, n_jobs = -1)
# Fit the random search model
randSearch.fit(trainTxt, trTarg)

# With the default refit=True the search has already retrained the winning
# pipeline on the whole training set, so best_estimator_ is ready to predict;
# fitting it a second time only repeated that work.
best_random = randSearch.best_estimator_

predicted = best_random.predict(testTxt)


print("Best score from randSearch:")
print(randSearch.best_score_)
print()
print("Best params:")
print(randSearch.best_params_)
print()
print("classification report")
print(metrics.classification_report(tsTarg, predicted))
print()
print("confusion mtx")
print(metrics.confusion_matrix(tsTarg, predicted))



# Random Forest

In [None]:


# Hyper-parameter space sampled by the randomized search below. `rfParams` was
# used without ever being defined, so this cell raised a NameError on a fresh
# kernel. NOTE(review): these ranges are a starting point modelled on the
# decision-tree and naive-Bayes grids in this notebook; tune as needed.
rfParams = {"vect__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4)],
            "tfidf__use_idf": (True, False),
            "clf__n_estimators": [50, 100, 200, 400],
            "clf__max_depth": [i+2 for i in range(15)]+[None],
            "clf__min_samples_leaf": [(i+1) for i in range(10)]}

# Bag of n-grams -> tf-idf weighting -> random forest. The forest is seeded so
# that repeated runs on the same data grow the same trees.
rf_clf = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(random_state = 42))])

randSearch = RandomizedSearchCV(estimator = rf_clf, 
                   param_distributions = rfParams,
                   n_iter = 100, cv = 3, verbose=2,
                   random_state=42, n_jobs = -1)
# Fit the random search model
randSearch.fit(trainTxt, trTarg)

# With the default refit=True the search has already retrained the winning
# pipeline on the whole training set, so best_estimator_ is ready to predict;
# fitting it a second time only repeated that work.
best_random = randSearch.best_estimator_

predicted = best_random.predict(testTxt)


print("Best score from randSearch:")
print(randSearch.best_score_)
print()
print("Best params:")
print(randSearch.best_params_)
print()

print("classification report")
print(metrics.classification_report(tsTarg, predicted))
print()
print("confusion mtx")
print(metrics.confusion_matrix(tsTarg, predicted))


# C-SVM Classification

## This one's weird.

...Yeah, so it's probably overfitted because of C. It might be worth keeping around and playing with the parameters. ¯\\\_(ツ)\_/¯  I grabbed this from [statsoft](http://www.statsoft.com/textbook/support-vector-machines):

<img src = "csvm.png">




In [26]:
# Grid explored exhaustively by the search below.
# NOTE(review): SVC() defaults to the RBF kernel, which ignores `degree`, so
# the "clf__degree" axis only multiplies the number of fits by four without
# changing any model. Either switch the classifier to kernel="poly" or search
# over C / gamma instead. Left as-is pending a decision.
svcParams = {"vect__ngram_range": [(1, 1), (1, 2), (1,3), (1,4)],
            'vect__max_features': [i+15 for i in range(10)],
            "tfidf__use_idf": (True, False), 
            "clf__degree": [i+1 for i in range(4)]}

# Bag of n-grams -> tf-idf weighting -> C-support vector classifier. Kept
# under its own name so `svc_clf` always refers to the grid search itself.
svc_pipe = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC())])

# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises a TypeError. Current versions always average the
# fold scores the way iid=False used to.
svc_clf = GridSearchCV(svc_pipe, svcParams,
                      cv=5, n_jobs=-1)
svc_clf = svc_clf.fit(trainTxt, trTarg)
print(svc_clf.best_score_)
print(svc_clf.best_params_) 

# With the default refit=True the search has already retrained the winning
# pipeline on the whole training set, so best_estimator_ is ready to predict;
# fitting it a second time only repeated that work.
best = svc_clf.best_estimator_

predicted = best.predict(testTxt)
print()
print()
# Plain accuracy on the held-out quarter.
print(np.mean(predicted == tsTarg))
print()
print("classification report")
print(metrics.classification_report(tsTarg, predicted))
print()
print("confusion mtx")
print(metrics.confusion_matrix(tsTarg, predicted))


0.686750085121
{'clf__degree': 1, 'tfidf__use_idf': True, 'vect__max_features': 18, 'vect__ngram_range': (1, 1)}


0.733333333333

classification report
             precision    recall  f1-score   support

        0.0       0.71      1.00      0.83        99
        1.0       1.00      0.22      0.35        51

avg / total       0.81      0.73      0.67       150


confusion mtx
[[99  0]
 [40 11]]
