In [1]:
#Packages Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Text Preprocessing
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords') #Downloading stopWords
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

#ignore the warnings from sklearn
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/b0201655/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. IMPORTING THE DATA 

In [2]:
df = pd.read_csv('/Users/b0201655/Downloads/archive/kindle_reviews.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 982619 entries, 0 to 982618
Data columns (total 10 columns):
Unnamed: 0        982619 non-null int64
asin              982619 non-null object
helpful           982619 non-null object
overall           982619 non-null int64
reviewText        982597 non-null object
reviewTime        982619 non-null object
reviewerID        982619 non-null object
reviewerName      978803 non-null object
summary           982618 non-null object
unixReviewTime    982619 non-null int64
dtypes: int64(3), object(7)
memory usage: 75.0+ MB


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [5]:
df.drop(df.columns[0], axis = 1, inplace = True)

#drop the rows where there are no reviews
df.dropna(subset = ['reviewText'], inplace = True)

#changing the reviewTime column to be of datetime type
df.reviewTime = pd.to_datetime(df.reviewTime)

#creating a column with just the year
# df['Year'] = df.reviewTime.dt.year
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,2014-05-05,A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,2014-01-06,AN0N05A9LIJEQ,critters,Different...,1388966400
2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,2014-04-04,A795DMNCJILA6,dot,Oldie,1396569600
3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,2014-02-19,A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...",2014-03-19,A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [6]:
reviews = df[['reviewText', 'overall']]
reviews.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,5
1,This book is a reissue of an old one; the auth...,4
2,This was a fairly interesting read. It had ol...,4
3,I'd never read any of the Amy Brewster mysteri...,5
4,"If you like period pieces - clothing, lingo, y...",4


# 2. Remove punctuations, special characters and stopwords from the text column. Convert the text to lower case. 

In [7]:
print('Actual Data :' + str(reviews['reviewText'][1]))
print('\n')

#Removing punctuations
punc = str.maketrans('', '', string.punctuation)

reviews['reviewText'] = reviews['reviewText'].apply(lambda x : ' '.join(word.translate(punc) for word in x.split()))
print('Data after removing punctuations: ' + str(reviews['reviewText'][1]))
print('\n')

#Removing non alpha character(Special Characters)
reviews['reviewText'] = reviews['reviewText'].apply(lambda x: ' '.join(word for word in x.split() if word.isalpha()))
print('Data after removing special Characters: '+ str(reviews['reviewText'][1]))
print('\n')

#Lemmatize words to reduce them to their root form. Note: added the pos = 'v' to reduce the incoming word to verb root
# nltk.download('wordnet')
# lem = WordNetLemmatizer()
# reviews['reviewText'] = reviews['reviewText'].apply(lambda x : ' '.join(lem.lemmatize(word, pos = 'v') for word in x.split()))
# print('Lemmatized Text: ' + str(reviews['reviewText'][1]))
# print('\n')

#Removing stopwords

stop = stopwords.words('english')
reviews['reviewText'] = reviews['reviewText'].apply(lambda x : ' '.join(word for word in x.split() if word not in stop))
print('Data after removing Stopwords: ' + str(reviews['reviewText'][1]))
print('\n')

#Converting the text to lower case. 
reviews['reviewText'] = reviews['reviewText'].apply(lambda x: ' '.join(word.lower() for word in x.split()))
print('Data after converting to lowercase : '+ str(reviews['reviewText'][1]))


Actual Data :This book is a reissue of an old one; the author was born in 1910. It's of the era of, say, Nero Wolfe. The introduction was quite interesting, explaining who the author was and why he's been forgotten; I'd never heard of him.The language is a little dated at times, like calling a gun a &#34;heater.&#34;  I also made good use of my Fire's dictionary to look up words like &#34;deshabille&#34; and &#34;Canarsie.&#34; Still, it was well worth a look-see.


Data after removing punctuations: This book is a reissue of an old one the author was born in 1910 Its of the era of say Nero Wolfe The introduction was quite interesting explaining who the author was and why hes been forgotten Id never heard of himThe language is a little dated at times like calling a gun a 34heater34 I also made good use of my Fires dictionary to look up words like 34deshabille34 and 34Canarsie34 Still it was well worth a looksee


Data after removing special Characters: This book is a reissue of an old o

# 3.Create two objects X and y. X will be the ' reviewText ' column dataframe and y will be the “Overall Rating” column. create a CountVectorizer object and split the data into training and testing sets. Train a MultinomialNB model and Display the confusion Matrix

In [8]:
#balancing the training data by downsampling to the minority class given there is a good amount of data
def make_xy(data, vec, n):
    
    temp = pd.DataFrame()
    #sampling only n class reviews per class
    for rating in range(2):
        temp = pd.concat([temp, data[data.overall == rating].sample(n, random_state = 42)], ignore_index = True)
    #vectorizing the vocabulary
    X = vec.fit_transform(temp.reviewText)
    y = temp.overall
    return X, y

In [9]:
#separating the ratings to different sentiment 
r1 = reviews[reviews.overall.isin([3,4,5])]
r0 = reviews[reviews.overall.isin([1,2])]
r1.loc[:, 'overall'] = 1
r0.loc[:, 'overall'] = 0

#concat the two new dataframes return one dataframe with preprocessed text and their corresponding labels
rev = pd.concat([r1,r0])
rev.head()
rev.overall.value_counts()


1    925449
0     57148
Name: overall, dtype: int64

In [10]:
#using CountVectorizer
count = CountVectorizer()
X, y = make_xy(rev, count, 20000)

In [11]:
#testing the model with CountVectorizer
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 42, stratify = y)

naive = MultinomialNB()
naive.fit(X_train, y_train)
print('Training Accuracy w/ CountVectorizer: {:.2f}'.format(naive.score(X_train, y_train)))
print('Testing Accuracy w/ CountVectorizer: {:.2f}'.format(naive.score(X_test, y_test)))

Training Accuracy w/ CountVectorizer: 0.92
Testing Accuracy w/ CountVectorizer: 0.88


In [13]:
nb_pred=naive.predict(X_test)
print(confusion_matrix(y_test, nb_pred))

[[5510  490]
 [ 956 5044]]


# 4. Display the POS tagging on the first 4 rows of ‘reviewText’ 

In [14]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english')) 
for x in range(1, 5):
    reviewText=str(reviews['reviewText'][x])
    tokenized = sent_tokenize(reviewText) 
    for i in tokenized: 
      
        # Word tokenizers is used to find the words  
        # and punctuation in a string 
        wordsList = nltk.word_tokenize(i) 
  
        # removing stop words from wordList 
        wordsList = [w for w in wordsList if not w in stop_words]  
  
        #  Using a Tagger. Which is part-of-speech  
        # tagger or POS-tagger.  
        tagged = nltk.pos_tag(wordsList) 
  
        print('Review Text::'+ reviewText ) 
        
        print('POS Tagging::'+ str(tagged))
        print('\n')

[nltk_data] Downloading package punkt to /Users/b0201655/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/b0201655/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Review Text::this book reissue old one author born its era say nero wolfe the introduction quite interesting explaining author hes forgotten id never heard himthe language little dated times like calling gun i also made good use fires dictionary look words like still well worth looksee
POS Tagging::[('book', 'NN'), ('reissue', 'NN'), ('old', 'JJ'), ('one', 'CD'), ('author', 'NN'), ('born', 'VBN'), ('era', 'NNS'), ('say', 'VBP'), ('nero', 'JJ'), ('wolfe', 'JJ'), ('introduction', 'NN'), ('quite', 'RB'), ('interesting', 'JJ'), ('explaining', 'VBG'), ('author', 'NN'), ('hes', 'NNS'), ('forgotten', 'VBP'), ('id', 'JJ'), ('never', 'RB'), ('heard', 'VBP'), ('himthe', 'JJ'), ('language', 'NN'), ('little', 'RB'), ('dated', 'JJ'), ('times', 'NNS'), ('like', 'IN'), ('calling', 'VBG'), ('gun', 'NN'), ('also', 'RB'), ('made', 'VBD'), ('good', 'JJ'), ('use', 'NN'), ('fires', 'NNS'), ('dictionary', 'JJ'), ('look', 'VBP'), ('words', 'NNS'), ('like', 'IN'), ('still', 'RB'), ('well', 'RB'), ('worth', 'J

# 5.Build and display a dependency parser tree for the sentence

“When Jon Snow is stranded north of the Wall, half-frozen and under attack by wights, Benjen rides in and puts Jon on his horse."

In [15]:
def remove_punctuation(txt):
  txt_removepunct= "".join([ a for a in txt if a not in string.punctuation])
  return txt_removepunct

import spacy
from nltk import Tree

loadspacy = spacy.load('en')

st="When Jon Snow is stranded north of the Wall, half-frozen and under attack by wights, Benjen rides in and puts Jon on his horse."

spacydoc = loadspacy(remove_punctuation(st))

def dependencyParseTree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [dependencyParseTree(child) for child in node.children])
    else:
        return node.orth_

[dependencyParseTree(sent.root).pretty_print() for sent in spacydoc.sents]   

                        rides                                                     
   _______________________|______________________________________________          
  |     |   |                 stranded                                   |        
  |     |   |    ________________|_______                                |         
  |     |   |   |    |    |            north                             |        
  |     |   |   |    |    |              |                               |         
  |     |   |   |    |    |              of                              |        
  |     |   |   |    |    |       _______|_____________________          |         
  |     |   |   |    |    |      |               |           under      puts      
  |     |   |   |    |    |      |               |             |      ___|_____    
  |     |   |   |    |    |      |               |           attack  |         on 
  |     |   |   |    |    |      |               |             |     |         |  

[None]