# DS-SF-27 | Codealong 16 | Introduction to Natural Language Processing

## >>> One-time setup

In [1]:
'''
import nltk
nltk.download()
'''

pass

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [3]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [4]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [5]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [6]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [7]:
df = pd.read_csv(os.path.join('..', 'datasets', 'reviews_Books_5-0316228532.csv'))

In [8]:
df

Unnamed: 0,asin,reviewer_id,reviewer_name,summary,review_text,overall,review_time,unix_review_time,helpful
0,316228532,AY2UIGHCB4VPB,,but a good read!,"A departure for her, but a good read!",5,"07 12, 2014",1405123200,"[0, 0]"
1,316228532,A2L17U0TWH9UWS,1075,Not worth the time,I had a hard time remembering who each charact...,2,"11 12, 2013",1384214400,"[0, 1]"
2,316228532,A2R63TBVG5OAF6,12121,The Casual Vacancy,This is the only review I have ever written. ...,1,"10 1, 2012",1349049600,"[13, 25]"
3,316228532,ACU39L9G696US,123esmo,Expecting more from J.K. Rowling,"I was expecting more from J.K. Rowling, it's a...",2,"01 10, 2013",1357776000,"[0, 1]"
4,316228532,A3N7KY1PBMF880,&#34;Bad Cat!&#34;,Sorry That I Bought It.,As big a fan as I am of J K Rowling's Harry Po...,1,"05 11, 2013",1368230400,"[0, 3]"
...,...,...,...,...,...,...,...,...,...
2045,316228532,A1SCYWLS37YR50,ZC,Spectacular prose in a rambling story,Spectacular prose in a rambling story that see...,5,"02 12, 2014",1392163200,"[1, 1]"
2046,316228532,A1POFVVXUZR3IQ,Z Hayes,"Difficult to get into, but has its moments",Although I am a great fan of the Harry Potter ...,3,"07 18, 2013",1374105600,"[1, 1]"
2047,316228532,A1YSU2VSUJZAR5,zolteg59,The Casual Vacancy,"While the story was intriguing, and I am a hug...",1,"11 11, 2012",1352592000,"[0, 1]"
2048,316228532,A2ZF888HX9YR8E,Zoobeefoo,A better read for Brits perhaps?,What an odd book! The adolescent characters a...,3,"12 30, 2012",1356825600,"[2, 3]"


In [9]:
df.drop(['asin', 'reviewer_id', 'reviewer_name', 'summary', 'review_time', 'unix_review_time', 'helpful'],
    axis = 1,
    inplace = True)

In [10]:
df

Unnamed: 0,review_text,overall
0,"A departure for her, but a good read!",5
1,I had a hard time remembering who each charact...,2
2,This is the only review I have ever written. ...,1
3,"I was expecting more from J.K. Rowling, it's a...",2
4,As big a fan as I am of J K Rowling's Harry Po...,1
...,...,...
2045,Spectacular prose in a rambling story that see...,5
2046,Although I am a great fan of the Harry Potter ...,3
2047,"While the story was intriguing, and I am a hug...",1
2048,What an odd book! The adolescent characters a...,3


In [11]:
df.overall.value_counts(dropna = False)

4    464
5    457
3    397
2    373
1    359
Name: overall, dtype: int64

In [12]:
# TODO
# check empty string
df.isnull().sum()

review_text    0
overall        0
dtype: int64

In [13]:
#define feature matrix
X = df.review_text
c = df.overall

In [14]:
#todo
c

0       5
1       2
2       1
3       2
4       1
       ..
2045    5
2046    3
2047    1
2048    3
2049    5
Name: overall, dtype: int64

In [15]:
#to do
X

0                   A departure for her, but a good read!
1       I had a hard time remembering who each charact...
2       This is the only review I have ever written.  ...
3       I was expecting more from J.K. Rowling, it's a...
4       As big a fan as I am of J K Rowling's Harry Po...
                              ...                        
2045    Spectacular prose in a rambling story that see...
2046    Although I am a great fan of the Harry Potter ...
2047    While the story was intriguing, and I am a hug...
2048    What an odd book!  The adolescent characters a...
2049    Chatty and immediately comfortable to read. It...
Name: review_text, dtype: object

## Train/test sets

In [16]:
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, train_size = .6, random_state = 0)

In [17]:
train_X

970     Don't let anyone kid you: there's magic in J.K...
494     JK Rowling can write, no one doubts that.  But...
580     Just because everything has been said about a ...
1038    I was initially intruiged by the summary on th...
1185    An excellent story line. Well written with goo...
                              ...                        
1383    I loved the writing in Harry Potter and I am n...
1731    Even though I knew this would be nothing like ...
763     It took me a while to start to get the large c...
835     Try to think of this is a &#34;first novel,&#3...
1653    (yes, I am a Harry Potter fan)I found the pola...
Name: review_text, dtype: object

## TF-IDF and `TfidfVectorizer`

In [18]:
# TODO
# similar to scaler
'''
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')
'''

class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()

    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens

vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1, 3), min_df = 3)

In [19]:
vectorizer.fit(train_X)

pass
# going to create collumn for each
# fit train the data

## Bag-of-words

In [20]:
#check which words he memorized
# it has too much noise
vectorizer.get_feature_names()

[u'1',
 u'1 star',
 u'10',
 u'100',
 u'100 page',
 u'12',
 u'12 star',
 u'13',
 u'14',
 u'15',
 u'150',
 u'150 page',
 u'16',
 u'1799',
 u'18',
 u'1984',
 u'19th',
 u'1star',
 u'1star review',
 u'2',
 u'2 star',
 u'20',
 u'200',
 u'200 page',
 u'2012',
 u'23',
 u'25',
 u'3',
 u'3 star',
 u'30',
 u'300',
 u'300 page',
 u'34',
 u'34 34',
 u'34 adult',
 u'34 adult 34',
 u'34 bad',
 u'34 book',
 u'34 casual',
 u'34 casual vacanc',
 u'34 charact',
 u'34 enjoy',
 u'34 f',
 u'34 f 34',
 u'34 get',
 u'34 good',
 u'34 harri',
 u'34 harri potter',
 u'34 novel',
 u'34 plot',
 u'34 real',
 u'34 town',
 u'34 word',
 u'35',
 u'35 star',
 u'3rd',
 u'4',
 u'4 5',
 u'4 5 star',
 u'4 letter',
 u'4 letter word',
 u'4 star',
 u'40',
 u'400',
 u'400 page',
 u'45',
 u'5',
 u'5 star',
 u'50',
 u'50 page',
 u'500',
 u'500 page',
 u'503',
 u'503 page',
 u'512',
 u'6',
 u'60',
 u'7',
 u'70',
 u'8',
 u'80',
 u'8211',
 u'8217',
 u'8217 one',
 u'8217 read',
 u'8220',
 u'8221',
 u'abandon',
 u'abbey',
 u'abil',
 u'

## Transform the feature matrix `X`

In [21]:
# TODO
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

In [22]:
train_X
#spare matrix is emply matrix

<1230x6272 sparse matrix of type '<type 'numpy.float64'>'
	with 79917 stored elements in Compressed Sparse Row format>

In [23]:
train_X.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.06615307],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.23061146, ...,  0.        ,
          0.        ,  0.        ]])

In [24]:
# change model here!
#model = linear_model.LogisticRegression()
# random forest and run all the below - check the accuracy
#2nd Random Forest
model = ensemble.RandomForestClassifier(n_estimators = 10)

In [25]:
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.32769126760262807

In [26]:
train_c_hat = cross_validation.cross_val_predict(model, train_X, train_c, cv = 5)

In [27]:
metrics.accuracy_score(train_c, train_c_hat)

0.29268292682926828

In [28]:
pd.crosstab(train_c_hat, train_c, rownames = ['Predicted'], colnames = ['True'])

True,1,2,3,4,5
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,92,64,40,24,24
2,48,55,44,31,33
3,37,54,60,62,55
4,19,22,60,71,82
5,18,29,40,84,82


In [29]:
model.fit(train_X, train_c)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
model.score(train_X, train_c)

0.99349593495934962

In [31]:
model.score(test_X, test_c)

0.32804878048780489

In [32]:
#Randon Forest

In [33]:
model = ensemble.RandomForestClassifier(n_estimators = 10)
#for ten tres

In [34]:
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.32190013458573002

In [35]:
model = ensemble.RandomForestClassifier(n_estimators = 100)

In [36]:
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.3910050984515766

In [37]:
#change the ratings
X = df.review_text
c = df.overall.map({1: -1, 2:-1, 3:0, 4: 1, 5: 1})

In [38]:
c

0       1
1      -1
2      -1
3      -1
4      -1
       ..
2045    1
2046    0
2047   -1
2048    0
2049    1
Name: overall, dtype: int64

## Machine Learning Modeling

> # TODO...

In [39]:
#randon forests
#checking for feature importance see answer in the answer key
# there is a lot of redundant words

In [40]:
class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()

    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens

In [41]:
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1, 3), min_df = 3)

In [42]:
vectorizer.fit(train_X)

AttributeError: lower not found