# DS-SF-27 | Codealong 16 | Introduction to Natural Language Processing

## >>> One-time setup

In [1]:
# One-time setup: uncomment the two lines below and run this cell once
# to open the NLTK downloader.
#
# import nltk
# nltk.download()

pass

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [3]:
def tokenize_text(document):
    """Turn a raw text document into a list of cleaned, lowercase tokens.

    Steps, in order: encode to UTF-8, lowercase, split into word tokens with
    NLTK's ``word_tokenize``, strip punctuation characters out of every
    token, drop tokens that end up empty, and drop English stop words.

    Parameters
    ----------
    document : str
        The text to tokenize.

    Returns
    -------
    list
        The surviving tokens, in their original order.
    """
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    # NOTE(review): the two-argument translate(None, chars) form is the
    # Python 2 byte-string API -- confirm before running on another kernel.
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words.  The stop-word list is loaded once per call and kept
    # in a set, instead of being re-read from the corpus for every token.
    stop_words = set(corpus.stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return tokens

In [4]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [5]:
class Stemmer:
    """Namespace holding one shared Porter stemmer and a helper that applies it."""

    # Single stemmer instance, created when the class body runs and reused
    # by every call to stem_tokens.
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        """Return a new list containing the Porter stem of each token, in order."""
        stemmed = []
        for word in tokens:
            stemmed.append(Stemmer.stemmer.stem(word))
        return stemmed

In [6]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [7]:
# Load the reviews from ../datasets (path is relative to the current working directory)
df = pd.read_csv(os.path.join('..', 'datasets', 'reviews_Books_5-0316228532.csv'))

In [8]:
df

Unnamed: 0,asin,reviewer_id,reviewer_name,summary,review_text,overall,review_time,unix_review_time,helpful
0,316228532,AY2UIGHCB4VPB,,but a good read!,"A departure for her, but a good read!",5,"07 12, 2014",1405123200,"[0, 0]"
1,316228532,A2L17U0TWH9UWS,1075,Not worth the time,I had a hard time remembering who each charact...,2,"11 12, 2013",1384214400,"[0, 1]"
2,316228532,A2R63TBVG5OAF6,12121,The Casual Vacancy,This is the only review I have ever written. ...,1,"10 1, 2012",1349049600,"[13, 25]"
3,316228532,ACU39L9G696US,123esmo,Expecting more from J.K. Rowling,"I was expecting more from J.K. Rowling, it's a...",2,"01 10, 2013",1357776000,"[0, 1]"
4,316228532,A3N7KY1PBMF880,&#34;Bad Cat!&#34;,Sorry That I Bought It.,As big a fan as I am of J K Rowling's Harry Po...,1,"05 11, 2013",1368230400,"[0, 3]"
...,...,...,...,...,...,...,...,...,...
2045,316228532,A1SCYWLS37YR50,ZC,Spectacular prose in a rambling story,Spectacular prose in a rambling story that see...,5,"02 12, 2014",1392163200,"[1, 1]"
2046,316228532,A1POFVVXUZR3IQ,Z Hayes,"Difficult to get into, but has its moments",Although I am a great fan of the Harry Potter ...,3,"07 18, 2013",1374105600,"[1, 1]"
2047,316228532,A1YSU2VSUJZAR5,zolteg59,The Casual Vacancy,"While the story was intriguing, and I am a hug...",1,"11 11, 2012",1352592000,"[0, 1]"
2048,316228532,A2ZF888HX9YR8E,Zoobeefoo,A better read for Brits perhaps?,What an odd book! The adolescent characters a...,3,"12 30, 2012",1356825600,"[2, 3]"


In [9]:
# Drop the columns the analysis does not use.  Rebinding `df` (instead of
# dropping in place) and ignoring columns that are already gone means this
# cell can be re-run without raising a KeyError.
df = df.drop(['asin', 'reviewer_id', 'reviewer_name', 'summary', 'review_time', 'unix_review_time', 'helpful'],
    axis = 1,
    errors = 'ignore')

In [10]:
df

Unnamed: 0,review_text,overall
0,"A departure for her, but a good read!",5
1,I had a hard time remembering who each charact...,2
2,This is the only review I have ever written. ...,1
3,"I was expecting more from J.K. Rowling, it's a...",2
4,As big a fan as I am of J K Rowling's Harry Po...,1
...,...,...
2045,Spectacular prose in a rambling story that see...,5
2046,Although I am a great fan of the Harry Potter ...,3
2047,"While the story was intriguing, and I am a hug...",1
2048,What an odd book! The adolescent characters a...,3


In [11]:
df.overall.value_counts(dropna = False)

4    464
5    457
3    397
2    373
1    359
Name: overall, dtype: int64

In [12]:
# Count the missing (null) values in each column.
# TODO: isnull() does not flag empty strings -- check for those separately.
df.isnull().sum()

review_text    0
overall        0
dtype: int64

In [13]:
# Define the feature matrix X (the review text column) and the response c (the `overall` rating)
X = df.review_text
c = df.overall

In [14]:
#todo
c

0       5
1       2
2       1
3       2
4       1
       ..
2045    5
2046    3
2047    1
2048    3
2049    5
Name: overall, dtype: int64

In [15]:
#to do
X

0                   A departure for her, but a good read!
1       I had a hard time remembering who each charact...
2       This is the only review I have ever written.  ...
3       I was expecting more from J.K. Rowling, it's a...
4       As big a fan as I am of J K Rowling's Harry Po...
                              ...                        
2045    Spectacular prose in a rambling story that see...
2046    Although I am a great fan of the Harry Potter ...
2047    While the story was intriguing, and I am a hug...
2048    What an odd book!  The adolescent characters a...
2049    Chatty and immediately comfortable to read. It...
Name: review_text, dtype: object

## Train/test sets

In [16]:
# Use 60% of the reviews for training (train_size = .6); the fixed random_state keeps the split reproducible
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, train_size = .6, random_state = 0)

## TF-IDF and `TfidfVectorizer`

In [17]:
# Build a TF-IDF vectorizer that filters out English stop words.
# It is used like a scaler: fit it first, then call transform on the data.
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

In [18]:
vectorizer.fit(train_X)
# Fitting learns the vocabulary from the training text;
# each term in it becomes one column of the transformed matrix.

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

## Bag-of-words

In [19]:
# Inspect the vocabulary the vectorizer learned.
# 1st round: it is noisy -- bare numbers, tokens mixing digits and letters,
#            and several words that share the same root.
# 2nd round (TODO): clean the text up better before vectorizing.
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'10',
 u'100',
 u'1019',
 u'103',
 u'11',
 u'112',
 u'11th',
 u'12',
 u'12tried',
 u'13',
 u'130',
 u'14',
 u'140',
 u'149',
 u'15',
 u'150',
 u'16',
 u'16s',
 u'16terri',
 u'17',
 u'170',
 u'1700',
 u'18',
 u'180',
 u'18th',
 u'19',
 u'1950',
 u'1956',
 u'1960s',
 u'1984',
 u'1990',
 u'1990s',
 u'19th',
 u'1am',
 u'1antagonist',
 u'1c',
 u'1minor',
 u'1setting',
 u'1st',
 u'1storyline',
 u'1theme',
 u'20',
 u'200',
 u'2012',
 u'2013',
 u'2014',
 u'20th',
 u'21',
 u'21st',
 u'22',
 u'2293',
 u'23',
 u'23yo',
 u'24',
 u'25',
 u'250',
 u'27',
 u'28',
 u'289',
 u'2c',
 u'2overall',
 u'2writing',
 u'30',
 u'300',
 u'30am',
 u'323',
 u'34',
 u'342',
 u'35',
 u'350',
 u'380',
 u'383',
 u'3rd',
 u'40',
 u'400',
 u'42',
 u'45',
 u'450',
 u'46',
 u'47',
 u'475',
 u'4for',
 u'4th',
 u'50',
 u'500',
 u'5000',
 u'502',
 u'503',
 u'512',
 u'53',
 u'5mrs',
 u'5th',
 u'60',
 u'600',
 u'62',
 u'64',
 u'70',
 u'715',
 u'73',
 u'739',
 u'74',
 u'75',
 u'77',
 u'80',
 u'800',
 u'8045',

## Transform the feature matrix `X`

In [20]:
# Convert both splits with the vectorizer that was fit on the training text.
# NOTE: this rebinds train_X / test_X, so the raw review text is no longer
# reachable through these names after this cell runs.
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

In [21]:
train_X
# A sparse matrix is not an empty matrix: it stores only the non-zero entries.

<1230x9027 sparse matrix of type '<type 'numpy.float64'>'
	with 57398 stored elements in Compressed Sparse Row format>

In [22]:
train_X.todense()

matrix([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.22501177, ...,  0.        ,
          0.        ,  0.        ]])

In [23]:
# Change the model here!
# 1st: logistic regression
model = linear_model.LogisticRegression()
# Next: swap in a random forest, re-run the cells below, and compare its accuracy with logistic regression

In [24]:
# Mean score across 5 cross-validation folds of the training set
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.39904935449356432

In [27]:
# Cross-validated predictions for the training set (5 folds)
train_c_hat = cross_validation.cross_val_predict(model, train_X, train_c, cv = 5)

In [28]:
# Accuracy of the cross-validated predictions against the true training labels
metrics.accuracy_score(train_c, train_c_hat)

0.39918699186991868

In [29]:
# Confusion table: predicted rating (rows) versus true rating (columns)
pd.crosstab(train_c_hat, train_c, rownames = ['Predicted'], colnames = ['True'])

True,1,2,3,4,5
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,102,49,20,6,10
2,45,54,35,7,8
3,28,48,56,27,13
4,17,36,84,128,94
5,22,37,49,104,151


In [30]:
# Fit the model on the whole training set
model.fit(train_X, train_c)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# Score on the same data the model was fit on
model.score(train_X, train_c)

0.91951219512195126

In [32]:
# Score on the held-out test set
model.score(test_X, test_c)

0.4195121951219512

In [None]:
# Random forest

In [33]:
# Random forest with ten trees.  The seed is fixed so that re-running the
# notebook produces the same forest and therefore the same scores.
model = ensemble.RandomForestClassifier(n_estimators = 10, random_state = 0)

In [34]:
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.28437694197537028

In [35]:
# Random forest with one hundred trees.  The seed is fixed so that re-running
# the notebook produces the same forest and therefore the same scores.
model = ensemble.RandomForestClassifier(n_estimators = 100, random_state = 0)

In [36]:
cross_validation.cross_val_score(model, train_X, train_c, cv=5).mean()

0.37220179074224102

In [37]:
# Collapse the five ratings into three classes:
#   1 and 2 -> -1,  3 -> 0,  4 and 5 -> 1
X = df.review_text
c = df.overall.map({1: -1, 2:-1, 3:0, 4: 1, 5: 1})

In [38]:
c

0       1
1      -1
2      -1
3      -1
4      -1
       ..
2045    1
2046    0
2047   -1
2048    0
2049    1
Name: overall, dtype: int64

## Machine Learning Modeling

> # TODO...

In [None]:
# Random forests
# To check the feature importances, see the answer key.
# There are a lot of redundant words.

In [39]:
class CustomTokenizer(object):
    """Callable that tokenizes a document and then stems every token.

    Instances are passed as the ``tokenizer`` argument of ``TfidfVectorizer``.
    """

    def __init__(self):
        # One Porter stemmer per tokenizer instance, reused for every document.
        self.stemmer = stem.porter.PorterStemmer()

    def __call__(self, document):
        """Return the list of stemmed tokens for ``document``.

        The document is first cleaned and split by ``tokenize_text``; each
        resulting token is then reduced to its Porter stem.
        """
        tokens = tokenize_text(document)
        # Use this instance's own stemmer; it was previously created in
        # __init__ but never used.
        return [self.stemmer.stem(token) for token in tokens]

In [40]:
# Rebuild the vectorizer with the custom tokenizer (tokenize + stem),
# n-grams of 1 to 3 tokens, and a minimum document frequency of 3
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1, 3), min_df = 3)

In [None]:
# train_X was rebound to the TF-IDF matrix earlier, and X / c have since been
# redefined with the remapped ratings, so split the raw review text again
# before fitting the new vectorizer on it.
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, train_size = .6, random_state = 0)

vectorizer.fit(train_X)