# DS-SF-30 | Codealong 18: Natural Language Processing

## >>> One-time setup

In [1]:
'''
import nltk
nltk.download()
'''

pass

## <<< One-time setup

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, cross_validation, metrics, decomposition

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')



## Tokenization

In [3]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token] #gets rid of empty strings

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [4]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [5]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [6]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', u'wait', u'anoth', u'third']

## Book reviews

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)

Our dataset is a subset of http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Books_5.json.gz.

In [28]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [29]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2016-12-11,R3SH1N77GNTD9K,Stefi,Great read,Very moving story. Great effortless writing wh...,5.0
1,2016-12-11,RVOEQK3JK4LY2,Amazon Customer,Great book! Does not disappoint,Great book! Does not disappoint. Wonderful c...,5.0
2,2016-12-11,RCU7OTNRDJBOS,Priscilla Seaton,Disturbing in its accurate reflection of human...,A very absorbing book. Not at all what I expec...,4.0
3,2016-12-10,R257OLQTPXYQ82,J,Superb,"Lives intertwined, humor,sadness, superior sto...",5.0
4,2016-12-10,R1LNKO30KAXCUM,Roberta L. Sherrill,One Star,Disappointing..... finally quit reading it. S...,1.0
...,...,...,...,...,...,...
5796,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5798,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5799,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [30]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [31]:
df

Unnamed: 0,body,star_rating
0,Very moving story. Great effortless writing wh...,5.0
1,Great book! Does not disappoint. Wonderful c...,5.0
2,A very absorbing book. Not at all what I expec...,4.0
3,"Lives intertwined, humor,sadness, superior sto...",5.0
4,Disappointing..... finally quit reading it. S...,1.0
...,...,...
5796,Premise sounds dull as dirt. For $17 for a co...,1.0
5797,The depth of character development and storyli...,5.0
5798,The book was great and I will love to re-read ...,5.0
5799,I started to order the kindle edition and than...,5.0


### `NaN`

In [33]:
df.isnull()
df.dropna(inplace = True)
df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


body           0
star_rating    0
dtype: int64

In [36]:
df.star_rating.value_counts()

5.0    1497
1.0    1184
4.0    1178
2.0     972
3.0     967
Name: star_rating, dtype: int64

In [35]:
def mapStars(_df):
    
    _df[["star_rating" < 3]]
    
    return _df

SyntaxError: invalid syntax (<ipython-input-35-6267948dd5eb>, line 3)

In [38]:
df['polarity'] = df.star_rating.map({1:-1, 2:-1, 3:0, 4:1, 5:1}) #dictionary to convert from start value to polarity value:
# {<star_val>:<polarity_val>}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [39]:
df.polarity.value_counts()

 1    2675
-1    2156
 0     967
Name: polarity, dtype: int64

### Positive, neutral, and negatives reviews

several ways to address the class imbalance problem:
- upsampling:  can just duplicate values for "0" and "-1" to get 2675 samples in each to match size of "1" 
- downsampling:  can randomly subsample from "-1" and "1" to get 967 values to match size of "0"

In [55]:
pos = df[df.polarity == -1]

In [56]:
ns = df.polarity.value_counts()

In [57]:
ns.min()

967

In [60]:
#downsampling
for polarity in [-1, 0, 1]:
    n = ns[polarity] - ns.min()
    index = df[df.polarity == polarity].sample(n = n, random_state = 0).index
    df.drop(index, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [61]:
df.polarity.value_counts()

 1    967
-1    967
 0    967
Name: polarity, dtype: int64

### Feature matrix and response vector

In [62]:
X = df.body

In [68]:
c = df.polarity

### Train/test sets

In [69]:
train_X, test_X, train_c, test_c = cross_validation.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)

### TF-IDF and `TfidfVectorizer`

In [None]:
# first pass, without stemming
# vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')

In [80]:
# Let's try with stemming:
'''
class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()
        
    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens
    
vectorizer = feature_extraction.text.TfidfVectorizer(token = CustomTokenizer())
'''

class CustomTokenizer(object):
    def __init__(self):
        self.stemmer = stem.porter.PorterStemmer()

    def __call__(self, document):
        tokens = tokenize_text(document)
        tokens = Stemmer.stem_tokens(tokens)
        return tokens

# not using n-grams
# vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer())

# using n-grams
vectorizer = feature_extraction.text.TfidfVectorizer(tokenizer = CustomTokenizer(), ngram_range = (1, 3), min_df = 3)
# last argument is instruction to discard any n0grams that occurs fewer than three times--> min_df = 3


In [81]:
vectorizer.fit(train_X)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.CustomTokenizer object at 0x117315650>,
        use_idf=True, vocabulary=None)

### Bag-of-words

In [82]:
vectorizer.get_feature_names()

[u'1',
 u'1 star',
 u'10',
 u'10 book',
 u'100',
 u'100 page',
 u'12',
 u'13',
 u'14',
 u'15',
 u'16',
 u'1799',
 u'18',
 u'19th',
 u'19th centuri',
 u'2',
 u'20',
 u'200',
 u'200 page',
 u'25',
 u'3',
 u'3 star',
 u'30',
 u'300',
 u'35',
 u'35 star',
 u'4',
 u'40',
 u'400',
 u'400 page',
 u'5',
 u'50',
 u'50 page',
 u'500',
 u'500 page',
 u'503',
 u'503 page',
 u'6',
 u'60',
 u'7',
 u'8',
 u'80',
 u'abandon',
 u'abil',
 u'abl',
 u'abl get',
 u'abl read',
 u'abrupt',
 u'abruptli',
 u'absenc',
 u'absolut',
 u'absolut love',
 u'absolut noth',
 u'absorb',
 u'abus',
 u'abus drug',
 u'accent',
 u'accept',
 u'accomplish',
 u'account',
 u'accur',
 u'achiev',
 u'acquaint',
 u'across',
 u'act',
 u'action',
 u'action one',
 u'actionpack',
 u'actor',
 u'actual',
 u'ad',
 u'add',
 u'addict',
 u'addit',
 u'adjust',
 u'admir',
 u'admit',
 u'admit nt',
 u'admit read',
 u'adolesc',
 u'adolesc charact',
 u'ador',
 u'ador harri',
 u'ador harri potter',
 u'adult',
 u'adult adult',
 u'adult audienc',
 u'a

### Transformed feature matrix `X`

In [83]:
# TODO
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

### Machine Learning Modeling

> # TODO...

have a classification problem, so can use a logistic regression or a random forest

In [84]:
model = linear_model.LogisticRegression()


In [85]:
cross_validation.cross_val_score(model, train_X, train_c, cv = 5).mean()


0.59252873563218389

In [88]:
train_c_hat = cross_validation.cross_val_predict(model, train_X, train_c, cv = 5)


In [89]:
metrics.accuracy_score(train_c, train_c_hat)

0.59252873563218389

In [90]:
pd.crosstab(train_c_hat, train_c)
# truth is columns, predictions are rows

polarity,-1,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,385,157,54
0,111,248,128
1,84,175,398


In [91]:
model.fit(train_X, train_c)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [92]:
model.score(test_X, test_c)

0.60551248923341949

In [93]:
model.score(train_X, train_c)

0.89195402298850579

the lower score of the test set compared to the train set indicates that we overfit.


In [95]:
model = ensemble.RandomForestClassifier(n_estimators = 100)

In [96]:
cross_validation.cross_val_score(model, train_X, train_c, cv = 5).mean()

0.55172413793103448

In [97]:
metrics.accuracy_score(train_c, train_c_hat)

0.59252873563218389

In [98]:
pd.crosstab(train_c_hat, train_c)

polarity,-1,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,385,157,54
0,111,248,128
1,84,175,398


In [99]:
model.fit(train_X, train_c)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [100]:
model.score(test_X, test_c)

0.55211024978466838