# DS-SF-38 | 18 | Natural Language Processing | Codelong | Starter Code

## >>> One-time setup

In [1]:

# import nltk
# nltk.download()


pass

## <<< One-time setup

## Part A | Tokenization and Stemming

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import string
import unicodedata
from nltk import tokenize, corpus, stem

from sklearn import feature_extraction, linear_model, ensemble, model_selection, metrics

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Tokenization

In [3]:
def tokenize_text(document):
    document = document.encode('utf-8')

    # Convert text to lowercase
    document = document.lower()

    # Tokenize
    tokens = tokenize.word_tokenize(document)

    # Remove punctuation in tokens and then remove empty tokens
    tokens = [token.translate(None, string.punctuation) for token in tokens]
    tokens = [token for token in tokens if token]

    # Remove stop words
    tokens = [token for token in tokens if not token in corpus.stopwords.words('english')]

    return tokens

In [4]:
tokens = tokenize_text("This is a sentence...  Wait, here's another.  And a third!")

tokens

['sentence', 'wait', 'another', 'third']

## Stemming

In [5]:
class Stemmer:
    stemmer = stem.porter.PorterStemmer()

    @staticmethod
    def stem_tokens(tokens):
        return [Stemmer.stemmer.stem(token) for token in tokens]

In [6]:
tokens = Stemmer.stem_tokens(tokens)

tokens

[u'sentenc', 'wait', u'anoth', 'third']

## Part B | Text Classification

Below, we will be analyzing a partial list of the reviews for J.K. Rowling's The Casual Vacancy.  (https://www.amazon.com/dp/0316228532)  We scrapped this dataset during class 3.

In [7]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-18-reviews.csv'))

In [8]:
df

Unnamed: 0,date,id,author,title,body,star_rating
0,2017-10-23,R3OQUWVA2PRCEA,Jo B.,Five Stars,Good read,5.0
1,2017-10-12,R1FJA1XIBRBLES,Cheryl Dulin,Four Stars,A great kicker in the beginning AND the end yo...,4.0
2,2017-10-12,R3CFMLYBUJ0295,Eric P Albrecht,A clear triumph of a novel!,A clear triumph of a novel. Have always loved ...,4.0
3,2017-10-09,R1FMT3KY928NJB,tlckirk,Disappointing,If it were any other author I would have stopp...,2.0
4,2017-10-07,R138WXOO423PP4,Skelsogeo,Five Stars,A great read,5.0
...,...,...,...,...,...,...
5912,2012-09-27,RT2TE0W92SL67,Tricia K.,Seriously? $17 bucks for a computer file??? ...,Premise sounds dull as dirt. For $17 for a co...,1.0
5913,2012-09-27,R14ZGYPSP9H0Y7,Pretzel,A must read,The depth of character development and storyli...,5.0
5914,2012-09-27,R1913ISIDAGQ1A,Prodigy,I love it,The book was great and I will love to re-read ...,5.0
5915,2012-09-27,R2JY771IW7RI3R,David Katz,Kendle price too expensive,I started to order the kindle edition and than...,5.0


In [9]:
df.drop(['date', 'id', 'author', 'title'],
    axis = 1,
    inplace = True)

In [10]:
df

Unnamed: 0,body,star_rating
0,Good read,5.0
1,A great kicker in the beginning AND the end yo...,4.0
2,A clear triumph of a novel. Have always loved ...,4.0
3,If it were any other author I would have stopp...,2.0
4,A great read,5.0
...,...,...
5912,Premise sounds dull as dirt. For $17 for a co...,1.0
5913,The depth of character development and storyli...,5.0
5914,The book was great and I will love to re-read ...,5.0
5915,I started to order the kindle edition and than...,5.0


### `NaN`

In [11]:
# TODO
df = df.dropna()

### Positive, neutral, and negatives reviews

In [12]:
# TODO
#  1, 2 to -1 , 3 to 0, 4,5 to 1
df.star_rating.value_counts()

5.0    1548
1.0    1206
4.0    1203
2.0     983
3.0     974
Name: star_rating, dtype: int64

In [13]:
# The reason we're doing this is we want the negatives to be negative, and postivies to be postive
df.star_rating = df.star_rating.map({1:-1, 2:-1, 3:0, 4:1, 5:1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [14]:
df.star_rating.value_counts()

 1    2751
-1    2189
 0     974
Name: star_rating, dtype: int64

In [15]:
df.polarity = df.star_rating

### Feature matrix and response vector

In [16]:
# TODO
# what is going to be my matrix and response vector
X = df.body
c = df.polarity

### Train/test sets

In [17]:
# split, added the stratify
# stratify is when we have unbalanced classes. eg 90% A, 10% B
# Keep the ratio
# this order is part of the API
train_X, test_X, train_c, test_c = model_selection.train_test_split(X, c, stratify = c, train_size = .6, random_state = 0)

### TF-IDF and `TfidfVectorizer`

In [18]:
# TODO
# What are we doing here? 
# we are calculating the tfidf matrix - this is awesome!
# what does the .fit() do? gives us the maxes and min
# here it figures out what all the words are
vectorizer = feature_extraction.text.TfidfVectorizer(stop_words = 'english')
vectorizer.fit(train_X)




TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### Bag-of-words

In [19]:
vectorizer.get_feature_names()

[u'00',
 u'000',
 u'02',
 u'04',
 u'08',
 u'10',
 u'100',
 u'100pages',
 u'11',
 u'112',
 u'12',
 u'120',
 u'124',
 u'125',
 u'13',
 u'130',
 u'132',
 u'13hrs',
 u'14',
 u'142',
 u'149',
 u'15',
 u'150',
 u'16',
 u'17',
 u'170',
 u'1700',
 u'175',
 u'18',
 u'180',
 u'1860',
 u'19',
 u'194',
 u'1950',
 u'1960s',
 u'1984',
 u'19th',
 u'1st',
 u'20',
 u'200',
 u'2000',
 u'2004',
 u'2005',
 u'2007',
 u'2012',
 u'2013',
 u'2014',
 u'2015',
 u'2016',
 u'21',
 u'21st',
 u'22',
 u'23',
 u'24',
 u'240',
 u'25',
 u'2500',
 u'26',
 u'27',
 u'29',
 u'2nd',
 u'2star',
 u'30',
 u'300',
 u'31',
 u'323',
 u'35',
 u'350',
 u'38',
 u'380',
 u'3rd',
 u'40',
 u'400',
 u'44',
 u'450',
 u'46',
 u'50',
 u'500',
 u'500th',
 u'502',
 u'503',
 u'505',
 u'50pages',
 u'50th',
 u'512',
 u'53',
 u'56',
 u'5days',
 u'5th',
 u'60',
 u'600',
 u'60s',
 u'61',
 u'620m',
 u'639',
 u'64',
 u'65',
 u'6th',
 u'70',
 u'70s',
 u'71',
 u'73',
 u'75',
 u'79',
 u'7th',
 u'80',
 u'800',
 u'81',
 u'84',
 u'85',
 u'89',
 u'8th',
 u

### Transformed feature matrix `X`

In [20]:
# TODO
# here it actually creates the tfidf matrix, thats what transform does
# what we do on the train we do on the test
train_X = vectorizer.transform(train_X)
test_X = vectorizer.transform(test_X)

### Machine Learning Modeling

> # TODO...

In [21]:
# Do logistic regresison model and/or random forest

model = linear_model.LogisticRegression().fit(train_X, train_c)

In [22]:
model.score(train_X, train_c)

0.81031567080045097

In [23]:
# see the train_X cause theres hella 0s
train_X.todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [24]:
c_hat = model.predict(train_X)

pd.crosstab(c_hat,
           train_c,
            rownames = ['Hypothesized Class'],
            colnames = ['True Class']
           )

True Class,-1,0,1
Hypothesized Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,1166,192,46
0,5,106,2
1,142,286,1603
