# Text Classification
## Using stemming and lemmatization in our vectorizers

In [1]:
#Imports
from time import time
import pandas as pd
pd.set_option("max.colwidth", 500)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA, TruncatedSVD, NMF
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
from nltk.tokenize import TreebankWordTokenizer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.util import ngrams
from textblob import TextBlob
from time import time
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report, roc_auc_score, roc_curve




In [2]:
#Load in yelp review data
path = "yelp.csv"
yelp = pd.read_csv(path, encoding='unicode-escape')
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakfast and it was excellent. The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure. Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning. It looked like the place fills up pretty quickly so the earlier you get here the better.\r\n\r\nDo yourself a favor and get their Bloody Mary. It was phenomenal and simply the best I've ever had. I'm pretty sure they only use ing...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,"I have no idea why some people give bad reviews about this place. It goes to show you, you can please everyone. They are probably griping about something that their own fault...there are many people like that.\r\n\r\nIn any case, my friend and I arrived at about 5:50 PM this past Sunday. It was pretty crowded, more than I thought for a Sunday evening and thought we would have to wait forever to get a seat but they said we'll be seated when the girl comes back from seating someone else. We we...",review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I also dig their candy selection :),review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!! It's very convenient and surrounded by a lot of paths, a desert xeriscape, baseball fields, ballparks, and a lake with ducks.\r\n\r\nThe Scottsdale Park and Rec Dept. does a wonderful job of keeping the park clean and shaded. You can find trash cans and poopy-pick up mitts located all over the park and paths.\r\n\r\nThe fenced in area is huge to let the dogs run, play, and sniff!",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,"General Manager Scott Petello is a good egg!!! Not to go into detail, but let me assure you if you have any issues (albeit rare) speak with Scott and treat the guy with some respect as you state your case and I'd be surprised if you don't walk out totally satisfied as I just did. Like I always say..... ""Mistakes are inevitable, it's how we recover from them that is important""!!!\r\n\r\nThanks to Scott and his awesome staff. You've got a customer for life!! .......... :^)",review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [3]:
# Create a new DataFrame called yelp_best_worst that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

In [4]:
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars

#Null accuracy
print y.value_counts(normalize=True)

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

5    0.816691
1    0.183309
Name: stars, dtype: float64


In [5]:
#Look at the analyzer section of the CountVectorizer doc strings
CountVectorizer()

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
#The analyzer argument allows us to upload our function to transform/tokenize the words.

In [7]:
# define a function that accepts text and returns a list of stems
def word_tokenize_stem(text):
    #Transform and tokenize words using TextBlob
    words = TextBlob(text).words
    #Initialize stemmer
    stemmer = SnowballStemmer("english")
    #Return a list of the stems
    return [stemmer.stem(word) for word in words]

In [8]:
# define a function that accepts text and returns a list of lemons (noun version)
def word_tokenize_lemma(text):
    #Transform and tokenize words using TextBlob
    words = TextBlob(text).words
    #Return a list of lemons
    return [word.lemmatize() for word in words]

In [9]:
# define a function that accepts text and returns a list of lemons (verb version)
def word_tokenize_lemma_verb(text):
    words = TextBlob(text).words
    #Return a list of lemons    
    return [word.lemmatize(pos="v") for word in words]

In [10]:
def text_model_evaluator(vect):
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    print "Features: ", X_train_dtm.shape[1]
    print "Training Score: ", nb.score(X_train_dtm, y_train)
    print "Testing Score: ", nb.score(X_test_dtm, y_test)

### Definitions:
** Word tokenization**: 
Separate text into units such as sentences or words, gives structure to previously unstructured text. It is relatively easy with English language text, not easy with some languages.
** rules of tokenization**:
The text is first tokenized into sentences using the PunktSentenceTokenizer. Then each sentence is tokenized into words using 4 different word tokenizers:
- TreebankWordTokenizer
- WordPunctTokenizer
- PunctWordTokenizer
- WhitespaceTokenizer

**Stemming and lemmatization**
**Stemming**: Reduce a word to its base/stem/root form. Often makes sense to treat related words the same way
Stemmed words are usually not shown to users (used for analysis/indexing)
Some search engines treat words with the same stem as synonyms

**Lemmatization**: Derive the canonical form ('lemma') of a word. Can be better than stemming
Notes: Uses a dictionary-based approach (slower than stemming)

**Term Frequency-Inverse Document Frequency (TF-IDF)**: Computes "relative frequency" that a word appears in a document compared to its frequency across all documents.

In [11]:
#Initialize Count Vectorizer with stop_words set to english and analyzer to word_tokenize_stem
vect = CountVectorizer(stop_words="english", analyzer=word_tokenize_stem)
#Pass vectorizer into function
text_model_evaluator(vect)

Features:  13273
Training Score:  0.970626631854
Testing Score:  0.924657534247


In [12]:
#Intialize Count Vectorizer with stop_words set to english and analyzer to word_tokenize_lemma
vect = CountVectorizer(stop_words="english", analyzer=word_tokenize_lemma)
#Pass vectorizer into function
text_model_evaluator(vect)

Features:  20599
Training Score:  0.974216710183
Testing Score:  0.904109589041


In [13]:
#Intialize Count Vectorizer with stop_words set to english and analyzer to word_tokenize_lemma_verb
vect = CountVectorizer(stop_words="english", analyzer=word_tokenize_lemma_verb)
#Pass vectorizer into function
text_model_evaluator(vect)

Features:  19431
Training Score:  0.974216710183
Testing Score:  0.906066536204


### Tfidf

In [14]:
#Intialize Tfidf Vectorizer with stop_words set to english and analyzer to word_tokenize_stem
vect = TfidfVectorizer(stop_words="english", analyzer=word_tokenize_stem)
#Pass vectorizer into function
text_model_evaluator(vect)

Features:  13273
Training Score:  0.816906005222
Testing Score:  0.819960861057


In [15]:
#Intialize Tfidf Vectorizer with stop_words set to english and analyzer to word_tokenize_lemma
vect = TfidfVectorizer(stop_words="english", analyzer=word_tokenize_lemma)
#Pass vectorizer into function
text_model_evaluator(vect)

Features:  20599
Training Score:  0.817232375979
Testing Score:  0.819960861057


### Count - randomized search

In [16]:
#Make pipeline for countvectorizer and naive bayes model
pipe_cv = make_pipeline(CountVectorizer(), MultinomialNB())

#Initialize parameters for count vectorizer
param_grid_cv = {}
param_grid_cv["countvectorizer__max_features"] = [1000, 2500 ,5000, 7500,10000]
param_grid_cv["countvectorizer__ngram_range"] = [(1,1), (1,2), (2,2)]
param_grid_cv["countvectorizer__lowercase"] = [True, False]
param_grid_cv["countvectorizer__binary"] = [True, False]
param_grid_cv["countvectorizer__analyzer"] = ["word", word_tokenize_stem,
                                              word_tokenize_lemma, word_tokenize_lemma_verb]

In [18]:
#Randomized grid search with n_iter = 5
randsearch_cv = RandomizedSearchCV(pipe_cv, n_iter = 5,
                        param_distributions = param_grid_cv, cv = 5, scoring = "accuracy")
#Time the code 
t = time()
#Fit grid on data
randsearch_cv.fit(X, y)
#Print time difference
print time() - t

460.15984416


In [19]:
#Best params
print randsearch_cv.best_params_
#Best score
print randsearch_cv.best_score_

{'countvectorizer__lowercase': True, 'countvectorizer__analyzer': <function word_tokenize_lemma at 0x118d0a500>, 'countvectorizer__ngram_range': (1, 1), 'countvectorizer__binary': True, 'countvectorizer__max_features': 5000}
0.928781204112


### TF-IDF: randomized search

In [20]:
#Make pipeline for tfidfvectorizer and naive bayes model
pipe_tf = make_pipeline(TfidfVectorizer(), MultinomialNB())


#Intialize parameters for tfidf vectorizer
param_grid_tf = {}
param_grid_tf["tfidfvectorizer__max_features"] = [1000, 2500 ,5000, 7500,10000]
param_grid_tf["tfidfvectorizer__ngram_range"] = [(1,1), (1,2), (2,2)]
param_grid_tf["tfidfvectorizer__lowercase"] = [True, False]
param_grid_tf["tfidfvectorizer__binary"] = [True, False]
param_grid_tf["tfidfvectorizer__analyzer"] = ["word", word_tokenize_stem,
                                              word_tokenize_lemma, word_tokenize_lemma_verb]

In [21]:
#Randomized grid search with n_iter = 10
randsearch_tf = RandomizedSearchCV(pipe_tf, n_iter = 10,
                        param_distributions = param_grid_tf, cv = 5, scoring = "accuracy")
#Time the code 
t = time()
#Fit grid on data
randsearch_tf.fit(X, y)
#Print time difference
print time() - t

2338.76286483


In [22]:
#Best params
print randsearch_tf.best_params_
#Best score
print randsearch_tf.best_score_

{'tfidfvectorizer__analyzer': <function word_tokenize_stem at 0x118d0a398>, 'tfidfvectorizer__ngram_range': (2, 2), 'tfidfvectorizer__lowercase': False, 'tfidfvectorizer__binary': True, 'tfidfvectorizer__max_features': 1000}
0.868575624082


### Best Accuracy = randsearch_cv

In [25]:
y_pred = randsearch_cv.predict(X_test)
print classification_report(y_test, y_pred)
print confusion_matrix(y_test, y_pred)


             precision    recall  f1-score   support

          1       0.83      0.91      0.87       184
          5       0.98      0.96      0.97       838

avg / total       0.95      0.95      0.95      1022

[[167  17]
 [ 35 803]]


In [36]:
cm = confusion_matrix(y_test, y_pred)
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
tot_a_1 = TN + FP
tot_a_2 = FN + TP
tot_p_1 = TN + FN
tot_p_2 = FP + TP
tot_g = tot_a_1+tot_a_2
mx =np.array(["TN",TN,"FP",FP,tot_a_1,"FN",FN,"TP", TP, tot_a_2,"", tot_p_1,"", tot_p_2, tot_g]).reshape(3,5)
cols = [" ","pred (1)", " ", "pred (5)",  "total"]
index = ["1", "5", "total"]
cm_df = pd.DataFrame(mx, columns=cols, index=index )
print cm_df
print "\t"
tpr = round(TP / float(TP + FN),4)
tnr = round(TN / float(TN + FP),4)
fpr = round(FP / float(TN + FP),4)
fnr = round(FN / float(FN + TP),4)
print "Sensitivity or Recall or True Positive Rate - TPR:", tpr*100, "%"
print "Specificity or True Negative Rate - TNR:", tnr*100,"%"
print "False Positive Rate - FPR:", fpr*100,"%"
print "False Negative Rate - FNR:", fnr*100,"%"

          pred (1)     pred (5) total
1      TN       76  FP      108   184
5      FN        3  TP      835   838
total           79          943  1022
	
Sensitivity or Recall or True Positive Rate - TPR: 99.64 %
Specificity or True Negative Rate - TNR: 41.3 %
False Positive Rate - FPR: 58.7 %
False Negative Rate - FNR: 0.36 %
