In [217]:
#from model import NLPModel
import pandas as pd
from sklearn.model_selection import train_test_split

## Create the model object
The NLP model object uses a Naive Bayes classifier and a TFIDF vectorizer:
```
self.clf = MultinomialNB()
self.vectorizer = TfidfVectorizer()
```

In [3]:
#model = NLPModel()

## Get the Data

In [4]:
with open ('data/train.tsv') as f:
    data = pd.read_csv(f, sep='\t')

In [5]:
data.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


## Use only the 1 star and 5 star reviews
For this example, we want to only predict positive or negative sentiment using the extreme cases.

In [6]:
pos_neg = data[(data['Sentiment']==0) | (data['Sentiment']==4)]

## Relabel as 0 for negative and 1 for positive¶

In [38]:
pos_neg['Binary'] = pos_neg.apply(
    lambda x: 0 if x['Sentiment'] == 0 else 1, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Fit a vectorizer to the vocabulary in the dataset¶


In [8]:
pos_neg.loc[:, 'Phrase']

63        This quiet , introspective and entertaining in...
66        quiet , introspective and entertaining indepen...
74                                             entertaining
77                                         is worth seeking
101         would have a hard time sitting through this one
103               have a hard time sitting through this one
117       A positively thrilling combination of ethnogra...
118       A positively thrilling combination of ethnogra...
123                                               thrilling
157       Aggressive self-glorification and a manipulati...
159         self-glorification and a manipulative whitewash
166       A comedy-drama of nearly epic proportions root...
172                                             nearly epic
176       rooted in a sincere performance by the title c...
180                                in a sincere performance
182                                   a sincere performance
183                                     

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_dtm = vectorizer.fit_transform(pos_neg.loc[:, 'Phrase'])
print(vectorizer.get_feature_names())
print(train_dtm.shape)

(16278, 10589)


In [10]:
#convert sparse matrix to array
train_dtm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# examine the vocabulary and document term matrix together
# pd.DataFrame(matrix, columns=columns)
pd.DataFrame(train_dtm.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,000,10,100,101,102,103,104,105,11,110,...,zings,zipper,zippy,zips,zoe,zombie,zone,zoning,zwick,zzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#type(train_dtm)

scipy.sparse.csr.csr_matrix

In [13]:
# examine the sparse matrix contents
# left: coordinates of non-zero values
# right: values at that point
# CountVectorizer() will output a sparse matrix
print('sparse matrix')
print(train_dtm)

print('dense matrix')
print(train_dtm.toarray())

sparse matrix
  (0, 9425)	0.18628327398154515
  (0, 7328)	0.39031920842200696
  (0, 4964)	0.49255738499916024
  (0, 428)	0.10554148577317923
  (0, 3123)	0.2807804798439986
  (0, 4770)	0.42588688105411765
  (0, 5019)	0.14318195740131803
  (0, 10491)	0.31160817213354053
  (0, 8160)	0.42588688105411765
  (1, 7328)	0.47816644084397936
  (1, 4964)	0.6034148630518353
  (1, 428)	0.12929519102473463
  (1, 3123)	0.3439743671551771
  (1, 4770)	0.5217391553418326
  (2, 3123)	1.0
  (3, 5019)	0.26185867380416106
  (3, 10491)	0.5698853695142774
  (3, 8160)	0.778884266478083
  (4, 9425)	0.24267975279372403
  (4, 10495)	0.3679775312320273
  (4, 4343)	0.30822742182010626
  (4, 4311)	0.39289020043261236
  (4, 9482)	0.3382034921377653
  (4, 8441)	0.4834391296129678
  (4, 9454)	0.3679775312320273
  :	:
  (16275, 3282)	0.7739430340841462
  (16276, 6393)	0.20474903476182243
  (16276, 9384)	0.1864969971523257
  (16276, 4727)	0.13775303194326285
  (16276, 2683)	0.24483189093585064
  (16276, 1495)	0.2180888168

In [14]:
# example text for model testing
simple_test = ['The movie is really boring']

In [15]:
# 4. transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vectorizer.transform(simple_test)
simple_test_dtm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(simple_test_dtm.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,000,10,100,101,102,103,104,105,11,110,...,zings,zipper,zippy,zips,zoe,zombie,zone,zoning,zwick,zzzzzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
pos_neg.loc[:,['Phrase','Sentiment']]

Unnamed: 0,Phrase,Sentiment
63,"This quiet , introspective and entertaining in...",4
66,"quiet , introspective and entertaining indepen...",4
74,entertaining,4
77,is worth seeking,4
101,would have a hard time sitting through this one,0
103,have a hard time sitting through this one,0
117,A positively thrilling combination of ethnogra...,4
118,A positively thrilling combination of ethnogra...,4
123,thrilling,4
157,Aggressive self-glorification and a manipulati...,0


In [29]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = pos_neg.loc[:,'Phrase']
y = pos_neg.loc[:,'Sentiment']
print(X.shape)
print(y.shape)

(16278,)
(16278,)


In [158]:
# split X and y into training and testing sets
# by default, it splits 75% training and 25% test
# random_state=1 for reproducibility
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(12208,)
(4070,)
(12208,)
(4070,)


pandas.core.series.Series

Why are we splitting into training and testing sets before vectorizing?

Background of train/test split

    Train/test split is for model evaluation
        Model evaluation is to simulate the future
        Past data is exchangeable for future data
        We pretend some of our past data is coming into our future data
        By training, predicting and evaluating the data, we can check the performance of our model

Vectorize then split

    If we vectorize then we train/test split, our document-term matrix would contain every single feature (word) in the test and training sets
        What we want is to simulate the real world
        We would always see words we have not seen before so this method is not realistic and we cannot properly evaluate our models

Split then vectorize (correct way)

    We do the train/test split before the CountVectorizer to properly simulate the real world where our future data contains words we have not seen before

After you train your data and chose the best model, you would then train on all of your data before predicting actual future data to maximize learning.


In [207]:
# 1. import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer

# 2. instantiate the vectorizer
vect = CountVectorizer()

In [208]:
# learn training data vocabulary, then use it to create a document-term matrix

# equivalently: combine fit and transform into a single step
# this is faster and what most people would do
X_train_dtm = vect.fit_transform(X_train)



# examine the document-term matrix
X_train_dtm



<12208x9738 sparse matrix of type '<class 'numpy.int64'>'
	with 115611 stored elements in Compressed Sparse Row format>

In [209]:
# 4. transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

# you can see that the number of columns, 7456, is the same as what we have learned above in X_train_dtm

<4070x9738 sparse matrix of type '<class 'numpy.int64'>'
	with 36845 stored elements in Compressed Sparse Row format>


## 5. Building and evaluating a model

We will use multinomial Naive Bayes:

    The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.



In [210]:
# 1. import
from sklearn.naive_bayes import MultinomialNB

# 2. instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()

In [211]:
# 3. train the model 
# using X_train_dtm (timing it with an IPython "magic command")

%time nb.fit(X_train_dtm, y_train)

CPU times: user 5.44 ms, sys: 2.7 ms, total: 8.14 ms
Wall time: 6.14 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Naive bayes is fast as seen above

    This matters when we're using 10-fold cross-validation with a large dataset

In [212]:
# 4. make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [213]:
y_pred_class

array([4, 0, 4, ..., 0, 0, 4])

In [214]:
# BEGIN TEST 
#  THIS IS ONLY A test sample
X_test_review = ["It's an action comedy that isn't particularly effective as an action movie or a comedy.","this is a great movie"]
X_test_review
X_test_review_dtm = vect.transform(X_test_review)
X_test_review_dtm.toarray()



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [215]:
# examine the vocabulary and document-term matrix together
pd.DataFrame(X_test_review_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,000,10,100,101,102,103,104,105,11,112,...,zingers,zings,zipper,zippy,zips,zoe,zombie,zone,zoning,zzzzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [216]:
y_pred_class_test = nb.predict(X_test_review_dtm)
y_pred_class
## End of Test

array([0, 4])

In [42]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.9307125307125307

In [43]:
# examine class distribution
print(y_test.value_counts())
# there is a majority class of 0 here, hence the classes are skewed

# calculate null accuracy (for multi-class classification problems)
# .head(1) assesses the value 1208
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy:', null_accuracy)

# Manual calculation of null accuracy by always predicting the majority class
print('Manual null accuracy:',(1208 / (1208 + 185)))

4    2326
0    1744
Name: Sentiment, dtype: int64
Null accuracy: 4    0.571499
Name: Sentiment, dtype: float64
Manual null accuracy: 0.8671931083991385


In this case, we can see that our accuracy (0.9885) is higher than the null accuracy (0.8672)

In [44]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[1588,  156],
       [ 126, 2200]])

Confusion matrix
[TN FP
FN TP]


In [49]:
# print message text for the false positives (ham incorrectly classified as spam)

X_test[y_pred_class > y_test]

# alternative less elegant but easier to understand
# X_test[(y_pred_class==1) & (y_test==0)]

19497     's no palpable chemistry between Lopez and mal...
137116                   taken the protagonists a full hour
137467                              is n't very interesting
139708                           not smart and not engaging
145415                                        undisciplined
34328     of being overrun by corrupt and hedonistic wea...
95172     struts about with `` courage '' pinned to its ...
31116                                              cowardly
70301                             Dramatically lackluster .
76360     Sheridan 's take on the author 's schoolboy me...
128790    a strange film , one that was hard for me to w...
111028           arrest development in a dead-end existence
117782    the German film industry can not make a deligh...
132031    of his contradictory , self-hating , self-dest...
71080     paint-by-number American blockbusters like Pea...
115964    Franco is an excellent choice for the walled-o...
89459     A sour attempt at making a Far

In [50]:
# print message text for the false negatives (spam incorrectly classified as ham)

X_test[y_pred_class < y_test]
# alternative less elegant but easier to understand
# X_test[(y_pred_class=0) & (y_test=1)]

153050    ... a good film that must have baffled the fol...
72159     The concept behind Kung Pow : Enter the Fist i...
132440    , this is a seriously intended movie that is n...
146821                                     enjoy this movie
92509     have the worthy successor to A Better Tomorrow...
38787                appetizer that leaves you wanting more
142762    I ca n't remember the last time I saw an audie...
154597    Maybe not a classic , but a movie the kids wil...
97626                                 A superlative B movie
81441                              Like the best 60 Minutes
117       A positively thrilling combination of ethnogra...
139805    re-create the excitement of such '50s flicks a...
73446                        like something wholly original
99417     Even if you 've seen `` Stomp '' -LRB- the sta...
136110    is hard not to be especially grateful for free...
84447     does not rely on dumb gags , anatomical humor ...
45465              It 's never dull and 

In [51]:
# example false negative
X_test[128936]

"If you 're not deeply touched by this movie , check your pulse ."

In [52]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)

# Numpy Array with 2C
# left Column: probability class 0
# right C: probability class 1
# we only need the right column 
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

# Naive Bayes predicts very extreme probabilites, you should not take them at face value



array([0.99988671, 0.00478475, 0.99541528, ..., 0.00570371, 0.44775722,
       0.98756687])

In [53]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)



0.9774182160972492

```
AUC is useful as a single number summary of classifier performance
Higher value = better classifier
If you randomly chose one positive and one negative observation, AUC represents the likelihood that your classifier will assign a higher predicted probability to the positive observation
AUC is useful even when there is high class imbalance (unlike classification accuracy)
    Fraud case
        Null accuracy almost 99%
        AUC is useful here
```


## 6. Comparing models
```
We will compare multinomial Naive Bayes with logistic regression:

    Logistic regression, despite its name, is a linear model for classification rather than regression. Logistic regression is also known in the literature as logit regression, maximum-entropy classification (MaxEnt) or the log-linear classifier. In this model, the probabilities describing the possible outcomes of a single trial are modeled using a logistic function.
```


In [54]:
# 1. import
from sklearn.linear_model import LogisticRegression

# 2. instantiate a logistic regression model
logreg = LogisticRegression()


In [55]:
# 3. train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

CPU times: user 85.9 ms, sys: 35.5 ms, total: 121 ms
Wall time: 189 ms




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

```
This is a lot slower than Naive Bayes

    Naive Bayes cannot take negative numbers while Logistic Regression can
```


In [57]:
# 4. make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [58]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.98380428, 0.10830224, 0.94294755, ..., 0.01149964, 0.63027185,
       0.95903317])

This is a good model if you care about the probabilities.

In [59]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.9368550368550369

In [60]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.9792695457019571

## 7. Examining a model for further insight
```
We will examine the our trained Naive Bayes model to calculate the approximate "spamminess" of each token.
```

In [61]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

9738

In [62]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['000', '10', '100', '101', '102', '103', '104', '105', '11', '112', '12', '120', '127', '129', '12th', '13', '14', '146', '15', '16', '19', '1920', '1930s', '1937', '1950', '1950s', '1953', '1957', '1958', '1959', '1960', '1962', '1970s', '1975', '1984', '1989', '1991', '1993', '1997', '19th', '20', '2000', '2001', '2002', '20th', '21', '21st', '22', '24', '270']


In [63]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['yeah', 'year', 'yearning', 'years', 'yelling', 'yellow', 'yes', 'yesterday', 'yesteryear', 'yet', 'yiddish', 'yielded', 'yoda', 'york', 'yorkers', 'yosuke', 'you', 'young', 'younger', 'youngsters', 'your', 'yours', 'yourself', 'youth', 'youthful', 'yu', 'yuppie', 'zaidan', 'zany', 'zaza', 'zeal', 'zealand', 'zealanders', 'zeitgeist', 'zelda', 'zero', 'zeroes', 'zhang', 'zhuangzhuang', 'zinger', 'zingers', 'zings', 'zipper', 'zippy', 'zips', 'zoe', 'zombie', 'zone', 'zoning', 'zzzzzzzzz']


In [64]:
# Naive Bayes counts the number of times each token appears in each class
# trailing underscore - learned during fitting
nb.feature_count_

array([[ 1., 24.,  4., ...,  3.,  1.,  5.],
       [ 0.,  3.,  2., ...,  3.,  0.,  0.]])

In [65]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(2, 9738)

## Naive Bayes Summary
```
For each token, it calculates the conditional probability of that token given each class
    Does this for every token and both classes
To make a prediction
    Calculates conditional probability of a class given the token in that message
Bottomline to how it thinks
    Learns spamminess of each token
        If have a lot of ham then class = ham
        If have a lot of spam then class = spam
```

In [84]:
# number of times each token appears across all Negative messages
neg_token_count = nb.feature_count_[0, :]
neg_token_count

array([ 1., 24.,  4., ...,  3.,  1.,  5.])

In [85]:
# number of times each token appears across all Positive messages
pos_token_count = nb.feature_count_[1, :]
pos_token_count

array([0., 3., 2., ..., 3., 0., 0.])

In [87]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'pos':pos_token_count, 'neg':neg_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,pos,neg
token,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,1.0
10,3.0,24.0
100,2.0,4.0
101,0.0,5.0
102,0.0,2.0


In [88]:
# examine 5 random DataFrame rows
# random_state=6 is a seed for reproducibility
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,pos,neg
token,Unnamed: 1_level_1,Unnamed: 2_level_1
westerners,1.0,0.0
seconds,0.0,10.0
visitor,1.0,0.0
pollution,3.0,1.0
impressive,37.0,0.0


In [89]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([5328., 6880.])

```

    5238 Negative
    6880 Positive

Before we can calculate the "spamminess" of each token, we need to avoid dividing by zero and account for the class imbalance.
```

In [90]:
# add 1 to ham and spam counts to avoid dividing by 0
tokens['pos'] = tokens.pos + 1
tokens['neg'] = tokens.neg + 1
tokens.sample(5, random_state=6)



Unnamed: 0_level_0,pos,neg
token,Unnamed: 1_level_1,Unnamed: 2_level_1
westerners,2.0,1.0
seconds,1.0,11.0
visitor,2.0,1.0
pollution,4.0,2.0
impressive,38.0,1.0


In [91]:
# convert the ham and spam counts into frequencies
tokens['pos'] = tokens.pos / nb.class_count_[0]
tokens['neg'] = tokens.neg / nb.class_count_[1]
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,pos,neg
token,Unnamed: 1_level_1,Unnamed: 2_level_1
westerners,0.000375,0.000145
seconds,0.000188,0.001599
visitor,0.000375,0.000145
pollution,0.000751,0.000291
impressive,0.007132,0.000145


In [94]:
# calculate the ratio of spam-to-ham for each token
tokens['pos_ratio'] = tokens.pos / tokens.neg
tokens.sample(5, random_state=6)

Unnamed: 0_level_0,pos,neg,pos_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
westerners,0.000375,0.000145,2.582583
seconds,0.000188,0.001599,0.11739
visitor,0.000375,0.000145,2.582583
pollution,0.000751,0.000291,2.582583
impressive,0.007132,0.000145,49.069069


```
You should not look at spam ratio and directly interpret

    impressive is the most negative word
    seconds is the least negative word
```


In [95]:
# examine the DataFrame sorted by spam_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('pos_ratio', ascending=False)

Unnamed: 0_level_0,pos,neg,pos_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beautiful,0.017267,0.000145,118.798799
powerful,0.015953,0.000145,109.759760
beautifully,0.015015,0.000145,103.303303
gorgeous,0.013138,0.000145,90.390390
wonderful,0.012950,0.000145,89.099099
touching,0.012950,0.000145,89.099099
excellent,0.012200,0.000145,83.933934
moving,0.018393,0.000291,63.273273
inventive,0.008446,0.000145,58.108108
engrossing,0.008071,0.000145,55.525526


In [98]:
# look up the spam_ratio for a given token
tokens.loc['moving', 'pos_ratio']

63.27327327327327