In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# open_pickle

def open_pickle(path):
    with open(path, 'rb') as f:
        X = pickle.load(f)
    return X

### Load data

In [21]:
#Load dataset of sentence [relevant,-relevant]

X_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtrain.pickle')
X_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_xtest.pickle')
y_train_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytrain.pickle')
y_test_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_ytest.pickle')

#Load dataset of [whole corpus]

X_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtrain.pickle')
X_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_xtest.pickle')
y_train_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytrain.pickle')
y_test_original = open_pickle('../data/imdb/imdb_original_preprocessed_ytest.pickle')

#Load dataset of sentence [+/-]

X_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtrain.pickle')
X_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_xtest.pickle')
y_train_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytrain.pickle')
y_test_np_sentence = open_pickle('../data/imdb-sentence/imdb_sentence_np_ytest.pickle')

In [22]:
len(X_train_sentence)

1333

In [23]:
len(X_test_sentence)

667

### Preprocessing
#### Count Vectorizer

In [24]:
# Count Vectorizer on rel,unrel dataset
# Question : Why rel/unrel? Because it trained as the first step? 
# Any advantages on more vocabulary?

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# whole imdb corpus
X_train_original_bow = tf_vectorizer.fit_transform(X_train_original)
X_test_original_bow = tf_vectorizer.transform(X_test_original)

# rel/unrel sentence
X_train_sentence_bow = tf_vectorizer.transform(X_train_sentence)
X_test_sentence_bow = tf_vectorizer.transform(X_test_sentence)

# neg/pos sentence
X_train_np_bow = tf_vectorizer.transform(X_train_np_sentence)
X_test_np_bow = tf_vectorizer.transform(X_test_np_sentence) 

words = tf_vectorizer.get_feature_names()
print(len(words))

26266


In [13]:
# tf_vectorizer.fit(X_train_sentence)
# print(len(tf_vectorizer.get_feature_names()))
# 546 for sentence dataset

#### TfIdf Vectorizer

In [14]:
tfidf_vect = TfidfVectorizer(lowercase=True, max_df=1.0, min_df=5, token_pattern=token)

X_train_original_tf = tfidf_vect.fit_transform(X_train_original)
X_test_original_tf = tfidf_vect.transform(X_test_original)

# rel/unrel sentence
X_train_sentence_tf = tfidf_vect.transform(X_train_sentence)
X_test_sentence_tf = tfidf_vect.transform(X_test_sentence)



# neg/pos sentence
X_train_np_tf = tfidf_vect.transform(X_train_np_sentence)
X_test_np_tf = tfidf_vect.transform(X_test_np_sentence) 

words = tfidf_vect.get_feature_names()
print(len(words))

26266


In [15]:
print(X_train_sentence_tf[0,:])

  (0, 25572)	0.16648658547419504
  (0, 24629)	0.44979268693691304
  (0, 23418)	0.15287236514401725
  (0, 20922)	0.19081584193860818
  (0, 16602)	0.27256102198457793
  (0, 16323)	0.0798474583391674
  (0, 8846)	0.46566203926716615
  (0, 3805)	0.517173250795398
  (0, 2293)	0.3124435255035184
  (0, 1583)	0.2181575577087448


### Train A [rel,unrel] classifier

In [6]:
# Okay... Using the function makes me more overwhelmed. Let's do it manually.


random_state = 42
C = 1

clf_A = LogisticRegression(random_state=random_state, C=C)
clf_A.fit(X_train_sentence_bow, y_train_sentence)

y_predict = clf_A.predict(X_test_sentence_bow)

print('--------------')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_A.score(X_train_sentence_bow, y_train_sentence)))
print('Test:\t%.5f ' %(clf_A.score(X_test_sentence_bow, y_test_sentence)))
    
print(classification_report(y_test_sentence,y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.98425 
Test:	0.75862 
             precision    recall  f1-score   support

        0.0       0.81      0.73      0.77       363
        1.0       0.71      0.79      0.75       304

avg / total       0.76      0.76      0.76       667



### Train 1 [+,-] classifier

In [7]:
# using whole corpus
clf_1_i = LogisticRegression(random_state=random_state, C=C)
clf_1_i.fit(X_train_original_bow, y_train_original)

# using the [+/-] sentence

clf_1_j = LogisticRegression(random_state=random_state, C=C)
clf_1_j.fit(X_train_np_bow, y_train_np_sentence)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# Test baseline

y_predict = clf_1_i.predict(X_test_original_bow)

print('--------------')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_1_i.score(X_train_original_bow, y_train_original)))
print('Test:\t%.5f ' %(clf_1_i.score(X_test_original_bow, y_test_original)))
    
print(classification_report(y_test_original,y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.99672 
Test:	0.87304 
             precision    recall  f1-score   support

          0       0.87      0.88      0.87     12500
          1       0.88      0.87      0.87     12500

avg / total       0.87      0.87      0.87     25000



In [37]:
# Test baseline

y_predict = clf_1_j.predict(X_test_np_bow)

print('--------------')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_1_j.score(X_train_np_bow, y_train_np_sentence)))
print('Test:\t%.5f ' %(clf_1_j.score(X_test_np_bow, y_test_np_sentence)))
    
print(classification_report(y_test_np_sentence,y_predict))

--------------
C=1.00
--------------
Accuracy
Train:	0.99099 
Test:	0.76946 
             precision    recall  f1-score   support

        0.0       0.79      0.72      0.75       163
        1.0       0.75      0.82      0.78       171

avg / total       0.77      0.77      0.77       334



### Now the real deal

In [32]:
# Build an accuracy function excluding those -1

def nested_classifier(clf_A, clf_1_i, X_test_original, tf, threshold):
    y_pred_i = []
#     y_pred_j = []
    
    for ind, corpus in enumerate(X_test_original):
        '''
        Breakdown the corpus into sentence and transform into bag-of-words
        '''
        sentence_set = tf.transform(TextBlob(corpus).raw_sentences)

        '''
        related sentence classifier
        '''
        y_A_proba = clf_A.predict_proba(sentence_set)
        mu, mr = np.argmax(y_A_proba, axis=0)

        '''
        +/- classifier
        '''
        if y_A_proba[mr,1] > threshold:
            y_i_proba = clf_1_i.predict_proba(sentence_set[mr])
            y_pred_i.append(np.argmax(y_i_proba))

#             y_j_proba = clf_1_j.predict_proba(sentence_set[mr])
#             y_pred_j.append(np.argmax(y_j_proba))
        else:
            y_pred_i.append(-1)
#             y_pred_j.append(-1)
            continue
            
#     return np.array(y_pred_i), np.array(y_pred_j)
    return np.asarray(y_pred_i)

In [18]:
def rejection_rate(y):
    return np.sum(y==-1)/len(y)

def accuracy(y, y_pred):
    return np.sum(y_pred==y)/(np.sum(y_pred==1) + np.sum(y_pred==0))

### Logistic Regression

In [33]:
threshold = np.arange(0.5, 1, 0.05)
threshold = np.append(threshold, [0.96, 0.97, 0.98, 0.99])

In [36]:
# cv

lr_relevant = LogisticRegression(random_state=random_state)
lr_relevant.fit(X_train_sentence_bow, y_train_sentence)

lr_sentiment = LogisticRegression(random_state=random_state)
lr_sentiment.fit(X_train_original_bow, y_train_original)


lr_pred = []
for t in threshold :
    y_pred = nested_classifier(lr_relevant, lr_sentiment, X_test_original, tf_vectorizer, t)
    lr_pred.append(y_pred)
    print('%.2f \t %d \t %.5f \t %.5f' %(t, np.sum(y_pred==-1), rejection_rate(y_pred), accuracy(y_test_original, y_pred)))

0.50 	 483 	 0.01932 	 0.75911
0.55 	 611 	 0.02444 	 0.75956
0.60 	 786 	 0.03144 	 0.76014
0.65 	 1037 	 0.04148 	 0.76117
0.70 	 1380 	 0.05520 	 0.76194
0.75 	 1888 	 0.07552 	 0.76315
0.80 	 2621 	 0.10484 	 0.76460
0.85 	 3811 	 0.15244 	 0.76927
0.90 	 5975 	 0.23900 	 0.77593
0.95 	 10408 	 0.41632 	 0.78570
0.96 	 11884 	 0.47536 	 0.78614
0.97 	 13749 	 0.54996 	 0.79042
0.98 	 16057 	 0.64228 	 0.79582
0.99 	 19392 	 0.77568 	 0.80118


In [49]:
# tfidf

lr_relevant_tfidf = LogisticRegression(random_state=random_state)
lr_relevant_tfidf.fit(X_train_sentence_tf, y_train_sentence)

lr_sentiment_tfidf = LogisticRegression(random_state=random_state)
lr_sentiment_tfidf.fit(X_train_original_tf, y_train_original)


lr_pred_tf = []
for t in threshold :
    y_pred = nested_classifier(lr_relevant_tfidf, lr_sentiment_tfidf, X_test_original, tfidf_vect, t)
    lr_pred_tf.append(y_pred)
    print('%.2f \t %d \t %.5f \t %.5f' %(t, np.sum(y_pred==-1), rejection_rate(y_pred), accuracy(y_test_original, y_pred)))

0.50 	 784 	 0.03136 	 0.77994
0.55 	 2069 	 0.08276 	 0.78204
0.60 	 4553 	 0.18212 	 0.78740
0.65 	 7959 	 0.31836 	 0.79514
0.70 	 11997 	 0.47988 	 0.80328
0.75 	 16051 	 0.64204 	 0.81249
0.80 	 19425 	 0.77700 	 0.82422
0.85 	 22095 	 0.88380 	 0.83236
0.90 	 23859 	 0.95436 	 0.82559
0.95 	 24786 	 0.99144 	 0.82710
0.96 	 24865 	 0.99460 	 0.83704
0.97 	 24934 	 0.99736 	 0.83333
0.98 	 24978 	 0.99912 	 1.00000
0.99 	 25000 	 1.00000 	 nan


  """


### MultinomialNB

In [37]:
# CV

from sklearn.naive_bayes import MultinomialNB

mnb_relevant = MultinomialNB()
mnb_relevant.fit(X_train_sentence_bow, y_train_sentence)

mnb_sentiment = MultinomialNB()
mnb_sentiment.fit(X_train_original_bow, y_train_original)

mnb_pred = []
for t in threshold :
    y_pred = nested_classifier(mnb_relevant, mnb_sentiment, X_test_original, tf_vectorizer, t)
    mnb_pred.append(y_pred)
    print('%.2f \t %d \t %.5f \t %.5f' %(t, np.sum(y_pred==-1), rejection_rate(y_pred), accuracy(y_test_original, y_pred)))

0.50 	 344 	 0.01376 	 0.76071
0.55 	 426 	 0.01704 	 0.76137
0.60 	 487 	 0.01948 	 0.76168
0.65 	 560 	 0.02240 	 0.76211
0.70 	 669 	 0.02676 	 0.76253
0.75 	 798 	 0.03192 	 0.76312
0.80 	 989 	 0.03956 	 0.76361
0.85 	 1337 	 0.05348 	 0.76457
0.90 	 1862 	 0.07448 	 0.76549
0.95 	 3195 	 0.12780 	 0.76803
0.96 	 3697 	 0.14788 	 0.76900
0.97 	 4423 	 0.17692 	 0.76940
0.98 	 5685 	 0.22740 	 0.77080
0.99 	 8148 	 0.32592 	 0.77694


In [50]:
# tfidf

mnb_relevant_tf = MultinomialNB()
mnb_relevant_tf.fit(X_train_sentence_tf, y_train_sentence)

mnb_sentiment_tf = MultinomialNB()
mnb_sentiment_tf.fit(X_train_original_tf, y_train_original)

mnb_pred_tf = []
for t in threshold :
    y_pred = nested_classifier(mnb_relevant_tf, mnb_sentiment_tf, X_test_original, tfidf_vect, t)
    mnb_pred_tf.append(y_pred)
    print('%.2f \t %d \t %.5f \t %.5f' %(t, np.sum(y_pred==-1), rejection_rate(y_pred), accuracy(y_test_original, y_pred)))

0.50 	 112 	 0.00448 	 0.76909
0.55 	 346 	 0.01384 	 0.76961
0.60 	 976 	 0.03904 	 0.77085
0.65 	 2544 	 0.10176 	 0.77467
0.70 	 5624 	 0.22496 	 0.78102
0.75 	 10199 	 0.40796 	 0.79278
0.80 	 15371 	 0.61484 	 0.80870
0.85 	 20190 	 0.80760 	 0.84304
0.90 	 23562 	 0.94248 	 0.88595
0.95 	 24911 	 0.99644 	 0.96629
0.96 	 24988 	 0.99952 	 1.00000
0.97 	 24999 	 0.99996 	 1.00000


  """


0.98 	 25000 	 1.00000 	 nan
0.99 	 25000 	 1.00000 	 nan


### Baseline

In [19]:
from sklearn.naive_bayes import MultinomialNB

base_lr = LogisticRegression(random_state=random_state)
base_lr.fit(X_train_original_bow, y_train_original)

print(base_lr.score(X_test_original_bow, y_test_original))

base_mnb = MultinomialNB()
base_mnb.fit(X_train_original_bow, y_train_original)

print(base_mnb.score(X_test_original_bow, y_test_original))

0.87304
0.83476


In [20]:
base_lr_tf = LogisticRegression(random_state=random_state)
base_lr_tf.fit(X_train_original_tf, y_train_original)

print(base_lr_tf.score(X_test_original_tf, y_test_original))

base_mnb_tf = MultinomialNB()
base_mnb_tf.fit(X_train_original_tf, y_train_original)

print(base_mnb_tf.score(X_test_original_tf, y_test_original))

0.88792
0.83408


### Neural Nets

In [40]:
# np.savetxt('cos_sim.csv', np.around(cos_sim,2), delimiter=',')
print(C)

1


In [67]:
corp = "I have been following Marvel’s movies, until this Black Panther movie came out. To be honest, it was a good movie. However, it was boring in the middle. The scenario contains mainstream storyline. I feel like I am watching a Disney movie rather than Marvel. Lack of action, but still very entertaining. I watched it on my flight during the turbulence. At least, it did makes me distracted from the motion sickness. Good movie, recommended to all ages."

sent = tf_vectorizer.transform(TextBlob(corp).raw_sentences)


y_A_proba = clf_A.predict_proba(sent)
mu, mr = np.argmax(y_A_proba, axis=0)

for i in range(y_A_proba.shape[0]):
    print(TextBlob(corp).raw_sentences[i], y_A_proba[i])

print()
print(TextBlob(corp).raw_sentences[mr], y_A_proba[mr])
print(TextBlob(corp).raw_sentences[mu], y_A_proba[mu])

y_proba = clf_1_i.predict_proba(sent[mr])
y_proba

I have been following Marvel’s movies, until this Black Panther movie came out. [0.41722985 0.58277015]
To be honest, it was a good movie. [0.03844995 0.96155005]
However, it was boring in the middle. [0.3612379 0.6387621]
The scenario contains mainstream storyline. [0.66960472 0.33039528]
I feel like I am watching a Disney movie rather than Marvel. [0.2986911 0.7013089]
Lack of action, but still very entertaining. [0.04510595 0.95489405]
I watched it on my flight during the turbulence. [0.86232269 0.13767731]
At least, it did makes me distracted from the motion sickness. [0.7270355 0.2729645]
Good movie, recommended to all ages. [0.03508099 0.96491901]

Good movie, recommended to all ages. [0.03508099 0.96491901]
I watched it on my flight during the turbulence. [0.86232269 0.13767731]


array([[0.21747813, 0.78252187]])

In [65]:
corps = list([corp, corp])
x_corp = tf_vectorizer.transform(corps)
# x_corp.shape
y = clf_1_i.predict_proba(x_corp)

print(y)

[[0.19116551 0.80883449]
 [0.19116551 0.80883449]]


Now, let's take a look on the words

In [24]:
human_vocab = []
filelist = glob.glob(os.path.join("../data/human-provided-phrases", "*.txt"))
for file in filelist:
    with open(file) as f:
        for line in f:
            human_vocab.append(line.split('\t')[0])

In [25]:
len(human_vocab)

['2/10',
 '3/10',
 '7/10',
 '4/10',
 '8/10',
 '1/10',
 'unwatchable',
 'incoherent',
 'stinker',
 'mst3k',
 'unfunny',
 'waste',
 '9/10',
 'flawless',
 'atrocious',
 'pointless',
 'horrid',
 'superbly',
 'redeeming',
 '10/10',
 'laughable',
 'drivel',
 'worst',
 'perfection',
 'lousy',
 'awful',
 'wasting',
 'remotely',
 'poorly',
 'sucks',
 'captures',
 'wonderfully',
 'existent',
 'lame',
 'boredom',
 'uninspired',
 'miserably',
 'refreshing',
 'amateurish',
 'unintentional',
 'pathetic',
 'eathtaking',
 'appalling',
 'uninteresting',
 'unconvincing',
 'suck',
 'delightful',
 'idiotic',
 'wasted',
 'beautifully',
 'underrated',
 'crap',
 'stupidity',
 'dreadful',
 'tedious',
 'sadness',
 'horrible',
 'insulting',
 'dire',
 'mess',
 'superb',
 'gripping',
 'garbage',
 'timeless',
 'embarrassing',
 'badly',
 'insult',
 'terrible',
 'wooden',
 'touching',
 'worse',
 'cardboard',
 'unforgettable',
 'extraordinary',
 'inept',
 'stupid',
 'pile',
 'worthless',
 'ashamed',
 'junk',
 'illian

In [32]:
human_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=5, binary=True, token_pattern=token, vocabulary=human_vocab)
human_vectorizer.set_params(ngram_range=(1,1))

# whole imdb corpus
X_tr_org_hum_bow = human_vectorizer.fit_transform(X_train_original)
X_te_org_hum_bow = human_vectorizer.transform(X_test_original)

# rel/unrel sentence
X_tr_sent_hum_bow = human_vectorizer.transform(X_train_sentence)
X_te_sent_hum_bow = human_vectorizer.transform(X_test_sentence)


# neg/pos sentence
X_tr_np_hum_bow = human_vectorizer.transform(X_train_np_sentence)
X_te_np_hum_bow = human_vectorizer.transform(X_test_np_sentence) 


In [33]:
len(human_vectorizer.get_feature_names())

441

In [34]:
del clf_A
del clf_1_i
del clf_1_j

In [35]:
# Okay... Using the function makes me more overwhelmed. Let's do it manually.


random_state = 42
C = 1

clf_A = LogisticRegression(random_state=random_state, C=C)
clf_A.fit(X_tr_sent_hum_bow, y_train_sentence)

y_predict = clf_A.predict(X_te_sent_hum_bow)

print('--------------')
print('CLF A')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_A.score(X_tr_sent_hum_bow, y_train_sentence)))
print('Test:\t%.5f ' %(clf_A.score(X_te_sent_hum_bow, y_test_sentence)))
    
print(classification_report(y_test_sentence,y_predict))

# using whole corpus
clf_1_i = LogisticRegression(random_state=random_state, C=C)
clf_1_i.fit(X_tr_org_hum_bow, y_train_original)

y_predict = clf_1_i.predict(X_te_org_hum_bow)

print('--------------')
print('CLF A-1-i')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_1_i.score(X_tr_org_hum_bow, y_train_original)))
print('Test:\t%.5f ' %(clf_1_i.score(X_te_org_hum_bow, y_test_original)))
    
print(classification_report(y_test_original,y_predict))

# using the [+/-] sentence

clf_1_j = LogisticRegression(random_state=random_state, C=C)
clf_1_j.fit(X_tr_np_hum_bow, y_train_np_sentence)

y_predict = clf_1_i.predict(X_te_np_hum_bow)

print('--------------')
print('CLF A-1-j')
print('C=%.2f' %(C))
print('--------------')
print('Accuracy')
print('Train:\t%.5f ' %(clf_1_i.score(X_tr_np_hum_bow, y_train_np_sentence)))
print('Test:\t%.5f ' %(clf_1_i.score(X_te_np_hum_bow, y_test_np_sentence)))
    
print(classification_report(y_test_np_sentence,y_predict))

--------------
CLF A
C=1.00
--------------
Accuracy
Train:	0.56864 
Test:	0.62069 
             precision    recall  f1-score   support

        0.0       0.59      0.98      0.74       363
        1.0       0.90      0.19      0.31       304

avg / total       0.73      0.62      0.54       667

--------------
CLF A-1-i
C=1.00
--------------
Accuracy
Train:	0.79900 
Test:	0.79280 
             precision    recall  f1-score   support

          0       0.84      0.72      0.78     12500
          1       0.76      0.86      0.81     12500

avg / total       0.80      0.79      0.79     25000

--------------
CLF A-1-j
C=1.00
--------------
Accuracy
Train:	0.67117 
Test:	0.65269 
             precision    recall  f1-score   support

        0.0       0.96      0.30      0.46       163
        1.0       0.60      0.99      0.74       171

avg / total       0.77      0.65      0.60       334



In [36]:
threshold = np.arange(0.5, 1, 0.05)
threshold = np.append(threshold, [0.96, 0.97, 0.98, 0.99])

print('t \t ~rel \t rr \t acc_i \t acc_j')
for t in threshold : 
    y_pred_i, y_pred_j = nested_classifier(X_test_original, human_vectorizer, t)
    print('%.2f \t %d \t %.5f \t %.5f \t %.5f' %(t,
                                                 np.sum(y_pred_i==-1),
                                                 rejection_rate(y_pred_i), 
                                                 accuracy(y_test_original, y_pred_i), 
                                                 accuracy(y_test_original,y_pred_j)))

t 	 ~rel 	 rr 	 acc_i 	 acc_j
0.50 	 12535 	 0.50140 	 0.83730 	 0.82631
0.55 	 12642 	 0.50568 	 0.83743 	 0.82643
0.60 	 14326 	 0.57304 	 0.84720 	 0.84289
0.65 	 14877 	 0.59508 	 0.85024 	 0.84807
0.70 	 16277 	 0.65108 	 0.86278 	 0.86094
0.75 	 19218 	 0.76872 	 0.89640 	 0.89502
0.80 	 21225 	 0.84900 	 0.90172 	 0.90146
0.85 	 22962 	 0.91848 	 0.90922 	 0.90726
0.90 	 24441 	 0.97764 	 0.93918 	 0.93918
0.95 	 24849 	 0.99396 	 0.98675 	 0.99338
0.96 	 24893 	 0.99572 	 1.00000 	 1.00000
0.97 	 24947 	 0.99788 	 1.00000 	 1.00000
0.98 	 24974 	 0.99896 	 1.00000 	 1.00000
0.99 	 24992 	 0.99968 	 1.00000 	 1.00000


In [37]:
print(C)

1


In [38]:
len(human_vocab)

441