In [116]:

import sklearn
import nltk
nltk.download('punkt')
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to /Users/zileto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


`data` holds the features, i.e. the input variables the model learns from in order to predict; `target` holds the actual labels.

In [65]:
# Root folder of the movie-review corpus.
# The default is the original local path; set the MOVIE_REVIEWS_DIR
# environment variable to run the notebook on another machine without
# editing this cell.
import os

m_dir = os.environ.get(
    'MOVIE_REVIEWS_DIR',
    r'/Users/Zileto/ipython-in-depth/notebooks/movie_reviews',
)

In [40]:
# Load every file found under m_dir's sub-folders into m_train; each
# sub-folder name ('neg', 'pos') becomes a class label.
# shuffle=True returns the documents in shuffled rather than folder order.
m_train = load_files(m_dir, shuffle=True)

In [41]:
# Number of review documents that were loaded.
len(m_train.data)

2000

In [42]:
# .target_names lists the class names present in m_train,
# in this case negative ('neg') and positive ('pos').
m_train.target_names

['neg', 'pos']

In [48]:
# The content of a particular file in the training set is available by
# index; documents are stored as raw bytes (note the b"..." in the output).
# [:500] keeps only the first 500 characters so the output stays short.

m_train.data[0][:500]

b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

In [49]:
# Path of the file that document 0 was read from.
m_train.filenames[0]

'/Users/Zileto/ipython-in-depth/notebooks/movie_reviews/neg/cv405_21868.txt'

In [62]:
# target is the variable you want to predict, stored as an integer that
# indexes target_names: 0 = 'neg', 1 = 'pos'.
# Label of document 0:
m_train.target[0]

0

In [61]:
# Label of document 1.
m_train.target[1]

1

In [130]:
# Build m_vec, a CountVectorizer that tokenizes with NLTK's word_tokenize
# instead of scikit-learn's default token pattern (which drops punctuation
# and single-character tokens), so marks such as '?' become features.
# min_df=2: a token must occur in at least two documents to be kept.
m_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)

# Learn the vocabulary and turn the movie training data into a sparse
# document-by-token matrix of word frequency counts.
m_counts = m_vec.fit_transform(m_train.data)

In [134]:
# (rows, columns): 2000 documents by 25280 unique tokens in the vocabulary.
m_counts.shape

(2000, 25280)

In [145]:
# vocabulary_ maps each token to its column index in the count matrix.
m_vec.vocabulary_.get('?')  # which index is assigned to '?'

411

In [147]:
# Column index assigned to the token 'seagal'.
m_vec.vocabulary_.get('seagal')

19657

In [138]:
# Dense view of the count matrix, for a quick look only: toarray()
# materializes every cell of the 2000 x 25280 matrix.
# Reading the first row (document 0): there are 0 occurrences of the token
# with index 0, 2 occurrences of the token with index 2, and so on.
m_counts.toarray()

array([[0, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

TF-IDF values: convert the raw frequency counts (occurrences) of tokens to TF-IDF (Term Frequency – Inverse Document Frequency) values.


In [148]:
# Re-weight the raw counts as TF-IDF values.
# fit_transform(..) performs two steps in one call:
#   fit(..)       fits the estimator to the data (m_counts)
#   transform(..) converts the count matrix to its TF-IDF representation
# Effect: raw counts are normalized against document length, and terms
# that are found across many documents are weighted down.
tfidf_transformer = TfidfTransformer()
m_tfidf = tfidf_transformer.fit_transform(m_counts)

In [151]:
# Dense view of the TF-IDF matrix, for inspection only (this materializes
# every cell of the matrix).
m_tfidf.toarray()

array([[0.        , 0.        , 0.03844965, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [153]:
# Same shape as the count matrix: the TF-IDF step re-weights values
# without adding or removing documents or tokens.
m_tfidf.shape

(2000, 25280)

Training and testing a Naive Bayes classifier

In this section I will build a classifier using MultinomialNB

In [160]:
# Split the TF-IDF matrix and its labels into training and test sets:
# 20% of the documents are held out, and the fixed random_state keeps
# the split repeatable.
# NOTE(review): the vocabulary and IDF weights were fitted on the whole
# corpus before this split, so the held-out documents already influenced
# the features -- consider fitting them on the training split only.
from sklearn.model_selection import train_test_split

docs_train, docs_test, y_train, y_test = train_test_split(
    m_tfidf,
    m_train.target,
    test_size=0.20,
    random_state=12,
)

In [165]:
# Train a Multinomial Naive Bayes classifier on the training split.
clf = MultinomialNB().fit(docs_train, y_train)

In [170]:
# Predict labels for the test set and measure accuracy, i.e. the fraction
# of test documents whose predicted label equals the true label.
# accuracy_score is imported explicitly: a bare `import sklearn` does not
# guarantee that the sklearn.metrics submodule has been loaded, so the
# attribute lookup only worked when another import happened to load it.
from sklearn.metrics import accuracy_score

y_pred = clf.predict(docs_test)
accuracy_score(y_test, y_pred)

0.82

In [181]:
# Making the Confusion Matrix
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)
#cm

Testing on a random set 'reviews_new'


In [180]:

# A handful of hand-written one-line reviews used to probe the classifier.
reviews_new = [
    'This movie was excellent',
    'Absolute joy ride',
    'Steven Seagal was terrible',
    'Steven Seagal shined through.',
    'This was certainly a movie',
    'Two thumbs up',
    'I fell asleep halfway through',
    "We can't wait for the sequel!!",
    '!',
    '?',
    'I cannot recommend this highly enough',
    'instant classic.',
    'Steven Seagal was amazing. His performance was Oscar-worthy.',
]

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform,
# not fit_transform) so the new texts are encoded with the same
# vocabulary and weights as the training data.
reviews_new_counts = m_vec.transform(reviews_new)
reviews_new_tfidf = tfidf_transformer.transform(reviews_new_counts)

In [178]:
# Have the classifier predict a class index (0 = 'neg', 1 = 'pos') for
# each of the new reviews.
predicted = clf.predict(reviews_new_tfidf)

In [179]:
# Print each new review next to the name of its predicted class.
# The loop iterates over `predicted`, the array returned by clf.predict on
# reviews_new_tfidf; the name `pred` used before is never assigned in this
# notebook and raises NameError on a fresh kernel.
for review, category in zip(reviews_new, predicted):
    print('%r  >>  %s' % (review, m_train.target_names[category]))

'This movie was excellent'  >>  pos
'Absolute joy ride'  >>  pos
'Steven Seagal was terrible'  >>  neg
'Steven Seagal shined through.'  >>  neg
'This was certainly a movie'  >>  neg
'Two thumbs up'  >>  neg
'I fell asleep halfway through'  >>  neg
"We can't wait for the sequel!!"  >>  neg
'!'  >>  neg
'?'  >>  neg
'I cannot recommend this highly enough'  >>  pos
'instant classic.'  >>  pos
'Steven Seagal was amazing. His performance was Oscar-worthy.'  >>  neg
