In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import nltk

%matplotlib inline


In [2]:
# Load the tab-separated review file. It has no header row, so pandas assigns
# integer column labels. quoting=3 (csv.QUOTE_NONE) keeps quote characters in
# the review text as literal characters.
imdb_reviews_path = "/home/shrestha/mPercept/Natural Language Processing/Sentiment Analysis/IMDB reviews/imdb_reviews.csv"
datasets = pd.read_csv(
    imdb_reviews_path,
    delimiter='\t',
    quoting=3,
    header=None,
)

                       

In [3]:
# Name the two columns: the review text and its label (0/1 in the rows displayed below).
datasets.columns = ['reviews', 'Likes']


In [4]:
# Display the loaded frame to check the renamed columns.
datasets

Unnamed: 0,reviews,Likes
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [6]:
ps = PorterStemmer() # stemmer instance used by the stemming clean-up cell
word_net_lemmatizer = WordNetLemmatizer() # lemmatizer instance used by the lemmatizing clean-up cell

In [7]:
corpus = [] # cleaned review strings are appended here by the cleaning cells

In [8]:
# Cleaning the text with the Porter stemmer.
# The stop-word set is built once up front; the original rebuilt it for every
# single word, which made the loop needlessly slow.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell cannot append a second
# copy of the reviews (which would break the one-row-per-review alignment
# between `corpus` and the labels).
corpus = []
# Iterate over every review instead of a hard-coded range(0, 1000).
for raw_review in datasets['reviews']:
    # Keep letters only, lower-case, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words and reduce the remaining words to their stems.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [None]:
# Cleaning the text with the WordNet lemmatizer (alternative to the stemming cell).
# Results are collected in their own list: the original appended to `corpus`,
# so running this cell after the stemming cell doubled `corpus` and broke its
# one-row-per-review alignment with the labels.
english_stopwords = set(stopwords.words('english'))  # built once, not once per word

corpus_lemmatized = []
# Iterate over every review instead of a hard-coded range(0, 1000).
for raw_review in datasets['reviews']:
    # Keep letters only, lower-case, then split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words and lemmatize the remaining words.
    lemmas = [word_net_lemmatizer.lemmatize(token) for token in tokens if token not in english_stopwords]
    corpus_lemmatized.append(' '.join(lemmas))

In [9]:
# Show the cleaned reviews (one space-joined string per review).
corpus

['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema',
 'think fil

In [10]:
# Wrap the cleaned reviews in a single-column frame named 'Review'.
df = pd.DataFrame(corpus, columns=['Review'])

In [11]:
# Display the cleaned-review frame.
df


Unnamed: 0,Review
0,slow move aimless movi distress drift young man
1,sure lost flat charact audienc nearli half walk
2,attempt arti black white clever camera angl mo...
3,littl music anyth speak
4,best scene movi gerardo tri find song keep run...
5,rest movi lack art charm mean empti work guess...
6,wast two hour
7,saw movi today thought good effort good messag...
8,bit predict
9,love cast jimmi buffet scienc teacher


In [12]:
from sklearn.feature_extraction.text import CountVectorizer


In [13]:
# Learn a bag-of-words vocabulary from the cleaned corpus, capped at 2300 features.
cv = CountVectorizer(max_features = 2300)
fit_corpus = cv.fit(corpus)

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); prefer the new method and fall back to the old one
# so the cell runs on either version.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()
print(feature_names)

['aailiyah', 'abandon', 'abil', 'abroad', 'absolut', 'abstrus', 'abysm', 'academi', 'accent', 'access', 'acclaim', 'accolad', 'accur', 'accus', 'achiev', 'achil', 'ackerman', 'act', 'action', 'actor', 'actress', 'actual', 'ad', 'adam', 'adapt', 'add', 'addit', 'admin', 'admir', 'admit', 'ador', 'adrift', 'adventur', 'advis', 'aerial', 'aesthet', 'affect', 'affleck', 'afraid', 'africa', 'afternoon', 'age', 'ago', 'agre', 'aimless', 'air', 'akasha', 'akin', 'alert', 'alexand', 'alik', 'allison', 'allow', 'almost', 'along', 'alongsid', 'alreadi', 'also', 'although', 'alway', 'amateurish', 'amaz', 'amazingli', 'america', 'american', 'among', 'amount', 'amus', 'anatomist', 'angel', 'angela', 'angelina', 'angl', 'angri', 'angu', 'anguish', 'anim', 'anita', 'ann', 'anniversari', 'annoy', 'anoth', 'anthoni', 'antithesi', 'anyon', 'anyth', 'anyway', 'apart', 'appal', 'appeal', 'appear', 'applaud', 'applaus', 'appreci', 'appropri', 'apt', 'argu', 'armageddon', 'armand', 'around', 'array', 'art',

In [14]:
transform_corpus = cv.transform(corpus) # document-term counts; one-step alternative: X = cv.fit_transform(corpus).toarray()

In [15]:
# Convert the vectorizer output to a dense NumPy array.
transform_corpus_toarray = transform_corpus.toarray()

In [16]:
# Inspect the dense count matrix and its (documents, features) shape.
print(transform_corpus_toarray)
print(transform_corpus_toarray.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1000, 2300)


In [17]:
X = transform_corpus_toarray # feature matrix passed to train_test_split

In [40]:
# Persist the fitted CountVectorizer to disk.
# pickle is imported here because the notebook's own `import pickle` cell sits
# further down the file, so a top-to-bottom run reached this cell with the
# name undefined.
import pickle

# The context manager closes the file even if pickling raises.
with open("count_vectorizer.pickle", "wb") as save_count_vectorizer:
    pickle.dump(cv, save_count_vectorizer)

In [18]:
# Sanity check of the feature-matrix shape.
X.shape

(1000, 2300)

In [19]:
y = datasets.iloc[:, 1].values # second column ('Likes') as a NumPy array of labels


In [20]:
# Inspect the label vector.
print(y)

[0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1
 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1
 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1
 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1
 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 1
 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 0 0 1 1 1 

In [21]:
# The sklearn.cross_validation module no longer exists in current scikit-learn
# releases; train_test_split lives in sklearn.model_selection. Fall back to
# the old location only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [22]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle reproducible.
split_parts = train_test_split(X, y, test_size=0.20, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [23]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB


In [24]:
# Fit a Gaussian naive Bayes model on the training split.
# NOTE(review): the following cell rebinds `classifier` to a MultinomialNB,
# so this fitted model is discarded before any prediction is made.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [25]:
# Fit a multinomial naive Bayes model (alpha=0.1); this replaces whatever
# `classifier` was bound to before.
classifier = MultinomialNB(alpha = 0.1)
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [39]:
# Inspect the held-out feature rows.
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
# Predict labels for the held-out rows with the currently bound `classifier`.
y_pred = classifier.predict(X_test)

In [27]:
from sklearn.metrics import confusion_matrix

In [28]:
# Confusion matrix of true vs. predicted test labels.
cm = confusion_matrix(y_test, y_pred)

In [29]:
# Display the confusion matrix.
cm

array([[69, 17],
       [27, 87]])

In [30]:
# Accuracy of the Gaussian naive Bayes model, as a percentage.
# The original hard-coded numbers copied from an earlier run. By this point
# `classifier` (and therefore `cm`) belongs to the MultinomialNB model, so the
# Gaussian score is recomputed here from a fresh fit on the same split.
gaussian_model = GaussianNB().fit(X_train, y_train)
accuracy = float(gaussian_model.score(X_test, y_test)) * 100
print("The accuracy of the Gaussian naive bayes model using stemmer is : ", accuracy)

The accuracy of the Gaussian naive bayes model using stemmer is :  67.5


In [31]:
# Accuracy of the Multinomial naive Bayes model, as a percentage.
# Derived from the confusion matrix (correct predictions on the diagonal
# divided by all test samples) instead of hand-copied numbers, so it stays
# correct when the data or model changes.
accuracy = float(np.trace(cm)) / float(np.sum(cm)) * 100
print("The accuracy of the model Multinomial naive bayes using stemmer is : ", accuracy)

The accuracy of the model Multinomial naive bayes using stemmer is :  78.0


In [32]:
import pickle


In [33]:
# Persist the trained classifier to disk; the context manager closes the file
# even if pickling raises.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(classifier, save_classifier)

In [36]:
# Reload the classifier saved by this notebook; the context manager closes the
# file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("naivebayes.pickle", "rb") as open_classifier:
    classifier1 = pickle.load(open_classifier)

In [37]:
# Predict with the reloaded classifier on the same held-out rows.
y_pred11 = classifier1.predict(X_test)

In [38]:
# Display the predictions made by `classifier` (not the reloaded model's `y_pred11`).
y_pred

array([1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0])

In [None]:
# Display the test-set predictions again.
y_pred

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve


In [None]:
# ROC AUC measures how well a continuous score ranks positives above
# negatives. Feeding it hard 0/1 predictions collapses the curve to a single
# threshold, so use the predicted probability of the positive class instead.
# Column 1 of predict_proba corresponds to label 1 when the classes are 0 and 1.
positive_class_proba = classifier.predict_proba(X_test)[:, 1]
score = roc_auc_score(y_test, positive_class_proba)
score

In [None]:
# Splitting the raw reviews and labels into train and test sets.
# The same test_size and random_state are used as for the feature split, so
# the rows of a_test are expected to line up positionally with X_test / y_pred.
a_train, a_test, b_train, b_test = train_test_split(datasets['reviews'][:], datasets['Likes'][:], test_size = 0.20, random_state = 0)

training_sets = pd.DataFrame({'Training_review':a_train})
test_sets = pd.DataFrame({'Test_review' : a_test})

training_sets['Trainig_likes'] = b_train
test_sets['Test_likes'] = b_test

# y_pred is a plain array, so it is assigned by position.
test_sets['Test_likes_predicted'] = y_pred

# Map every prediction to a readable label (0 -> 'Bad', anything else -> 'Good')
# without assuming a fixed test-set size of 200.
predicted_labels = np.where(np.asarray(y_pred) == 0, 'Bad', 'Good')

# Kept as a DataFrame for anything that still expects `y_pred_chr` in that form.
y_pred_chr = pd.DataFrame(predicted_labels)

# Assign the array, not the DataFrame: the DataFrame carries a 0..n-1 index
# while test_sets keeps the shuffled original row labels, and pandas aligns on
# index, which left most rows NaN or paired with the wrong prediction.
test_sets['test_likes_predicted_chr'] = predicted_labels


In [None]:
# Display test reviews alongside true and predicted labels.
test_sets

# Tf-idf 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned corpus.
tfidf_vectorizer = TfidfVectorizer().fit(corpus)

In [None]:
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); prefer the new method and fall back to the old one
# so the cell runs on either version.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print(tfidf_feature_names)

In [None]:
# The TF-IDF vocabulary was learned from the cleaned `corpus`, so transforming
# the raw review text in a_test would miss most of it. Transform the cleaned
# text of the same rows instead.
# Assumes corpus[i] is the cleaned form of datasets row i and that a_test
# keeps the original integer row labels - TODO confirm if the index is changed.
a_test_cleaned = [corpus[row_label] for row_label in a_test.index]
tfidf_vectorizer_transform = tfidf_vectorizer.transform(a_test_cleaned)

In [None]:
# Print the TF-IDF matrix as a dense array.
print(tfidf_vectorizer_transform.toarray())

In [None]:
# Show the TF-IDF weights as a frame with one column per vocabulary term.
# get_feature_names() is gone from newer scikit-learn releases, so use
# get_feature_names_out() when it exists and fall back otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_columns = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_columns = tfidf_vectorizer.get_feature_names()
pd.DataFrame(tfidf_vectorizer_transform.toarray(), columns = tfidf_columns)

# Visualization 

In [None]:
from nltk import FreqDist
from collections import Counter


In [None]:
# Display the cleaned reviews again before building the frequency plots.
corpus

In [None]:
# Join all cleaned reviews into one space-separated string.
corpus_combine = ' '.join(corpus)

In [None]:
# Check the type of the joined corpus.
type(corpus_combine)

In [None]:
# Whitespace tokenization of the combined corpus, then a term -> count map.
words = corpus_combine.split()
print(words)
frequency_map = Counter(words)

# Inputs for the scatter plots: the first `indices_max` distinct terms in
# first-seen order, their counts, and their positions.
# NOTE(review): this rebinds `X`, which earlier cells use for the feature matrix.
indices_max = 200 # max number of points to plot
words_plot = [term for term in frequency_map][:indices_max]
Y = [frequency_map[term] for term in words_plot]
X = list(range(len(Y)))

In [None]:
# Quick scatter of term position (X) against term count (Y).
plt.scatter(X,Y)

In [None]:
# Same scatter drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots()
ax.scatter(X, Y)


In [None]:
# Larger scatter plot in which every point is annotated with its term.
fig, ax = plt.subplots(figsize=(15, 15))
ax.scatter(X, Y)

# words_plot, X and Y are parallel lists, so walk them together.
for label, x_pos, y_pos in zip(words_plot, X, Y):
    ax.annotate(label, (x_pos, y_pos))

In [None]:
# Tokenize the combined corpus with NLTK and build a frequency distribution.
tokens = nltk.word_tokenize(corpus_combine)
#print(tokens)
freq = FreqDist(tokens)
# Print how often the token 'i' occurs.
print(freq['i'])
# Plot 25 tokens from the distribution as a non-cumulative chart.
freq.plot(25, cumulative=False)