In [1]:
# import required libraries
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

In [2]:
# Importing data
review = pd.read_csv("imdb_labelled.txt", sep='\t', names=['message','label'])
print(review.shape)
review.head()

(748, 2)


Unnamed: 0,message,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
# let's check if the dataset is imbalanced
review.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [4]:
# let's make a function to remove punctuations and stopwords
# we will use both stemming & lemmetization to see which performs better, and what's the difference

def data_cleaning_func(text):
    text = text.lower()
    text = [word for word in text if word not in string.punctuation]
    text = ''.join(text)
    text = [PorterStemmer().stem(word) for word in text.split(' ') if word not in stopwords.words('english')]
#     text = [WordNetLemmatizer().lemmatize(word) for word in text.split(' ') if word not in stopwords.words('english')]
    text = ' '.join(text)
    return text

In [5]:
# lets test our function
# sent = 'my name is Ashwini Kumar!! its fun learning python. how are u doing?? keeping good??'
# data_cleaning_func(sent)

# review.head()['message'].apply(data_cleaning_func)

In [6]:
x = review['message'].apply(data_cleaning_func)
print(type(x))
x

<class 'pandas.core.series.Series'>


0        slowmov aimless movi distress drift young man  
1      sure lost  flat charact audienc nearli half wa...
2      attempt arti black  white clever camera angl m...
3                              littl music anyth speak  
4      best scene movi gerardo tri find song keep run...
                             ...                        
743              got bore watch jessic lang take cloth  
744    unfortun virtu film product work lost regrett ...
745                                     word embarrass  
746                                         except bad  
747                insult one intellig huge wast money  
Name: message, Length: 748, dtype: object

In [7]:
# Let's experiment with word vectorizing techniques

# Bog of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer().fit(x)
x_bow = cv.transform(x).toarray()

# Tf-Idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(x).toarray()

In [8]:
print(x_bow.shape)
x_bow

(748, 2543)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
print(x_tfidf.shape)
x_tfidf

(748, 2543)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
y = review['label']
print(y.shape)
y

(748,)


0      0
1      0
2      0
3      0
4      1
      ..
743    0
744    0
745    0
746    0
747    0
Name: label, Length: 748, dtype: int64

In [11]:
# Let's split our train-test sets
from sklearn.model_selection import train_test_split
x_bow_train, x_bow_test, y_train, y_test = train_test_split(x_bow,y, test_size=0.2)
x_tfidf_train, x_tfidf_test, y_train, y_test = train_test_split(x_tfidf,y, test_size=0.2)

In [12]:
from sklearn.naive_bayes import MultinomialNB
model1 = MultinomialNB().fit(x_bow_train,y_train)
model2 = MultinomialNB().fit(x_tfidf_train,y_train)

In [13]:
y_pred1 = model1.predict(x_bow_test)
y_pred2 = model2.predict(x_tfidf_test)

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

print('BAG OF WORDS MODEL')
print('Confusion matrix: \n', confusion_matrix(y_test,y_pred1))
print('Accuracy: ', accuracy_score(y_test,y_pred1))
print('f1 score: ', f1_score(y_test,y_pred1))

print()

print('TF-IDF MODEL')
print('Confusion matrix: \n', confusion_matrix(y_test,y_pred2))
print('Accuracy: ', accuracy_score(y_test,y_pred2))
print('f1 score: ', f1_score(y_test,y_pred2))

BAG OF WORDS MODEL
Confusion matrix: 
 [[33 49]
 [39 29]]
Accuracy:  0.41333333333333333
f1 score:  0.3972602739726027

TF-IDF MODEL
Confusion matrix: 
 [[53 29]
 [ 6 62]]
Accuracy:  0.7666666666666667
f1 score:  0.7798742138364779


In [15]:
# Let's take random sentences and check sentiments

def sentiment_func(sent):
    t = [data_cleaning_func(sent)]
    t = tfidf.transform(t)
    array = model2.predict(t)
    
    if array[0] == 0:
        sentiment = 'Negative sentiment'
    else:
        sentiment = 'Positive Sentiment'
    
    print('Sent 1:', sent, '   Sentiment:', sentiment)

    
import numpy as np
sent1 = 'movie was terrible'
sent2 = 'food was great'
sent3 = 'he is a good person'

sentiment_func(sent1)
sentiment_func(sent2)
sentiment_func(sent3)

Sent 1: movie was terrible    Sentiment: Negative sentiment
Sent 1: food was great    Sentiment: Positive Sentiment
Sent 1: he is a good person    Sentiment: Positive Sentiment


### Conclusion

- tf-Idf is givng better performance than bag of words. It is expected too, as tf-idf weights the words according to their importance.