In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

In [2]:
imdb = pd.read_csv('imdb_labelled.txt', sep='\t',header=None)

In [3]:
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
imdb.shape

(748, 2)

In [5]:
amazon = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
yelp = pd.read_csv('yelp_labelled.txt', sep='\t', header=None)

In [6]:
df = pd.DataFrame()

In [7]:
df = df.append(imdb)

In [8]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [9]:
df = df.append(amazon)

In [10]:
df.shape

(1748, 2)

In [11]:
df = df.append(yelp)

In [12]:
df.shape

(2748, 2)

In [13]:
df.columns = ['review','sentiment']

In [14]:
df_tokens = []
for i in range(len(df)):
    df_tokens.append(word_tokenize(df['review'].iloc[i].lower()))

In [15]:
print(df_tokens[0])

['a', 'very', ',', 'very', ',', 'very', 'slow-moving', ',', 'aimless', 'movie', 'about', 'a', 'distressed', ',', 'drifting', 'young', 'man', '.']


In [16]:
eng_stopwords = stopwords.words('english')

In [17]:
eng_stopwords.extend([',','.',"'"])

In [18]:
words = []
for list_1 in df_tokens:
    main_words = []
    for word in list_1:
        if word not in eng_stopwords:
            main_words.append(word)
    words.append(main_words)

In [19]:
words[0]

['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man']

In [20]:
wnet = WordNetLemmatizer()

In [21]:
for i in range(len(words)):
    for j in range(len(words[i])):
        lemm = wnet.lemmatize(words[i][j], pos='v')
        words[i][j] = lemm

In [22]:
words[0]

['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man']

In [23]:
for i in range(len(words)):
    words[i] = ' '.join(words[i])

In [24]:
tfidf = TfidfVectorizer()

In [25]:
vect = tfidf.fit_transform(words)

In [26]:
vect.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
x_train,x_test,y_train,y_test = train_test_split(vect,df['sentiment'],test_size=0.25)

In [28]:
x_train.shape

(2061, 4372)

In [29]:
y_train.shape

(2061,)

In [30]:
reg = LogisticRegression()

In [31]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
y_pred = reg.predict(x_test)

In [33]:
accuracy_score(y_test, y_pred)

0.7903930131004366

In [34]:
confusion_matrix(y_test, y_pred)

array([[285,  62],
       [ 82, 258]], dtype=int64)

In [51]:
def pred(rev):
    words = []
    tokens = []
    text = [rev]
    for i in range(len(text)):
        tokens.append(word_tokenize(text[i].lower()))
    for j in tokens:
        main_words = []
        for k in j:
            if k not in eng_stopwords:
                main_words.append(k)
        words.append(main_words)
    for l in range(len(words)):
        for m in range(len(words[l])):
            lemm = wnet.lemmatize(words[l][m], pos='v')
            words[l][m] = lemm
    for n in range(len(words)):
        words[n] = ' '.join(words[n])
    vect = tfidf.transform(words)
    w_arr = vect.toarray()
    pred = reg.predict(w_arr)
    if pred == [0]:
        print("Negative review")
    else:
        print("Positive review")
    


In [52]:
rev = "The best marvel movie ever.. It was epic!! You'll enjoy every moment of the 3hr2min movie. All I can say is.. We love you 3000 AVENGERS!!"

In [53]:
pred(rev)

Positive review
