In [2]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
imdb = pd.read_csv('imdb_labelled.txt',sep='\t',header = None)

In [4]:
imdb.shape

(748, 2)

In [5]:
imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [6]:
amazon = pd.read_csv('amazon_cells_labelled.txt',sep='\t',header = None)
yelp = pd.read_csv('yelp_labelled.txt',sep='\t',header = None)


In [7]:
yelp.shape

(1000, 2)

In [8]:
df = pd.DataFrame()

In [9]:
df = pd.concat([imdb,amazon,yelp],ignore_index=True)

In [10]:
df.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [11]:
df.shape

(2748, 2)

In [12]:
df.columns = ['Review','Sentiment']

In [13]:
df.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [14]:
tokens = []
for i in range(len(df)):
    tokens.append(word_tokenize(df['Review'][i]))  

In [15]:
tokens

[['A',
  'very',
  ',',
  'very',
  ',',
  'very',
  'slow-moving',
  ',',
  'aimless',
  'movie',
  'about',
  'a',
  'distressed',
  ',',
  'drifting',
  'young',
  'man',
  '.'],
 ['Not',
  'sure',
  'who',
  'was',
  'more',
  'lost',
  '-',
  'the',
  'flat',
  'characters',
  'or',
  'the',
  'audience',
  ',',
  'nearly',
  'half',
  'of',
  'whom',
  'walked',
  'out',
  '.'],
 ['Attempting',
  'artiness',
  'with',
  'black',
  '&',
  'white',
  'and',
  'clever',
  'camera',
  'angles',
  ',',
  'the',
  'movie',
  'disappointed',
  '-',
  'became',
  'even',
  'more',
  'ridiculous',
  '-',
  'as',
  'the',
  'acting',
  'was',
  'poor',
  'and',
  'the',
  'plot',
  'and',
  'lines',
  'almost',
  'non-existent',
  '.'],
 ['Very', 'little', 'music', 'or', 'anything', 'to', 'speak', 'of', '.'],
 ['The',
  'best',
  'scene',
  'in',
  'the',
  'movie',
  'was',
  'when',
  'Gerardo',
  'is',
  'trying',
  'to',
  'find',
  'a',
  'song',
  'that',
  'keeps',
  'running',
  't

In [223]:
stopwordsList = stopwords.words("english")
stopwordsList.extend ([",",".","-","!"])

In [224]:
wordsList =[]
for tokenList in tokens:
    words = []
    for word in tokenList:
        if word.lower() not in stopwordsList:
            words.append(word.lower())
    wordsList.append(words)

In [225]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distressed', 'drifting', 'young', 'man'], ['sure', 'lost', 'flat', 'characters', 'audience', 'nearly', 'half', 'walked'], ['attempting', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angles', 'movie', 'disappointed', 'became', 'even', 'ridiculous', 'acting', 'poor', 'plot', 'lines', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'trying', 'find', 'song', 'keeps', 'running', 'head']]


In [226]:
wnet=WordNetLemmatizer()

In [227]:
for i in range(len(wordsList)):
    for j in range(len(wordsList[i])):
        wordsList[i][j]=wnet.lemmatize(wordsList[i][j],pos='v')

In [228]:
print(wordsList[:5])

[['slow-moving', 'aimless', 'movie', 'distress', 'drift', 'young', 'man'], ['sure', 'lose', 'flat', 'character', 'audience', 'nearly', 'half', 'walk'], ['attempt', 'artiness', 'black', '&', 'white', 'clever', 'camera', 'angle', 'movie', 'disappoint', 'become', 'even', 'ridiculous', 'act', 'poor', 'plot', 'line', 'almost', 'non-existent'], ['little', 'music', 'anything', 'speak'], ['best', 'scene', 'movie', 'gerardo', 'try', 'find', 'song', 'keep', 'run', 'head']]


In [229]:
cv=CountVectorizer()

In [230]:
wordsList=np.asarray(wordsList)

In [231]:
for i in range(len(wordsList)):
    wordsList[i]=' '.join(wordsList[i])

In [232]:
wordsList[0]

'slow-moving aimless movie distress drift young man'

In [233]:
vect=cv.fit_transform(wordsList)

In [234]:
y=df['Sentiment'].values
x_train,x_test,y_train,y_test=train_test_split(vect,y,test_size=0.25)

In [235]:
reg=LogisticRegression()

In [236]:
reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [237]:
y_pred = reg.predict(x_test)

In [238]:
accuracy_score(y_test,y_pred)

0.7976710334788938

In [239]:
confusion_matrix(y_test,y_pred)

array([[277,  58],
       [ 81, 271]], dtype=int64)

In [240]:
rev = "I loved it. i want to watch it again and the credits are still rolling. i will never stop singing tom holland's praise as the best spiderman we've had. and jake gyllenhaal was fantastic. an all around really great film. funny when it needed to be and emotional when it needed to be."

In [241]:
token = word_tokenize(rev.lower())

In [242]:
tokens = []
for word in token:
    if word not in stopwordsList:
        tokens.append(word)


In [243]:
for i in range(len(tokens)):
    tokens[i] = wnet.lemmatize(tokens[i],pos='v')

In [245]:
sent = ' '.join(tokens)

In [246]:
vect = cv.transform([sent])

In [247]:
reg.predict(vect)

array([1], dtype=int64)