In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('/home/akki/Documents/Mini_Projects/movie-review-sentiment-analysis-kernels-only/train.tsv',delimiter='\t',encoding='utf-8')

In [3]:
train.shape

(156060, 4)

In [4]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
train['Phrase'].unique()

array(['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
       'A series of escapades demonstrating the adage that what is good for the goose',
       'A series', ..., 'avuncular chortles', 'avuncular', 'chortles'],
      dtype=object)

In [6]:
train['SentenceId'].unique()

array([   1,    2,    3, ..., 8542, 8543, 8544])

In [7]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [8]:
test['SentenceId'].unique()

array([ 8545,  8546,  8547, ..., 11853, 11854, 11855])

In [9]:
train['Phrase']

0         A series of escapades demonstrating the adage ...
1         A series of escapades demonstrating the adage ...
2                                                  A series
3                                                         A
4                                                    series
                                ...                        
156055                                            Hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [10]:
train['Sentiment'].unique()

array([1, 2, 3, 4, 0])

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [12]:
phrase = pd.DataFrame(train['Phrase'])
phrase

Unnamed: 0,Phrase
0,A series of escapades demonstrating the adage ...
1,A series of escapades demonstrating the adage ...
2,A series
3,A
4,series
...,...
156055,Hearst 's
156056,forced avuncular chortles
156057,avuncular chortles
156058,avuncular


In [13]:
sentiment = pd.DataFrame(train['Sentiment'])
sentiment

Unnamed: 0,Sentiment
0,1
1,2
2,2
3,2
4,2
...,...
156055,2
156056,1
156057,3
156058,2


In [14]:
from sklearn import preprocessing

std_scale = preprocessing.StandardScaler().fit(train[['Sentiment']])
df_std = std_scale.transform(train[['Sentiment']])

minmax_scale = preprocessing.MinMaxScaler().fit(train[['Sentiment']])
df_minmax = minmax_scale.transform(train[['Sentiment']])

In [15]:
from sklearn.preprocessing import LabelEncoder

feat = ['Sentiment']
for x in feat:
    le = LabelEncoder()
    le.fit(list(sentiment[x].values))
    sentiment[x] = le.transform(list(sentiment[x]))

In [16]:
for i,j in enumerate(train['Phrase']):
    train['Phrase'][i] = train['Phrase'][i].lower()
train['Phrase'] = train['Phrase'].str.lower()
train['Phrase']

0         a series of escapades demonstrating the adage ...
1         a series of escapades demonstrating the adage ...
2                                                  a series
3                                                         a
4                                                    series
                                ...                        
156055                                            hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [17]:
def change(t):
    t = t.split()
    return ' '.join([(i) for (i) in t if i not in stop])

In [18]:
from nltk.corpus import stopwords
stop = list(stopwords.words('english'))

In [19]:
remove_list = ['the','at','or','a','of']
for i,j in enumerate(train['Phrase']):
    t = train['Phrase'][i].split()
    train['Phrase'][i] = ' '.join([str(i) for (i) in t if i not in stop])

In [20]:
train['Phrase'].apply(change)

0         series escapades demonstrating adage good goos...
1           series escapades demonstrating adage good goose
2                                                    series
3                                                          
4                                                    series
                                ...                        
156055                                            hearst 's
156056                            forced avuncular chortles
156057                                   avuncular chortles
156058                                            avuncular
156059                                             chortles
Name: Phrase, Length: 156060, dtype: object

In [21]:
vectorizer = TfidfVectorizer(min_df=2, max_features=90000, strip_accents='unicode',lowercase =True,
                            analyzer='word', token_pattern=r'\w+', use_idf=True, 
                            smooth_idf=True, sublinear_tf=True, stop_words = 'english')
vectors = vectorizer.fit_transform(train['Phrase'])
vectors.shape

(156060, 14961)

In [23]:
# splitting the training data into train and valid sets

from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(vectors, sentiment, test_size=0.25, random_state = 42)

print(x_train.shape)
print(x_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(117045, 14961)
(39015, 14961)
(117045, 1)
(39015, 1)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)

print("Training Accuracy :", model.score(x_train, y_train))
print("Validation Accuracy :", model.score(x_valid, y_valid))
print("accuracy_score", accuracy_score(y_valid, y_pred))

Training Accuracy : 0.6744414541415695
Validation Accuracy : 0.6252466999871844
accuracy_score 0.6252466999871844


In [28]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_valid)

print("Training Accuracy :", model.score(x_train, y_train))
print("Validation Accuracy :", model.score(x_valid, y_valid))
print("accuracy_score", accuracy_score(y_valid, y_pred))

Training Accuracy : 0.8634884018967064
Validation Accuracy : 0.6126874279123414
accuracy_score 0.6126874279123414


In [29]:
from sklearn.ensemble import AdaBoostClassifier
clf_adaboost = AdaBoostClassifier(random_state=42)
clf_adaboost.fit(x_train, y_train)
y_pred = model.predict(x_valid)

print("Training Accuracy :", model.score(x_train, y_train))
print("Validation Accuracy :", model.score(x_valid, y_valid))
print("accuracy_score", accuracy_score(y_valid, y_pred))

Training Accuracy : 0.8634884018967064
Validation Accuracy : 0.6126874279123414
accuracy_score 0.6126874279123414
