### Read datasetSplit.txt and datasetSentences.txt
Segregate sentences based on sentence index and corresponding data set i.e 1 for train and 2 for test

In [23]:
import pandas as pd
import csv

#List containing sentences belonging to train data set
data_train=[]

#List containing sentences belonging to test data set
data_test=[]

# List containing Sentence ids belonging to train data set
trainIndex=[]

# List containing Sentence ids belonging to test data set
testIndex=[]

with open("datasetSplit.txt","r") as f:
    rd=list(csv.reader(f,delimiter=','))
    count=0
    for line in rd:
        if count==0:
            count=1
            continue       
        if line[1]=='1':
            trainIndex.append(line[0])
        if line[1]=='2':
            testIndex.append(line[0])
            
with open("datasetSentences.txt","r") as f:
    rd=list(csv.reader(f,delimiter='\t'))
    count=0
    for line in rd:
        if count==0:
            count=1
            continue        
        if line[0] in trainIndex:
            data_train.append(line[1])          
        if line[0] in testIndex:         
            data_test.append(line[1])
        

In [24]:
# Converting train and test list to Data Frames


df_train = pd.DataFrame(data_train,columns = ['Sentence'])
df_test = pd.DataFrame(data_test,columns = ['Sentence'])
df_train.head()

Unnamed: 0,Sentence
0,The Rock is destined to be the 21st Century 's...
1,The gorgeously elaborate continuation of `` Th...
2,Singer\/composer Bryan Adams contributes a sle...
3,You 'd think by now America would have had eno...
4,Yet the act is still charming here .


## Train Data Set

###### Create dictionary for Sentence : Phrase Id
Appending 0 for phrase ID in all sentences

In [25]:
data_sentence_train = {}
for item in data_train:
    data_sentence_train[item]='0'

###### Replacing 0 with actual phrase Id 

In [26]:
with open("dictionary.txt","r") as f:
    rd=list(csv.reader(f,delimiter='|'))
    for line in rd:
        if line[0] in data_sentence_train:
            data_sentence_train[line[0]]=line[1]

###### Create data frame from dict for Sentence: Phrase Id

In [27]:
import numpy as np
df_reqd = pd.DataFrame.from_dict(data_sentence_train,orient ='index')
df_reqd['Sentence']=df_reqd.index
df_reqd.columns=['Phrase_Id','Sentence']
df_reqd.index = np.arange(0, len(df_reqd))
df_reqd.head()

Unnamed: 0,Phrase_Id,Sentence
0,226166,The Rock is destined to be the 21st Century 's...
1,226300,The gorgeously elaborate continuation of `` Th...
2,225801,Singer\/composer Bryan Adams contributes a sle...
3,14646,You 'd think by now America would have had eno...
4,14644,Yet the act is still charming here .


###### Create dictionary for Phrase Id : Sentiment data

In [33]:
data_sentiment={}
with open("sentiment_labels.txt","r") as f:
    rd=list(csv.reader(f,delimiter='|'))
    count=0
    for line in rd:
        if count==0:
            count=1
            continue    
        data_sentiment[line[0]]=line[1]
        


###### create data frame 
1. Data Frame containing Sentiment and Phrase Id
2. Data Frame containing Phrase Id and Sentence

Goal -  To merge both data frames based on Phrase Id inorder to get sentiment value for each sentence

In [36]:
df_data_sentiment = pd.DataFrame.from_dict(data_sentiment,orient ='index')
df_data_sentiment['Phrase_Id']=df_data_sentiment.index
df_data_sentiment.columns=['Sentiment','Phrase_Id']

df_reqd = pd.DataFrame.from_dict(data_sentence_train,orient ='index')
df_reqd['Sentence']=df_reqd.index
df_reqd.columns=['Phrase_Id','Sentence']
df_reqd.index = np.arange(0, len(df_reqd))
df_reqd.head()


Unnamed: 0,Phrase_Id,Sentence
0,226166,The Rock is destined to be the 21st Century 's...
1,226300,The gorgeously elaborate continuation of `` Th...
2,225801,Singer\/composer Bryan Adams contributes a sle...
3,14646,You 'd think by now America would have had eno...
4,14644,Yet the act is still charming here .


###### Join both data frame based on Phrase Id

In [37]:
df = pd.merge(df_reqd, df_data_sentiment, on=['Phrase_Id'])
df = df.drop(['Phrase_Id'],axis=1)
df.head()

Unnamed: 0,Sentence,Sentiment
0,The Rock is destined to be the 21st Century 's...,0.69444
1,The gorgeously elaborate continuation of `` Th...,0.83333
2,Singer\/composer Bryan Adams contributes a sle...,0.625
3,You 'd think by now America would have had eno...,0.5
4,Yet the act is still charming here .,0.72222


Assigning labels to the sentiment values

Sentiment            Label

0-0.2     --------    1

0.2-0.4   --------    2

0.4-0.6   --------    3

0.6-0.8   --------    4

0.8-1.0   --------    5

In [51]:
def encodeCategory(x):
    if x>=0 and x<=0.2:
        return 1
    elif x>0.2 and x<=0.4:
        return 2
    elif x>0.4 and x<=0.6:
        return 3
    elif x>0.6 and x<=0.8:
        return 4
    elif x>0.8 and x<1:
        return 5
    else:
        return 3

df['Sentiment']=df['Sentiment'].astype('float').apply(encodeCategory)
df['Sentiment']=df['Sentiment'].astype('int')

In [52]:
df.isnull().sum()

Sentence                      0
Sentiment                     0
Sentence_without_stopwords    0
Sentence_lemmatized           0
dtype: int64

In [53]:
#Creating a column containing sentences without stopwords

import nltk

from nltk.corpus import stopwords
stop = stopwords.words('english')

df['Sentence_without_stopwords'] = df['Sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [54]:
#Creating column with lemmatized words

from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()

df['Sentence_lemmatized'] = df['Sentence'].apply(lambda x: ' '.join([wordnet.lemmatize(word) for word in x.split()]))

df.head()

Unnamed: 0,Sentence,Sentiment,Sentence_without_stopwords,Sentence_lemmatized
0,The Rock is destined to be the 21st Century 's...,4,The Rock destined 21st Century 's new `` Conan...,The Rock is destined to be the 21st Century 's...
1,The gorgeously elaborate continuation of `` Th...,5,The gorgeously elaborate continuation `` The L...,The gorgeously elaborate continuation of `` Th...
2,Singer\/composer Bryan Adams contributes a sle...,4,Singer\/composer Bryan Adams contributes slew ...,Singer\/composer Bryan Adams contributes a sle...
3,You 'd think by now America would have had eno...,3,You 'd think America would enough plucky Briti...,You 'd think by now America would have had eno...
4,Yet the act is still charming here .,4,Yet act still charming .,Yet the act is still charming here .


## For Test Data

###### Create dictionary for Sentence : Phrase Id
###### Appending 0 for phrase ID in all sentences

In [55]:
data_sentence_test = {}
for item in data_test:
    data_sentence_test[item]='0'

###### Replacing 0 with actual phrase Id 

In [56]:
with open("dictionary.txt","r") as f:
    rd=list(csv.reader(f,delimiter='|'))
    for line in rd:
        if line[0] in data_sentence_test:
            data_sentence_test[line[0]]=line[1]
            
count=0
for key,value in data_sentence_test.items():
    if value==0:
        count+=1


###### Create data frame from dict for Sentence: Phrase Id

In [57]:
import numpy as np
df_reqd_test = pd.DataFrame.from_dict(data_sentence_test,orient ='index')
df_reqd_test['Sentence']=df_reqd_test.index
df_reqd_test.columns=['Phrase_Id','Sentence']
df_reqd_test.index = np.arange(0, len(df_reqd_test))
df_reqd_test.head()

Unnamed: 0,Phrase_Id,Sentence
0,13995,Effective but too-tepid biopic
1,14123,If you sometimes like to go to the movies to h...
2,13999,"Emerges as something rare , an issue movie tha..."
3,14498,The film provides some great insight into the ...
4,14351,Offers that rare combination of entertainment ...


###### Create dictionary for Phrase Id : Sentiment data

In [58]:
data_sentiment_test={}
with open("sentiment_labels.txt","r") as f:
    rd=list(csv.reader(f,delimiter='|'))
    count=0
    for line in rd:
        if count==0:
            count=1
            continue    
        data_sentiment_test[line[0]]=line[1]

###### create data frame from dict for Data_sentiments_test

In [59]:
df_data_sentiment_test = pd.DataFrame.from_dict(data_sentiment_test,orient ='index')
df_data_sentiment_test['Phrase_Id']=df_data_sentiment_test.index
df_data_sentiment_test.columns=['Sentiment','Phrase_Id']
df_data_sentiment_test.head()

Unnamed: 0,Sentiment,Phrase_Id
0,0.5,0
1,0.5,1
2,0.44444,2
3,0.5,3
4,0.42708,4


###### Join both data frame based on Phrase Id

In [60]:
df_test = pd.merge(df_reqd_test, df_data_sentiment_test, on=['Phrase_Id'])
df_test = df_test.drop(['Phrase_Id'],axis=1)


In [61]:
df_test['Sentiment']=df_test['Sentiment'].astype('float').apply(encodeCategory)
df_test.isnull().sum()

Sentence     0
Sentiment    0
dtype: int64

## Train the model

In [82]:
#Training the data with sentence having lemmatized words

data = df['Sentence_lemmatized']

#Replace all the characters except a-z,A-Z with null
data.replace("[^a-zA-Z]"," ",regex=True,inplace=True)

df['Sentence_lemmatized']=df['Sentence_lemmatized'].str.lower()
model_train_Sentence = list(df['Sentence_lemmatized'])

In [83]:
# Cleaning the test data and lemmatizing before prediction

df_test['Sentence_lemmatized'] = df_test['Sentence'].apply(lambda x: ' '.join([wordnet.lemmatize(word) for word in x.split()]))
df_test['Sentence_lemmatized'].replace("[^a-zA-Z]"," ",regex=True,inplace=True)
df_test['Sentence_lemmatized']=df_test['Sentence_lemmatized'].str.lower()


test_transform = list(df_test['Sentence_lemmatized'])


### Trying with different models to vectorize and will compare the accuracy

1. Bag of words model
2. TF-IDF model

#### Bag of words model

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

countvector = CountVectorizer(ngram_range=(2,2))
traindataset = countvector.fit_transform(model_train_Sentence)

#Used Random Forest for classification

randomClassifier = RandomForestClassifier(n_estimators=200,criterion='entropy')
randomClassifier.fit(traindataset,df['Sentiment'])

test_transform = list(df_test['Sentence_lemmatized'])
test_dataset = countvector.transform(test_transform)
prediction = randomClassifier.predict(test_dataset)

In [78]:
from sklearn.metrics import accuracy_score

score = accuracy_score(df_test['Sentiment'],prediction)
print(score)

0.3253393665158371


#### TF-IDF model

In [92]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
cv = TfidfVectorizer()

X= cv.fit_transform(model_train_Sentence)

#Used Random Forest for classification

randomClassifier = RandomForestClassifier(n_estimators=200,criterion='entropy')
randomClassifier.fit(X,df['Sentiment'])

test_transform = list(df_test['Sentence_lemmatized'])
test_dataset = cv.transform(test_transform)
prediction = randomClassifier.predict(test_dataset)

In [93]:
from sklearn.metrics import accuracy_score

score = accuracy_score(df_test['Sentiment'],prediction)
print(score)

0.3918552036199095


In [None]:
### Accuracy

1. Bag of Words - 0.3253393665158371
2. TF-IDF model - 0.3918552036199095