## Loading the data set

In [0]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle

In [0]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle (1).json


{'kaggle.json': b'{"username":"anushkaparadkar","key":"600c091ab5dab7c51128e613096a44e9"}'}

In [0]:
!cp kaggle.json ~/.kaggle/

In [0]:
!kaggle datasets list -s Rotten

ref                                                           title                                             size  lastUpdated          downloadCount  
------------------------------------------------------------  ------------------------------------------------  ----  -------------------  -------------  
rpnuser8182/rotten-tomatoes                                   Rotten Tomatoes Movie Reviews                      4MB  2018-12-12 01:46:48            290  
sriramr/fruits-fresh-and-rotten-for-classification            Fruits fresh and rotten for classification         2GB  2018-08-24 15:05:40            275  
ayushkalla1/rotten-tomatoes-movie-database                    Rotten Tomatoes Movie Database                    12MB  2018-12-05 07:15:59            132  
nicolasgervais/rotten-tomatoes-480000-labeled-critic-reviews  Rotten Tomatoes: 480,000 Labeled Critic Reviews   26MB  2019-03-26 17:47:02             54  
iconix/ulmfit-rt                                              ULMFiT f

In [0]:
!kaggle datasets download -d abhipoo/sentiment-rotten-tomatoes --force

Downloading sentiment-rotten-tomatoes.zip to /content
  0% 0.00/1.76M [00:00<?, ?B/s]
100% 1.76M/1.76M [00:00<00:00, 173MB/s]


In [0]:
!unzip sentiment-rotten-tomatoes.zip


Archive:  sentiment-rotten-tomatoes.zip
replace train.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: train.tsv               
  inflating: test.tsv                


## Cleaning Data set

In [0]:
#importing all libraraies
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import nltk

In [0]:
df=pd.read_csv('train.tsv',sep='\t')
df1=pd.read_csv('test.tsv',sep='\t')

In [0]:
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:
#dropping unnecessary columns
df= df.drop(['PhraseId','SentenceId'],axis=1)
df.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [0]:

import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df['Processed_Reviews'] = df.Phrase.apply(lambda x: clean_text(x))

In [0]:
df.head()

Unnamed: 0,Phrase,Sentiment,Processed_Reviews
0,A series of escapades demonstrating the adage ...,1,series escapade demonstrate adage good goose a...
1,A series of escapades demonstrating the adage ...,2,series escapade demonstrate adage good goose
2,A series,2,series
3,A,2,
4,series,2,series


In [0]:
#creating bag of words
vectorizer=CountVectorizer(analyzer="word", \
                           tokenizer=None,  \
                           stop_words=None,  \
                           preprocessor=None, \
                           max_features=5000)
train_data_features=vectorizer.fit_transform(df)

In [0]:
from sklearn.model_selection import train_test_split
X = df['Processed_Reviews']
y = df['Sentiment']
msg_train,msg_test,label_train,label_test = train_test_split(X,y)

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
pipelineRFC = Pipeline([
    ('bow',CountVectorizer(analyzer="word")),
    ('tfidf',TfidfTransformer()),
    ('classifier',RandomForestClassifier())
])

In [0]:
from sklearn.metrics import classification_report
pipelineRFC.fit(msg_train,label_train)
preds = pipelineRFC.predict(msg_test)
print(classification_report(label_test,preds))



              precision    recall  f1-score   support

           0       0.44      0.40      0.42      1719
           1       0.53      0.49      0.51      6782
           2       0.72      0.79      0.76     20000
           3       0.54      0.50      0.52      8189
           4       0.49      0.38      0.43      2325

    accuracy                           0.64     39015
   macro avg       0.55      0.51      0.53     39015
weighted avg       0.63      0.64      0.63     39015



In [0]:
test_data = pd.read_csv('test.tsv',sep='\t')

In [0]:
test_data.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [0]:
test_predictions = pipelineRFC.predict(test_data['Phrase'])


In [0]:
test_predictions = pipelineRFC.predict(test_data['Phrase'])
phrase_id = test_data['PhraseId'].values

In [0]:
final_model = pd.DataFrame({'PhraseId':phrase_id,'Sentiment':test_predictions})
final_model.head()


Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [0]:
file_name="Moview Reviews Sentiment analysis.csv"
final_model.to_csv(file_name,index=False)

print('Saved file: ' + file_name)

Saved file: Moview Reviews Sentiment analysis.csv
