# How to Build a Sentiment Analysis Model Using a Random Forest Classifier

### Import required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Read training data

In [2]:
# Load the labelled reviews from a CSV in the working directory, decoding it as latin-1.
df = pd.read_csv("sentiment_labelled.csv", encoding='latin-1')

In [3]:
# Preview up to the first 20 rows: the review text and its integer label.
df.head(20)

Unnamed: 0,review,label
0,not bad,1
1,not good,0
2,Wow... Loved this place.,1
3,Crust is not good.,0
4,Not tasty and the texture was just nasty.,0
5,Stopped by during the late May bank holiday of...,1
6,The selection on the menu was great and so wer...,1
7,Now I am getting angry and I want my damn pho.,0
8,Honeslty it didn't taste THAT fresh.),0
9,The potatoes were like rubber and you could te...,0


In [4]:
# Check column dtypes and non-null counts before vectorizing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1753 entries, 0 to 1752
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  1753 non-null   object
 1   label   1753 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 27.5+ KB


In [5]:
# Inspect the last rows of the dataset as well.
df.tail()

Unnamed: 0,review,label
1748,Exceptionally bad!,0
1749,All in all its an insult to one's intelligence...,0
1750,food sucks,0
1751,not good food,0
1752,sucks,0


In [6]:
# Value in the first row, first column (the text of the first review).
df.iloc[0,0]

'not bad'

In [7]:
# Value in the last row, first column (the text of the last review).
df.iloc[-1,0]

'sucks'

### Vectorize data

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB  # NOTE(review): not used in the visible cells

# Learn the vocabulary from all reviews and encode each review as one row of
# per-token counts (a sparse document-term matrix).
vectorizer = CountVectorizer()
reviews = vectorizer.fit_transform(df['review'].values)

In [9]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns a NumPy array of the
# vocabulary terms, indexed by their column in the count matrix.
vectorizer.get_feature_names_out()

['00',
 '10',
 '100',
 '11',
 '12',
 '13',
 '15',
 '15pm',
 '17',
 '18th',
 '1928',
 '1947',
 '1948',
 '1949',
 '1971',
 '1973',
 '1979',
 '1980',
 '1986',
 '1995',
 '1998',
 '20',
 '2005',
 '2006',
 '2007',
 '20th',
 '23',
 '25',
 '30',
 '30s',
 '35',
 '40',
 '40min',
 '45',
 '4ths',
 '50',
 '54',
 '5lb',
 '70',
 '70000',
 '70s',
 '80',
 '80s',
 '85',
 '8pm',
 '90',
 '95',
 '99',
 'aailiyah',
 'abandoned',
 'ability',
 'about',
 'above',
 'abroad',
 'absolute',
 'absolutely',
 'absolutley',
 'abstruse',
 'abysmal',
 'academy',
 'accents',
 'accessible',
 'accident',
 'acclaimed',
 'accolades',
 'accommodations',
 'accomodate',
 'accountant',
 'accurate',
 'accurately',
 'accused',
 'ache',
 'achievement',
 'achille',
 'ackerman',
 'acknowledged',
 'across',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'actresses',
 'actual',
 'actually',
 'adams',
 'adaptation',
 'add',
 'added',
 'addition',
 'admins',
 'admiration',
 'admitted',
 'adorable',


In [10]:
# Printing the sparse matrix lists only its stored entries, as "(row, column)  count".
print (reviews)

  (0, 2523)	1
  (0, 296)	1
  (1, 2523)	1
  (1, 1631)	1
  (2, 4206)	1
  (2, 2223)	1
  (2, 3745)	1
  (2, 2760)	1
  (3, 2523)	1
  (3, 1631)	1
  (3, 902)	1
  (3, 1992)	1
  (4, 2523)	1
  (4, 3678)	1
  (4, 161)	1
  (4, 3720)	1
  (4, 3713)	1
  (4, 4082)	1
  (4, 2052)	1
  (4, 2465)	1
  (5, 2223)	1
  (5, 161)	1
  (5, 3720)	1
  (5, 3534)	1
  (5, 529)	1
  :	:
  (1747, 1997)	1
  (1747, 1910)	1
  (1747, 4184)	1
  (1747, 1214)	1
  (1748, 296)	1
  (1748, 1301)	1
  (1749, 161)	1
  (1749, 2553)	1
  (1749, 3786)	1
  (1749, 1910)	1
  (1749, 130)	2
  (1749, 159)	1
  (1749, 2578)	1
  (1749, 1853)	1
  (1749, 4087)	1
  (1749, 2402)	1
  (1749, 2000)	1
  (1749, 1963)	1
  (1749, 1957)	1
  (1750, 1483)	1
  (1750, 3594)	1
  (1751, 2523)	1
  (1751, 1631)	1
  (1751, 1483)	1
  (1752, 3594)	1


In [11]:
# Dense view for inspection only: this materializes every cell of the sparse matrix.
reviews.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# (number of reviews, number of vocabulary terms)
reviews.shape

(1753, 4248)

### Build model

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# Seed the forest so that Restart Kernel -> Run All rebuilds the same model
# and therefore gives the same predictions and probabilities.
classifier = RandomForestClassifier(random_state=42)

# targets holds the sentiment label of every training review
targets = df['label'].values

# fit the random forest on the bag-of-words count matrix
classifier.fit(reviews, targets)

RandomForestClassifier()

In [15]:
def sentiment(input):
    """Print the predicted sentiment and class probabilities for review text.

    Uses the notebook-level ``vectorizer`` and ``classifier``, so both must
    already be fitted when this is called.

    Parameters
    ----------
    input : str or iterable of str
        One review, or several reviews. A bare string is treated as a single
        review rather than being handed to the vectorizer as-is (which it
        rejects). The parameter name shadows the ``input`` builtin inside
        this function; it is kept so existing callers continue to work.

    Returns
    -------
    None
        Results are printed: first the array of predicted labels, then for
        every review its verdict followed by its row of class probabilities.
    """
    documents = [input] if isinstance(input, str) else list(input)

    input_vect = vectorizer.transform(documents)
    predictions = classifier.predict(input_vect)
    probabilities = classifier.predict_proba(input_vect)

    print(predictions)
    # Decide per review: comparing the whole prediction array in a single
    # `if` raises ValueError as soon as more than one review is passed.
    for row, predicted_label in enumerate(predictions):
        if predicted_label > 0:
            print(" Positive review")
        else:
            print("Negative review")
        # Slice (not index) so each row keeps the 2-D layout that a
        # single-review call has always printed.
        print(probabilities[row:row + 1])

### Analyze sentiment

In [None]:
# Read one comment from the user and wrap it in a list, because sentiment()
# expects a collection of documents rather than a bare string.
comment = input("Comment = ")
comment_list = [comment]
sentiment(comment_list)