# Sentiment Analysis 

Reference : https://medium.com/analytics-vidhya/sentiment-analysis-on-amazon-reviews-using-tf-idf-approach-c5ab4c36e7a1

In [None]:
import pandas as pd
import numpy as np
import nltk
import re

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Download the NLTK resources this notebook relies on: the stop-word list,
# the punkt tokenizer models (for nltk.word_tokenize) and the WordNet data
# (wordnet + omw-1.4, for the WordNet lemmatizer).
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Load the review sample into a DataFrame.
# NOTE(review): ISO-8859-1 maps every byte to exactly one character, so decoding
# never fails, but typographic punctuation stored as cp1252/UTF-8 bytes will not
# come out as the intended characters -- confirm the file's real encoding.
df = pd.read_csv('sample_text_rating_Merged.csv', encoding='ISO-8859-1')

In [None]:
df.head()

Unnamed: 0,comment,rating
0,parking is easy and the parking fee is reasonable,5
1,this is one of the best aquarium,5
2,i was blown away by the exhibits.,5
3,everything was so cool,4
4,definitely worth seeing,4


In [None]:
df.describe()

Unnamed: 0,rating
count,8547.0
mean,2.921727
std,1.747541
min,1.0
25%,1.0
50%,2.0
75%,5.0
max,5.0


In [None]:
df.rating.value_counts()

5    3263
1    2638
2    2233
4     314
3      99
Name: rating, dtype: int64

## Pre-Processing

In [None]:
# Label reviews as positive (1) or negative (0) based on the star rating.
df['rating']=df['rating'].astype(int) # convert the rating column to int
# Optional: drop the neutral 3-star reviews first with  df = df[df['rating'] != 3]
# As written, 3-star reviews are kept and end up in the negative class (rating < 4).
df['label']=np.where(df['rating']>=4,1,0) # 1 = positive, 0 = negative

In [None]:
# Lower-case every review and normalise whitespace to single spaces.
# str() guards against non-string cells in the comment column.
df['pre_process'] = df['comment'].apply(
    lambda review: " ".join(word.lower() for word in str(review).split())
)

### Expand contractions in the reviews

For example, "it won't be" is converted to "it will not be".

In [None]:
def contractions(s):
    """Expand common English contractions (e.g. "won't" -> "will not").

    Both the straight apostrophe (') and the typographic one (U+2019) are
    recognised. The previous patterns matched only the typographic form, so
    text typed with a plain ' passed through unchanged.

    A contraction suffix is expanded only when it directly follows a word
    character and ends at a word boundary, so quoted words such as 'show'
    are left untouched.

    Matching is case-sensitive; the notebook lower-cases the text before
    calling this function. Possessives ("john's") are expanded like
    contractions ("john is"), as before.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        ``s`` with the recognised contractions expanded.
    """
    apos = "['\u2019]"  # straight or typographic apostrophe
    rules = (
        # Irregular stems run before the generic n't rule, which would
        # otherwise produce "wo not" / "ca not".
        (r"\bwon" + apos + r"t\b", "will not"),
        (r"\bwouldn" + apos + r"t\b", "would not"),
        (r"\bcouldn" + apos + r"t\b", "could not"),
        (r"(?<=\w)" + apos + r"d\b", " would"),
        (r"\bcan" + apos + r"t\b", "can not"),
        (r"(?<=\w)n" + apos + r"t\b", " not"),
        (r"(?<=\w)" + apos + r"re\b", " are"),
        (r"(?<=\w)" + apos + r"s\b", " is"),
        (r"(?<=\w)" + apos + r"ll\b", " will"),
        # Catches misspellings such as "would't" that lack the "n".
        (r"(?<=\w)" + apos + r"t\b", " not"),
        (r"(?<=\w)" + apos + r"ve\b", " have"),
        (r"(?<=\w)" + apos + r"m\b", " am"),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s
# Expand contractions in every (already lower-cased) review
df['pre_process'] = df['pre_process'].apply(contractions)

## Remove the non-alphabetic characters and extra spaces in between

In [None]:
# Tokenize each review, strip every non-letter character from the tokens and
# drop tokens that end up empty (punctuation, numbers), so the re-joined text
# contains single spaces only and no leading/trailing blanks.
df['pre_process'] = df['pre_process'].apply(
    lambda text: " ".join(
        letters
        for letters in (re.sub('[^A-Za-z]+', '', token) for token in nltk.word_tokenize(text))
        if letters
    )
)

In [None]:
# Remove the stop words
# A set gives constant-time membership tests; the list returned by NLTK
# would be scanned linearly for every word of every review.
stop = set(stopwords.words('english'))
df['pre_process'] = df['pre_process'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop)
)

## Perform lemmatization using the wordnet lemmatizer


In [None]:
from nltk.stem import WordNetLemmatizer

# Replace every token by its WordNet lemma (lemmatize() is called without a
# part-of-speech argument) and re-join the tokens with single spaces.
lemmatizer = WordNetLemmatizer()
df['pre_process'] = df['pre_process'].apply(
    lambda text: " ".join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
)

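## Computing IDF
$\mathrm{idf}(t) = \log\left(\dfrac{n}{\mathrm{df}(t)}\right) + 1$, where $n$ is the number of documents and $\mathrm{df}(t)$ is the number of documents containing the term $t$.

Note: scikit-learn's `TfidfVectorizer` uses `smooth_idf=True` by default, which computes $\mathrm{idf}(t) = \log\left(\dfrac{1 + n}{1 + \mathrm{df}(t)}\right) + 1$ instead.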

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the pre-processed reviews for testing; the fixed seed keeps
# the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['pre_process'],
    df['label'],
    test_size=0.25,
    random_state=30,
)
print("Train: ", X_train.shape, Y_train.shape, "Test: ", (X_test.shape, Y_test.shape))

Train:  (6410,) (6410,) Test:  ((2137,), (2137,))


## Using TF*IDF Vectorizer

In [None]:
print("TFIDF Vectorizer……")
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training reviews only, then
# reuse that fitted vectorizer to encode the test reviews.
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

TFIDF Vectorizer……


## We will try SVM and Logistic Regression 

### SVM

In [None]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [None]:
# Fitting the Training data into model
clf.fit(tf_x_train,Y_train)

LinearSVC(random_state=0)

In [None]:
# Predicting the test data
y_test_pred = clf.predict(tf_x_test)

## Analyzing results

In [None]:
from sklearn.metrics import classification_report
report = classification_report(Y_test, y_test_pred, output_dict = True)

In [None]:
report

{'0': {'precision': 0.9169303797468354,
  'recall': 0.9407467532467533,
  'f1-score': 0.9286858974358974,
  'support': 1232},
 '1': {'precision': 0.9163802978235968,
  'recall': 0.8839779005524862,
  'f1-score': 0.8998875140607424,
  'support': 905},
 'accuracy': 0.9167056621431914,
 'macro avg': {'precision': 0.9166553387852161,
  'recall': 0.9123623268996197,
  'f1-score': 0.9142867057483199,
  'support': 2137},
 'weighted avg': {'precision': 0.9166974250718093,
  'recall': 0.9167056621431914,
  'f1-score': 0.9164900448600831,
  'support': 2137}}

### So by using an SVM classifier, we got an accuracy of 91.67 %

NB : This is only for Positive / Negative classification. Not for the star ratings


In [None]:
# Multi-class experiment: predict the full 1-5 star rating instead of the binary label.
# Features are character n-grams of length 1 to 5, limited to 20,000 features.
tfidf = TfidfVectorizer(max_features = 20000, ngram_range=(1,5),analyzer = 'char')

In [None]:
# Features come from the raw comments (not the pre-processed text); targets are the star ratings.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test
# split that follows, so test-set vocabulary/IDF statistics leak into the features.
# Consider splitting first and fitting on the training portion only.
X = tfidf.fit_transform(df['comment'])
y=df['rating']

In [None]:
X.shape , y.shape

((8547, 20000), (8547,))

In [None]:
X_train,X_test,y_train, y_test = train_test_split(X,y,test_size = 0.2, random_state = 0)

In [None]:
X_train.shape

(6837, 20000)

In [None]:
# Baseline multi-class SVM with default hyper-parameters.
# random_state is fixed (as for the binary model earlier) so that the solver's
# data shuffling, and therefore the reported scores, are reproducible.
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)

LinearSVC()

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.60      0.68      0.64       516
           2       0.57      0.51      0.54       461
           3       1.00      0.05      0.10        19
           4       0.27      0.07      0.11        57
           5       0.89      0.94      0.92       657

    accuracy                           0.71      1710
   macro avg       0.67      0.45      0.46      1710
weighted avg       0.70      0.71      0.69      1710



Accuracy is only 71%. Moreover, the scores for the negative ratings (1 and 2 stars) are mediocre, and those for the rare 3- and 4-star classes are poor.
The dataset is imbalanced, so accuracy may not be the right metric to measure the performance.
Alternatively, we would have to balance the dataset.
LinearSVC has a `class_weight` parameter, which can help improve the precision, recall and F1 score of the under-represented star ratings.

In [None]:
# Re-train with balanced class weights so that the rare ratings carry more
# weight in the loss.
# C=20 is a large value: in LinearSVC the regularization strength is inversely
# proportional to C, so this relaxes regularization rather than adding it.
# max_iter is raised to give the solver room to converge; random_state fixes
# the solver's data shuffling so the result is reproducible.
clf = LinearSVC(C=20, class_weight='balanced', max_iter=1200000, random_state=0)
clf.fit(X_train, y_train)

LinearSVC(C=20, class_weight='balanced', max_iter=1200000)

The C parameter tells the SVM optimization how much you want to avoid misclassifying each training example. For large values of C, the optimization will choose a smaller-margin hyperplane if that hyperplane does a better job of getting all the training points classified correctly. Conversely, a very small value of C will cause the optimizer to look for a larger-margin separating hyperplane, even if that hyperplane misclassifies more points. For very tiny values of C, you should get misclassified examples, often even if your training data is linearly separable.

In [None]:
y_pred = clf.predict(X_test)

In [None]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.57      0.63      0.60       516
           2       0.52      0.48      0.50       461
           3       0.43      0.16      0.23        19
           4       0.21      0.21      0.21        57
           5       0.89      0.90      0.89       657

    accuracy                           0.67      1710
   macro avg       0.53      0.47      0.49      1710
weighted avg       0.67      0.67      0.67      1710



In [None]:
X = 'This park is really bad. I do not like it.'
vec = tfidf.transform([X])
clf.predict(vec)

array([1])

In [None]:
X = 'This park is really good. I enjoyed quite a lot.'
vec = tfidf.transform([X])
clf.predict(vec)

array([5])

In [None]:
X = 'This park is really good. But I did not find place to sit.'
vec = tfidf.transform([X])
clf.predict(vec)

In [None]:
X = 'This park is really good. But it was too crowded.'
vec = tfidf.transform([X])
clf.predict(vec)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# 'saga' is a stochastic solver that shuffles the data, so the seed is fixed
# to make the fitted model and its scores reproducible.
clf = LogisticRegression(max_iter=1000, solver='saga', random_state=0)

In [None]:
clf.fit(tf_x_train,Y_train)

LogisticRegression(max_iter=1000, solver='saga')

In [None]:
y_test_pred=clf.predict(tf_x_test)

In [None]:
from sklearn.metrics import classification_report
report=classification_report(Y_test, y_test_pred,output_dict=True)

In [None]:
report

{'0': {'precision': 0.8957854406130268,
  'recall': 0.9488636363636364,
  'f1-score': 0.921560898699251,
  'support': 1232},
 '1': {'precision': 0.9242788461538461,
  'recall': 0.8497237569060774,
  'f1-score': 0.8854346574553827,
  'support': 905},
 'accuracy': 0.9068788020589612,
 'macro avg': {'precision': 0.9100321433834364,
  'recall': 0.8992936966348568,
  'f1-score': 0.9034977780773168,
  'support': 2137},
 'weighted avg': {'precision': 0.9078521378589048,
  'recall': 0.9068788020589612,
  'f1-score': 0.906261765182311,
  'support': 2137}}

In [None]:
print(classification_report(Y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.90      0.95      0.92      1232
           1       0.92      0.85      0.89       905

    accuracy                           0.91      2137
   macro avg       0.91      0.90      0.90      2137
weighted avg       0.91      0.91      0.91      2137



In [None]:
# The logistic-regression model was trained on the word-level features produced
# by `vectorizer`, so the sample must be encoded with that same vectorizer.
# Encoding it with the character n-gram `tfidf` yields a different feature
# space and makes predict() raise ValueError.
# NOTE(review): the sample is not run through the notebook's pre-processing
# steps (contractions, stop words, lemmatization) before being vectorized.
X = 'This park is really good. I enjoyed quite a lot.'
vec = vectorizer.transform([X])
clf.predict(vec)

ValueError: ignored