In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
dataset = pd.read_csv('reviews.csv',usecols=['review_id','review','rating','sentiment'])

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11321 entries, 0 to 11320
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review_id  11321 non-null  object
 1   review     11321 non-null  object
 2   rating     11321 non-null  int64 
 3   sentiment  10601 non-null  object
dtypes: int64(1), object(3)
memory usage: 353.9+ KB


# Data Preprocessing

### Droping reviews with missing sentiment

In [4]:
dataset = dataset.dropna()

### Cleaning reviews 

#### Converting emojis and emoticons to text

In [5]:
pip install emot --upgrade

Requirement already up-to-date: emot in e:\anacondauser\lib\site-packages (3.1)
Note: you may need to restart the kernel to use updated packages.


In [6]:
dataset.loc[7,'review']

'I’ve used Evernote daily since 2013 to journal, organize work streams and plan effective projects. I often joke that it is my external brain because the way I have it categorically organized encompasses most facets of my life. The flexible hierarchy available thru notes, notebooks and tags works well for me. Syncing across devices was also a game changer for me. Truthfully, my biggest fear is that Evernote would ever go away! Please don’t. Yours could forever, Evernote. 💌'

In [7]:
from emot.emo_unicode import UNICODE_EMOJI
def convert_emoji(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
    return text

In [8]:
dataset['review'] = dataset['review'].apply(convert_emoji)

In [9]:
dataset.loc[7,'review']

'I’ve used Evernote daily since 2013 to journal, organize work streams and plan effective projects. I often joke that it is my external brain because the way I have it categorically organized encompasses most facets of my life. The flexible hierarchy available thru notes, notebooks and tags works well for me. Syncing across devices was also a game changer for me. Truthfully, my biggest fear is that Evernote would ever go away! Please don’t. Yours could forever, Evernote. love_letter'

In [10]:
dataset.loc[128,'review']

'Hello, I’d like word in computer, you create a document, and you print it, I mean the basic. On mobile..nope. It’s like paint but blue themed. I mean there can be a lot of improvements and fixes. Also the app crashed my tablet like 3 times before even finishing the download. Ok other then that, I honestly didn’t like it. For a document it’s good you got the basics. But it isn’t enough like... ok maybe on my phone it might be bad and all that but i mean meh. But on the tablet (seeing the pictures in the App Store) it’s good I mean the same for computer but a little bit laggy and bugs and crashes. There can be a LOOT of différents if you could just work on it better.Ok. As an app. I can say meh. Not bad. Just less settings in the mob and tabs. But other then that crashes. Fix these and make the layout better :) also the app takes a lot to download and a lot to start it up. Make this a generator instead of a non deleting paper for like ever. Thanks for reading'

In [11]:
from emot.emo_unicode import EMOTICONS_EMO
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, "_".join(EMOTICONS_EMO[emot].replace(",","").split()))
    return text

In [12]:
dataset['review'] = dataset['review'].apply(convert_emoticons)

In [13]:
dataset.loc[128,'review']

'Hello, I’d like word in computer, you create a document, and you print it, I mean the basic. On mobile..nope. It’s like paint but blue themed. I mean there can be a lot of improvements and fixes. Also the app crashed my tablet like 3 times before even finishing the download. Ok other then that, I honestly didn’t like it. For a document it’s good you got the basics. But it isn’t enough like... ok maybe on my phone it might be bad and all that but i mean meh. But on the tablet (seeing the pictures in the App Store) it’s good I mean the same for computer but a little bit laggy and bugs and crashes. There can be a LOOT of différents if you could just work on it better.Ok. As an app. I can say meh. Not bad. Just less settings in the mob and tabs. But other then that crashes. Fix these and make the layout better Happy_face_or_smiley also the app takes a lot to download and a lot to start it up. Make this a generator instead of a non deleting paper for like ever. Thanks for reading'

#### Replace a sequence of repeated characters with two characters

In [14]:
dataset.iloc[-1,1]

"I love Monopoly and when this game is working it is soooooooooo much fun. But it freezes way too often. If it was simply a free game with in App Purchases, I would not mind at all. But I pay for the fastest fiber optics network around, and even with that connection I can't normally make it through an online game. Every time I try it freezes at some point and I can't continue playing. For a game you have to pay $4 just to play normally, I really think that's a factor that needs to be addressed. But Pass and Play is a fun way to pass time when you're traveling with someone or are waiting on an appointment, and playing the AI can still be a fun experience as well. But please for the price you charge get the online play feature tuned up. Even if you toned down the graphics to decrease the chance of it freezing I'm sure people would still love to play!"

In [15]:
pattern = re.compile(r'(.)\1*')  
def reduce_sequence_word(word):
     return ''.join([match.group()[:2] if len(match.group()) > 2 else match.group() for match in pattern.finditer(word)]) 
def reduce_sequence_review(review):
     return ' '.join([reduce_sequence_word(word) for word in review.split(' ')])  

In [16]:
dataset['review'] = dataset['review'].apply(reduce_sequence_review)

In [17]:
dataset.iloc[-1,1]

"I love Monopoly and when this game is working it is soo much fun. But it freezes way too often. If it was simply a free game with in App Purchases, I would not mind at all. But I pay for the fastest fiber optics network around, and even with that connection I can't normally make it through an online game. Every time I try it freezes at some point and I can't continue playing. For a game you have to pay $4 just to play normally, I really think that's a factor that needs to be addressed. But Pass and Play is a fun way to pass time when you're traveling with someone or are waiting on an appointment, and playing the AI can still be a fun experience as well. But please for the price you charge get the online play feature tuned up. Even if you toned down the graphics to decrease the chance of it freezing I'm sure people would still love to play!"

#### Removing non alphabetic words, stop words and  word stemming

In [18]:
import re
import nltk
nltk.download('stopwords')
#nltk.download('wordnet')
from nltk.corpus import stopwords
# This is stemming
from nltk.stem.porter import PorterStemmer
#this is lemmatization
#from nltk.stem import WordNetLemmatizer
stopwords_to_consider = []
stopwords_to_consider = ['on','off','no','nor','not',"don't",'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
def text_preprocessor(review):
    processed_r = re.sub('[^a-zA-z]',' ',review)
    processed_r = processed_r.lower()
    processed_r = processed_r.split()
    all_stopwords = [w for w in stopwords.words('english') if not w in stopwords_to_consider]
    # This is stemming
    ps = PorterStemmer()
    processed_r = [ps.stem(word) for word in processed_r if not word in set(all_stopwords)]
    #this is lemmatization
    #lemmatizer = WordNetLemmatizer()
    #processed_r = [lemmatizer.lemmatize(word) for word in processed_r if not word in set(all_stopwords)]
    return ' '.join(processed_r)
    

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
dataset['review'] = dataset['review'].apply(text_preprocessor)

In [20]:
dataset = dataset[dataset['review'].str.len() > 20]

In [21]:
dataset.head(5)

Unnamed: 0,review_id,review,rating,sentiment
0,c9274c0a-a120-4e09-816b-7a8ba3a16634,new version thing entir differ aesthet thing t...,3,positive
2,506230e3-cc98-4233-be40-89e52d53990c,lot peopl use word standard run problem use ev...,1,negative
3,69d44a5e-218f-4f55-8a99-6cca55d43ca1,origin skeptic on pay on todo list app come re...,5,positive
4,3d0b634c-d402-47e8-ba7d-bf6209fed826,use go app note take year problem write use re...,1,negative
5,7310f23f-06c2-447d-aafd-81b29765169d,use year tag capabl uniqu integr across platfo...,5,positive


In [22]:
X = dataset.iloc[:,1:-1].values
y = dataset.iloc[:,-1].values

In [23]:
X

array([['new version thing entir differ aesthet thing thing much minimalist thing seem lot ui bloat not quit sure design award came',
        3],
       ['lot peopl use word standard run problem use everi updat problem format everyth on page partli on page on part partli on page forth throughout page document tri call support reach someon bare speak english tell happen say chang someth document ye close document yesterday format correctli updat app today open wrong must middl night switch appl page thank peopl work realli hard make work realli great effort part not fun issu happen on macbook ipad point set someth exactli want simpl updat chang everyth nonsens',
        1],
       ['origin skeptic on pay on todo list app come realiz great invest abl creat mani differ header project categor differ class take no longer use physic planner app take care problem eas use simplic great function led rate app star definit worth price',
        5],
       ...,
       ['game great pass time line w

### Label encoding the dependent variable

In [24]:
y

array(['positive', 'negative', 'positive', ..., 'negative', 'positive',
       'negative'], dtype=object)

In [25]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [26]:
print(le.classes_)
print(y)

['negative' 'positive']
[1 0 1 ... 0 1 0]


### Creating the Bag of Words model

In [27]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer
preprocess = ColumnTransformer(transformers=[('vectorize', TfidfVectorizer(ngram_range=(1,2),max_features = 3000),0)],remainder='passthrough')
X = preprocess.fit_transform(X).toarray()

In [28]:
len(X[0,:])

3001

### Spliting dataset

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,random_state=1)

In [30]:
X_train


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       [0.        , 0.11219596, 0.        , ..., 0.        , 0.        ,
        4.        ],
       [0.        , 0.08591083, 0.08467555, ..., 0.        , 0.        ,
        2.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       [0.16847719, 0.        , 0.        , ..., 0.        , 0.        ,
        5.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        3.        ]])

In [31]:
y_train

array([1, 0, 0, ..., 1, 0, 0])

# Classification 

### Training model - Naive Bayes

In [32]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB()

### Result evaluation

In [33]:
y_pred = classifier.predict(X_test)

##### Accuracy and confustion matrix

In [34]:
from sklearn.metrics import accuracy_score,confusion_matrix
cm =confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print(cm)
print("The accuracy of model is: {:0.2f}".format(accuracy*100))

[[899 431]
 [534 784]]
The accuracy of model is: 63.56


##### KFold Cross Validation

In [35]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("The Average Accuracy of model is: {:.2f} %".format(accuracies.mean()*100))
print("With a Standard Deviation of: {:.2f} %".format(accuracies.std()*100))

The Average Accuracy of model is: 63.49 %
With a Standard Deviation of: 1.84 %


#### Precision recall and f-measure

In [36]:
from sklearn.metrics import precision_score,recall_score,f1_score
p_score = precision_score(y_test,y_pred)
r_score = recall_score(y_test,y_pred)
f_score = f1_score(y_test,y_pred)
print("The precision of model is: {:0.2f}".format(p_score*100))
print("The recall of model is: {:0.2f}".format(r_score*100))
print("The f-measure of model is: {:0.2f}".format(f_score*100))

The precision of model is: 64.53
The recall of model is: 59.48
The f-measure of model is: 61.90


### Training model - Kernel SVM

In [None]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

### Result evaluation

##### Accuracy and confustion matrix

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
cm =confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print(cm)
print("The accuracy of model is: {:0.2f}".format(accuracy*100))

##### KFold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("The Average Accuracy of model is: {:.2f} %".format(accuracies.mean()*100))
print("With a Standard Deviation of: {:.2f} %".format(accuracies.std()*100))

#### Precision recall and f-measure

In [None]:
from sklearn.metrics import precision_score,recall_score,f1_score
p_score = precision_score(y_test,y_pred)
r_score = recall_score(y_test,y_pred)
f_score = f1_score(y_test,y_pred)
print("The precision of model is: {:0.2f}".format(p_score*100))
print("The recall of model is: {:0.2f}".format(r_score*100))
print("The f-measure of model is: {:0.2f}".format(f_score*100))