In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [4]:
df_reviews = pd.read_csv('bluemovement.csv')
df_reviews.head()

Unnamed: 0.1,Unnamed: 0,review_title,review_date,review_text,page_number,rating,title_text_concatenated,title_text_translated
0,0,It was a breeze booking the Washing…,2022-06-11,It was a breeze booking the Washing machine fr...,1,4,It was a breeze booking the Washing machine fr...,It was a breeze booking the Washing machine fr...
1,1,Alles is subliem afgeleverd .Perfect,2022-05-28,Alles is subliem afgeleverd .,1,5,Alles is subliem afgeleverd . Alles is subliem...,Everything was delivered perfectly . Everythin...
2,2,Very bad company,2022-05-27,Very bad company. Paying way too much for even...,1,1,Very bad company. Paying way too much for even...,Very bad company. Paying way too much for even...
3,3,I really enjoyed the service,2022-05-18,"I really enjoyed the service, quite fast and p...",1,5,"I really enjoyed the service, quite fast and p...","I really enjoyed the service, quite fast and p..."
4,4,Very fast service,2022-05-13,"Very fast service , the drivers were very fast...",1,5,"Very fast service , the drivers were very fast...","Very fast service , the drivers were very fast..."


In [8]:
# Classify ratings
df_reviews.loc[df_reviews['rating'] > 3, 'liked'] = 1
df_reviews.loc[df_reviews['rating'] <= 3, 'liked'] = 0
df_reviews['liked'] = df_reviews['liked'].astype(int)
df_reviews.head()

Unnamed: 0.1,Unnamed: 0,review_title,review_date,review_text,page_number,rating,title_text_concatenated,title_text_translated,liked
0,0,It was a breeze booking the Washing…,2022-06-11,It was a breeze booking the Washing machine fr...,1,4,It was a breeze booking the Washing machine fr...,It was a breeze booking the Washing machine fr...,1
1,1,Alles is subliem afgeleverd .Perfect,2022-05-28,Alles is subliem afgeleverd .,1,5,Alles is subliem afgeleverd . Alles is subliem...,Everything was delivered perfectly . Everythin...,1
2,2,Very bad company,2022-05-27,Very bad company. Paying way too much for even...,1,1,Very bad company. Paying way too much for even...,Very bad company. Paying way too much for even...,0
3,3,I really enjoyed the service,2022-05-18,"I really enjoyed the service, quite fast and p...",1,5,"I really enjoyed the service, quite fast and p...","I really enjoyed the service, quite fast and p...",1
4,4,Very fast service,2022-05-13,"Very fast service , the drivers were very fast...",1,5,"Very fast service , the drivers were very fast...","Very fast service , the drivers were very fast...",1


In [9]:
import re # library to clean data
import nltk # Natural Language Tool Kit
 
nltk.download('stopwords')

from nltk.corpus import stopwords # to remove stopword
from nltk.stem.porter import PorterStemmer # for Stemming propose

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\can_a\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Initialize empty array to append clean text
corpus = []
 
# (reviews) to clean
for i in range(len(df_reviews)):
     
    # column : "title_text_translated", row ith
    review = re.sub('[^a-zA-Z]', ' ', df_reviews['title_text_translated'][i])
     
    # convert all cases to lower cases
    review = review.lower()
     
    # split to array(default delimiter is " ")
    review = review.split()
     
    # creating PorterStemmer object to take main stem of each word
    ps = PorterStemmer()
     
    # loop for stemming each word in string array at ith row   
    review = [ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))]
                 
    # rejoin all string array elements to create back into a string
    review = ' '.join(review) 
     
    # append each string to create array of clean text
    corpus.append(review)

In [13]:
corpus

['breez book wash machin blue movement bit sceptic begin howev perfectli deliv time staff fix machin home make sure understood oper machin breez book wash',
 'everyth deliv perfectli everyth deliv perfectli perfect',
 'bad compani pay way much even nd hand stuff crimin bad compani',
 'realli enjoy servic quit fast profession realli enjoy servic',
 'fast servic driver fast friendli call minut instal wash machin koolcast show everyth work fast servic',
 'great servic great product quick deliveri great servic great product quick',
 'happi experi far deliveri instal quick profession applianc met expect satisfi happi experi far',
 'far quit happi wash machin actual exceed expect give star deliveri quick smooth desir someth went wrong deliveri compani brought dryer week wait far quit happi wash',
 'good servic',
 'experienc great servic move old wash machin old place also deliv fridg within day notic gentlemen help carri set everyth nice easi interact great servic',
 'total bad inform disinf

In [17]:
df_reviews.iloc[:, 8].values # must be set to 'liked' binary column

array([1, 1, 0, ..., 1, 1, 1])

In [69]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
 
# To extract max 20 feature. "max_features" is attribute to experiment with to get better results
cv = CountVectorizer(max_features = 20)
 
# X contains corpus (dependent variable)
X = cv.fit_transform(corpus).toarray()
 
# y contains answers if review is positive or negative
y = df_reviews.iloc[:, 8].values

In [70]:
X, y

(array([[0, 0, 1, ..., 0, 2, 0],
        [0, 0, 2, ..., 0, 0, 0],
        [2, 2, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 1, 0, 0],
        [0, 0, 2, ..., 0, 2, 1],
        [0, 0, 0, ..., 1, 0, 0]], dtype=int64),
 array([1, 1, 0, ..., 1, 1, 1]))

In [82]:
# Splitting the dataset into the Training and Test set
from sklearn.model_selection import train_test_split

# experiment with "test_size" to get better results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [89]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier
 
# n_estimators can be said as number of trees, experiment with n_estimators to get better results
model = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
                             
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy')

In [90]:
# Predicting the Test set results
y_pred = model.predict(X_test)

y_pred

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [91]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
 
cm = confusion_matrix(y_test, y_pred)
 
cm

array([[ 39,   1],
       [  0, 378]], dtype=int64)