In [None]:
#importing the required packages

import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
nltk.download('stopwords')
from sklearn.metrics import precision_recall_fscore_support as score
import warnings
warnings.filterwarnings("ignore")

In [None]:
#reading in the data with pandas library

df = pd.read_csv('/content/Reviews.csv', sep ='|')

**EXPLORATORY DATA ANALYSIS AND DATA CLEANING**

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.dropna(axis=0, how = 'any', subset = ['buyerTranslationFeedback'], inplace=True)

In [None]:
df.info()

In [None]:
df.drop(axis = 1, columns = ['buyerName','buyerCountry','buyerFeedback','buyerProductFeedBack','evaluationId','responsiveness','downVoteCount','upVoteCount','evalData','warrantyService','functionality','status'], inplace = True)

In [None]:
df['reviews'] = df['buyerTranslationFeedback']
df.drop(columns = ['buyerTranslationFeedback'], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
#defining a function to clean text, it will remove punctuations, tokenize the text and remove stopwords

def review_cleaner_string(text):
  text_nopunct = ''.join([char for char in text if char not in string.punctuation])
  text_tokenized = re.split('\W+', text_nopunct.lower())
  text_nostop = ' '.join([word for word in text_tokenized if word not in nltk.corpus.stopwords.words('english') or word == 'not'])

  #stemming words to reduce words to their base word.
  ps = nltk.PorterStemmer()
  text_stemmed = ' '.join([ps.stem(word) for word in text_nostop.split()])

  return text_stemmed


In [None]:
def review_cleaner_token(text):
  text_nopunct = ''.join([char for char in text if char not in string.punctuation])
  text_tokenized = re.split('\W+', text_nopunct.lower())
  text_nostop = [word for word in text_tokenized if word not in nltk.corpus.stopwords.words('english') or word == 'not']

  #stemming words to reduce words to their base word.
  ps = nltk.PorterStemmer()
  text_stemmed = [ps.stem(word) for word in text_nostop]

  return text_stemmed

In [None]:
#testing cleaning function on random string of characters

In [None]:
review_cleaner_string("hi, i'm not as bad as you think, running run runner")

In [None]:
review_cleaner_token("hi, i'm not as bad as you think, running run runner")

In [None]:
#applying the function to the reviews
df['stemmed_reviews'] = df['reviews'].apply(lambda x: review_cleaner_string(x))

In [None]:
#applying the function to the reviews
df['stemmed_reviews_tokens'] = df['reviews'].apply(lambda x: review_cleaner_token(x))

In [None]:
df.head()

In [None]:
#feature engineering

In [None]:
df['Evaluation'] = df['Evaluation']/20

In [None]:
df['Evaluation'].unique()

In [None]:
df['Evaluation'] = df['Evaluation'].astype(int)

In [None]:
def polarize(x):
  if x > 3:
    return 'positive'
  elif x == 3:
    return 'neutral'
  else:
    return 'negative'


In [None]:
df['polarity'] = df['Evaluation'].apply(lambda x: polarize(x))

In [None]:
df['polarity'].unique()

In [None]:
df.head()

In [None]:
#data visualization

sns.countplot(df, x = 'polarity', palette = 'Set2').set(title = 'No of Reviews for each Polarity Rating')

In [None]:
df.drop(columns = ['Evaluation', 'reviews'], axis = 1, inplace = True)

In [None]:
df.head()

In [None]:
from wordcloud import WordCloud

# Flatten the list of word lists into a single list of words
all_stemmed = ' '.join([word for word in df['stemmed_reviews']])
print(all_stemmed)

# Join the words into a single string

#Create a word cloud from the lemmas
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_stemmed)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Feedback Lemmatized')
plt.show()

In [None]:
counti = len(df.loc[df['polarity']== 'positive'])
counti

In [None]:
df_positive = df.loc[df['polarity']== 'positive'].sample(594, replace=False)
df_neutral = df.loc[df['polarity']== 'neutral']
df_negative = df.loc[df['polarity']== 'negative']

In [None]:
df_neutral = df.loc[df['polarity']== 'neutral'].sample(594, replace=True)
df_negative = df.loc[df['polarity']== 'negative'].sample(594, replace=True)


In [None]:
df = pd.concat([df_positive, df_neutral, df_negative])

In [None]:
df.head()

In [None]:
# Do encoding for Polarity ratings
df.loc[df['polarity']== 'negative', 'polarity'] = 0
df.loc[df['polarity']== 'neutral', 'polarity'] = 1
df.loc[df['polarity']== 'positive', 'polarity'] = 2

In [None]:
# convert the datatype of Polarity_Rating to integer
df['polarity'] = df['polarity'].astype('int64')

In [None]:
df

In [None]:
plt.figure(figsize=(8, 5))  # Set figure size
sns.countplot(data=df, x='polarity', palette='viridis')  # Countplot
plt.title('Count of Unique Values in polarity')  # Title
plt.xlabel('polarity')  # X-axis label
plt.ylabel('Count')  # Y-axis label
plt.show()

In [None]:
X = df['stemmed_reviews_tokens']
y = df["polarity"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
, stratify = y)

In [None]:
tfidf_vect = TfidfVectorizer(analyzer = review_cleaner_token)
tfidf_vect_fit = tfidf_vect.fit(X_train)
Train_X_Tfidf = tfidf_vect_fit.transform(X_train)
Test_X_Tfidf = tfidf_vect_fit.transform(X_test)


In [236]:
# Build a random forest model
import time

In [237]:
rf = RandomForestClassifier(n_estimators = 50, max_depth= 20, n_jobs= -1,  random_state = 42)


In [238]:
start_time = time.time()
rf_model = rf.fit(Train_X_Tfidf, y_train)
end_time = time.time()
fit_time = (end_time - start_time)

In [None]:
# sorted(zip(rf_model.feature_importances_), reverse = True)[0:10]

In [239]:
start_time = time.time()
y_pred = rf_model.predict(Test_X_Tfidf)
end_time = time.time()
pred_time = (end_time - start_time)

In [240]:
precision, recall, fscore, support = score(y_test, y_pred, pos_label=2, average='weighted')

In [241]:
print('fit_time: {}/ pred_time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time, 3),
                                                                                          round(pred_time, 3),
                                                                                          round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred), 3)))

fit_time: 0.229/ pred_time: 0.027 ---- Precision: 0.905 / Recall: 0.905 / Accuracy: 0.905


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [242]:
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11, learning_rate = 0.1)

start_time = time.time()
gb_model = gb.fit(Train_X_Tfidf, y_train)
end_time = time.time()
fit_time = (end_time - start_time)

In [243]:
start_time = time.time()
y_pred_gb = gb_model.predict(Test_X_Tfidf)
end_time = time.time()
pred_time = (end_time - start_time)

In [244]:
precision, recall, fscore, support = score(y_test, y_pred_gb, pos_label=2, average='weighted')

In [245]:
print('fit_time: {}/ pred_time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(round(fit_time, 3),
                                                                                          round(pred_time, 3),
                                                                                          round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred_gb==y_test).sum() / len(y_pred), 3)))

fit_time: 1.844/ pred_time: 0.013 ---- Precision: 0.942 / Recall: 0.941 / Accuracy: 0.941


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
#building a custom grid search

In [None]:
rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
         'max_depth': [10, 20, 30, 60, 90, None]}

In [None]:
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(Train_X_Tfidf, y_train)

In [None]:
cv = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

In [None]:
cv

In [None]:
def train_RF(n_est, depth):
  rf = RandomForestClassifier(n_estimators = n_est, max_depth= depth, n_jobs= -1,  random_state = 42)
  rf_model = rf.fit(Train_X_Tfidf, y_train)
  y_pred = rf_model.predict(Test_X_Tfidf)
  precision, recall, fscore, support = score(y_test, y_pred, pos_label=2, average='weighted')
  print('Est: {}/ Depth: {} ----Precision: {} / Recall: {} / Accuracy: {}'.format(n_est, depth, round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [None]:
for n_est in [10, 50, 100]:
  for depth in [10, 20, 30, None]:
    train_RF(n_est, depth)

In [None]:
def train_GB(est, max_depth, lr):
  gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate = lr)
  gb_model = gb.fit(Train_X_Tfidf, y_train)
  y_pred_gb = gb_model.predict(Test_X_Tfidf)
  precision, recall, fscore, support = score(y_test, y_pred_gb, pos_label=2, average='weighted')
  print('Est: {}/ Depth: {}/ LR: {} ----Precision: {} / Recall: {} / Accuracy: {}'.format(est, max_depth, lr, round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred_gb==y_test).sum() / len(y_pred), 3)))

In [None]:
for n_est in [50, 100, 150]:
  for max_depth in [3, 7, 11, 15]:
    for lr in [0.01, 0.1, 1]:
      train_GB(n_est, max_depth, lr)