In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report,confusion_matrix

from sklearn.pipeline import Pipeline

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv('tourist_reviews.csv')

In [5]:
df['labels'].value_counts()

positive    5294
negative    2846
neutral      935
Name: labels, dtype: int64

In [4]:
df.head()

Unnamed: 0,reviews,labels
0,after i was going there on vacation and i was ...,negative
1,cleaning is often not taken care of,negative
2,corruption of the staff is on the rise if you ...,negative
3,nice place to visit with lots of greenery entr...,positive
4,i came back totally disappointed from the top ...,negative


In [5]:
df['reviews'].describe()

count       9075
unique      8574
top       boring
freq          17
Name: reviews, dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9075 entries, 0 to 9074
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   class    9075 non-null   int64 
 1   reviews  9075 non-null   object
dtypes: int64(1), object(1)
memory usage: 141.9+ KB


In [10]:
#Create a dataframe to store results
df_Results = pd.DataFrame(columns=['Model','Accuracy','Precision','Recall'])

In [19]:
# Creating a function to plot confusion matrix
def Plot_confusion_matrix(y_test, y_pred):
  cm = confusion_matrix(y_test, y_pred)
  print(cm)
  plt.clf()
  plt.show()

In [8]:
#Splitting the dataset 
x = df['reviews']
y = df['labels']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 225)

print('X_train :', len(X_train))
print('X_test  :', len(X_test))
print('y_train :', len(y_train))
print('y_test  :', len(y_test))

X_train : 7260
X_test  : 1815
y_train : 7260
y_test  : 1815


In [9]:
tvec = TfidfVectorizer()
X_train_tfidf = tvec.fit_transform(X_train)
X_test_tfidf = tvec.transform(X_test)

In [20]:
#logistic Regression
tvec = TfidfVectorizer()
clf2 = LogisticRegression(solver = "lbfgs")

model = Pipeline([('vectorizer',tvec),('classifier',clf2)])

model.fit(X_train, y_train)
#predicting test set results
y_pred = model.predict(X_test)

# finding Accuaracy 
accuracy_logistic = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
precision_logistic = metrics.precision_score(y_pred, y_test, average = 'weighted')
recall_logistic = metrics.recall_score(y_pred, y_test, average = 'weighted')

# Making the Confusion Matrix
print("Confusion Matrix")
Plot_confusion_matrix(y_test, y_pred)
print("classification Report")
print(classification_report(y_test, y_pred))

df_Results = df_Results.append(pd.DataFrame({'Model': 'Logistic Regression','Accuracy': accuracy_logistic,'Precision': precision_logistic,'Recall': recall_logistic}, index=[0]),ignore_index= True)

print(df_Results)

Confusion Matrix
[[ 562  149]
 [  54 1050]]


<Figure size 640x480 with 0 Axes>

classification Report
              precision    recall  f1-score   support

           0       0.91      0.79      0.85       711
           1       0.88      0.95      0.91      1104

    accuracy                           0.89      1815
   macro avg       0.89      0.87      0.88      1815
weighted avg       0.89      0.89      0.89      1815

                 Model  Accuracy  Precision    Recall
0  Logistic Regression  0.888154   0.896563  0.888154
1  Logistic Regression  0.888154   0.896563  0.888154


In [18]:
#print("Accuracy : ", accuracy_score(predictions, y_test))
#print("Precision : ", precision_score(predictions, y_test, average = 'weighted'))
#print("Recall : ", recall_score(predictions, y_test, average = 'weighted'))


[[ 562  149]
 [  54 1050]]


In [21]:
#Evaluating Decision Tree model with 'gini' & 'entropy'
criteria = ['gini', 'entropy'] 
scores = {} 

    
for c in criteria: 
    dt = DecisionTreeClassifier(criterion = c, random_state=42) 
    model1 = Pipeline([('vectorizer',tvec),('classifier',dt)])
    model1.fit(X_train, y_train)
    y_pred = model1.predict(X_test)
    test_score = model1.score(X_test, y_test)
    accuracy_tree = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
    precision_tree = metrics.precision_score(y_pred, y_test, average = 'weighted')
    recall_tree = metrics.recall_score(y_pred, y_test, average = 'weighted')
    print(c + " score: {0}" .format(test_score))
    print("Confusion Matrix")
    Plot_confusion_matrix(y_test, y_pred)
    print("classification Report")
    print(classification_report(y_test, y_pred))
    df_Results = df_Results.append(pd.DataFrame({'Model': 'Tree Model with {0} criteria'.format(c),'Accuracy': accuracy_tree,'Precision': precision_tree,'Recall': recall_tree}, index=[0]),ignore_index= True)
print(df_Results)
    
    

gini score: 0.790633608815427
Confusion Matrix
[[518 193]
 [187 917]]


<Figure size 640x480 with 0 Axes>

classification Report
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       711
           1       0.83      0.83      0.83      1104

    accuracy                           0.79      1815
   macro avg       0.78      0.78      0.78      1815
weighted avg       0.79      0.79      0.79      1815

entropy score: 0.7994490358126721
Confusion Matrix
[[529 182]
 [182 922]]


<Figure size 640x480 with 0 Axes>

classification Report
              precision    recall  f1-score   support

           0       0.74      0.74      0.74       711
           1       0.84      0.84      0.84      1104

    accuracy                           0.80      1815
   macro avg       0.79      0.79      0.79      1815
weighted avg       0.80      0.80      0.80      1815

                              Model  Accuracy  Precision    Recall
0               Logistic Regression  0.888154   0.896563  0.888154
1               Logistic Regression  0.888154   0.896563  0.888154
2     Tree Model with gini criteria  0.790634   0.790971  0.790634
3  Tree Model with entropy criteria  0.799449   0.799449  0.799449


In [22]:
#Random Forest model
# Create the model with 100 trees
RF_model = RandomForestClassifier(n_estimators=100, 
                            bootstrap = True,
                            max_features = 'sqrt', random_state=42)
model2 = Pipeline([('vectorizer',tvec),('classifier',RF_model)])

model2.fit(X_train, y_train)
#predicting test set results
y_pred = model2.predict(X_test)

# finding Accuaracy 
accuracy_rf = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
precision_rf = metrics.precision_score(y_pred, y_test, average = 'weighted')
recall_rf = metrics.recall_score(y_pred, y_test, average = 'weighted')

# Making the Confusion Matrix
print("Confusion Matrix")
Plot_confusion_matrix(y_test, y_pred)
print("classification Report")
print(classification_report(y_test, y_pred))

df_Results = df_Results.append(pd.DataFrame({'Model': 'Random Forest','Accuracy': accuracy_rf,'Precision': precision_rf,'Recall': recall_rf}, index=[0]),ignore_index= True)

print(df_Results)

Confusion Matrix
[[ 526  185]
 [  56 1048]]


<Figure size 640x480 with 0 Axes>

classification Report
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       711
           1       0.85      0.95      0.90      1104

    accuracy                           0.87      1815
   macro avg       0.88      0.84      0.86      1815
weighted avg       0.87      0.87      0.86      1815

                              Model  Accuracy  Precision    Recall
0               Logistic Regression  0.888154   0.896563  0.888154
1               Logistic Regression  0.888154   0.896563  0.888154
2     Tree Model with gini criteria  0.790634   0.790971  0.790634
3  Tree Model with entropy criteria  0.799449   0.799449  0.799449
4                     Random Forest  0.867218   0.882106  0.867218


In [23]:
example = ["The guided tour was fantastic. The tour guide was engaging and informative, and showed us a side of the city we wouldn't have seen on our own. I would highly recommend it."]

result = model2.predict(example)

print(result)

[1]
