In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import datetime as dt

%matplotlib inline

In [3]:
df = pd.read_csv('./data/df_with_tags.gz')

In [4]:
df['Review_Date'] = df['Review_Date'].apply(lambda x: dt.datetime.strptime(x,'%Y-%m-%d'))

In [5]:
df['Review_Month'] = df.Review_Month.apply(lambda x: str(x))

### Config

In [6]:
# 4 categories

tmp = []
for i in df.Diff:
    if i < -2:
        tmp.append('Worse')
    elif i < -0.5:
        tmp.append('Bad')
    elif i < 0.5:
        tmp.append('Average')
    elif i >= 0.5:
        tmp.append('Good')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()/len(df.Diff_Recode)

Good       0.477357
Average    0.231899
Bad        0.181239
Worse      0.109505
Name: Diff_Recode, dtype: float64

In [86]:
# 3 categories

tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i < 0.6:
        tmp.append('Average')
    elif i >= 0.6:
        tmp.append('Better')

df['Diff_Recode'] = tmp
df.Diff_Recode.value_counts()

Better     216891
Average    157623
Bad        141224
Name: Diff_Recode, dtype: int64

In [110]:
# 2 categories

tmp = []
for i in df.Diff:
    if i < -0.6:
        tmp.append('Bad')
    elif i >= -0.6:
        tmp.append('Good')

df['Diff_Recode'] = tmp

In [230]:
y_col = 'Diff_Recode'
x_col = ['Review_Month', 'City', 'Pet', 'Purpose', 'Whom', 'Room_Recode', 'Nationality_Recode', 'Length_Recode',
         'Average_Score', 'Total_Number_of_Reviews_Reviewer_Has_Given']

### Balance Categories and UK

In [231]:
nationalities = list(df.Nationality_Recode.value_counts().index)

In [232]:
balanced1 = None

for i in nationalities:
    tmp = df[df.Nationality_Recode == i]
    n = 30000
    if len(tmp) < 30000:
        n = len(tmp)
    tmp = tmp.sample(n)
    balanced1 = pd.concat([balanced1, tmp])

In [233]:
minclass = np.min(balanced1.Diff_Recode.value_counts())
classes = list(balanced1.Diff_Recode.value_counts().index)
minclass, classes

(60144, ['Good', 'Bad'])

In [234]:
balanced2 = None

for i in classes:
    tmp = balanced1[balanced1.Diff_Recode == i].sample(minclass)
    balanced2 = pd.concat([balanced2, tmp])

### Start

In [245]:
df2 = balanced2.sample(n=50000, random_state=1)

In [246]:
X = df2[x_col]
y = df2[y_col]

In [247]:
X = X.fillna('Not Available')

In [248]:
X = pd.get_dummies(X, prefix_sep='_', drop_first=True)

In [249]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

### Gradient Boosting

In [240]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix

clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.567, 0.12929821033581335)

In [251]:
confusion_matrix(pred, y_test)

array([[2880, 1954],
       [2098, 3068]], dtype=int64)

In [252]:
pred = clf.predict(X_train)
accuracy_score(pred, y_train), cohen_kappa_score(pred, y_train)

(0.5864, 0.17275504957750643)

### Random Forest

In [243]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.569, 0.1354062186559679)

### XGBoost

In [211]:
import xgboost as xgb

clf = xgb.XGBClassifier(objective="binary:logistic", n_estimators = 3000)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.525, 0.050000000000000044)

### Logistic Regresion

In [250]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver='lbfgs', max_iter=700)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(pred, y_test), cohen_kappa_score(pred, y_test)

(0.5948, 0.1894815994720508)