In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error

In [3]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.utils import to_categorical

In [4]:
df = pd.read_csv("data/yelp.csv")

In [5]:
df.head(5)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.4+ KB


In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
stars,10000.0,3.7775,1.214636,1.0,3.0,4.0,5.0,5.0
cool,10000.0,0.8768,2.067861,0.0,0.0,0.0,1.0,77.0
useful,10000.0,1.4093,2.336647,0.0,0.0,1.0,2.0,76.0
funny,10000.0,0.7013,1.907942,0.0,0.0,0.0,1.0,57.0


In [8]:
df["stars"].value_counts()

4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [9]:
target = "stars"
X = df.drop(target, axis=1)
y = df[target]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
mapper = DataFrameMapper([
    (["cool"], StandardScaler()),
    (["useful"], StandardScaler()),
    (["funny"], StandardScaler())],df_out=True)

In [12]:
Z_train = mapper.fit_transform(X_train)

In [13]:
Z_test = mapper.transform(X_test)

In [14]:
model = LinearRegression().fit(Z_train,y_train)
print("LinearRegression train score is " + str(model.score(Z_train,y_train)))
print("LinearRegression test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

LinearRegression train score is 0.052481360212042194
LinearRegression test score is 0.006286116552875587
Mean squared error is 1.196116779560479


In [15]:
model = HuberRegressor().fit(Z_train,y_train)
print("HuberRegressor train score is " + str(model.score(Z_train,y_train)))
print("HuberRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

HuberRegressor train score is 0.025855914377079462
HuberRegressor test score is -0.008260813490992502
Mean squared error is 1.204839919324475


In [16]:
model = Lasso().fit(Z_train,y_train)
print("Lasso train score is " + str(model.score(Z_train,y_train)))
print("Lasso test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Lasso train score is 0.0
Lasso test score is -0.00010679663331658595
Mean squared error is 1.199958132602967


In [17]:
model = Ridge().fit(Z_train,y_train)
print("Ridge train score is " + str(model.score(Z_train,y_train)))
print("Ridge test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Ridge train score is 0.052481297267932714
Ridge test score is 0.006390922370938523
Mean squared error is 1.1960537013921875


In [18]:
model = ElasticNet().fit(Z_train,y_train)
print("ElasticNet train score is " + str(model.score(Z_train,y_train)))
print("ElasticNet test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

ElasticNet train score is 0.0
ElasticNet test score is -0.00010679663331658595
Mean squared error is 1.199958132602967


In [19]:
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.10836456144109231
RandomForestRegressor test score is 0.017613602354543345
Mean squared error is 1.1892798874494264




In [20]:
model = DecisionTreeRegressor().fit(Z_train,y_train)
print("DecisionTreeRegressor train score is " + str(model.score(Z_train,y_train)))
print("DecisionTreeRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

DecisionTreeRegressor train score is 0.11673772976284136
DecisionTreeRegressor test score is -0.0063123891430423296
Mean squared error is 1.203675203493104


In [21]:
model = AdaBoostRegressor().fit(Z_train,y_train)
print("AdaBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("AdaBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

AdaBoostRegressor train score is 0.031147327654680312
AdaBoostRegressor test score is 0.021888966560027475
Mean squared error is 1.1866891813800353


In [22]:
model = Sequential([
    Input(shape=(Z_train.shape[1],)),
    Dense(16, activation='elu'),
    Dropout(.05),
    Dense(8, activation='relu'),
    Dropout(.05),
    Dense(4, activation='selu'),
    Dropout(.05),
    Dense(1, activation='softmax')
])

In [23]:
model.compile(loss='mae', optimizer='adam', metrics=["accuracy"])

In [24]:
history = model.fit(Z_train, y_train,
                    validation_data=(Z_test, y_test),
                    epochs=2, batch_size=2,
                    verbose=2)

Train on 7500 samples, validate on 2500 samples
Epoch 1/2
7500/7500 - 7s - loss: 2.7744 - accuracy: 0.0757 - val_loss: 2.7868 - val_accuracy: 0.0724
Epoch 2/2
7500/7500 - 7s - loss: 2.7744 - accuracy: 0.0757 - val_loss: 2.7868 - val_accuracy: 0.0724
