In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from surprise import Reader, Dataset, SVD

# Load the raw interactions and cut the frame by row position:
# the first 8/10 for training, the next 1/10 for validation, the rest for test.
df = pd.read_csv('data/fit.csv')
n_rows = df.shape[0]
train_end = n_rows // 10 * 8
validation_end = n_rows // 10 * 9
train = df.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
test = df.iloc[validation_end:]

# SVD
# Fit surprise's SVD on the training rows only; the reader is told that
# ratings lie on a 2..10 scale.
col_names = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(2, 10))
data = Dataset.load_from_df(train[col_names], reader).build_full_trainset()
algo = SVD()
algo.fit(data)

# Estimate a rating for every (user, item) row of the whole frame, not just
# the training rows, and persist the estimates as an extra 'SVD' column.
svds = [
    algo.predict(user_id, item_id).est
    for user_id, item_id in zip(df['user_id'], df['item_id'])
]
df = df.assign(SVD=svds)
df.to_csv('data/fit_svd.csv', index=False)

In [None]:
# Reload the frame that carries the 'SVD' estimate column.
df = pd.read_csv('data/fit_svd.csv')

# Indicator (one-hot) columns for the three categorical attributes.
one_hot_rf, one_hot_bt, one_hot_cat = (
    pd.get_dummies(df[name]) for name in ('rented for', 'body type', 'category')
)

# The regression target is set aside before it is removed from the features.
ratings = df['rating']

# Drop the raw categoricals, the review date, the 'fit' label and the target,
# then append the indicator columns in the same order as before.
df = (
    df.drop(['rented for', 'body type', 'category', 'review date', 'fit', 'rating'], axis=1)
    .join(one_hot_rf)
    .join(one_hot_bt)
    .join(one_hot_cat)
)

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df.iloc[:train_end]
train_ratings = ratings.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
validation_ratings = ratings.iloc[train_end:validation_end]

In [None]:
# Score the raw SVD estimates on the validation slice (rows 8/10 .. 9/10).
validation_start = df.shape[0] // 10 * 8
validation_stop = df.shape[0] // 10 * 9
svd_validation = df['SVD'].iloc[validation_start:validation_stop]
mse = mean_squared_error(validation_ratings, svd_validation)
print('MSE of SVD:', mse, sep='\n')

In [None]:
# Ordinary least squares on the feature frame that includes the 'SVD' column,
# evaluated on the held-out validation slice.
reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of combining SVD and linear regression:', mse, sep='\n')

In [45]:
# Baseline: same feature preparation as the SVD variant, but starting from the
# raw file, so no 'SVD' column is present.
df = pd.read_csv('data/fit.csv')

# Indicator (one-hot) columns for the three categorical attributes.
one_hot_rf, one_hot_bt, one_hot_cat = (
    pd.get_dummies(df[name]) for name in ('rented for', 'body type', 'category')
)

# Keep the target aside, then strip every non-feature column in one call and
# append the indicator columns in the original order.
ratings = df['rating']
df = (
    df.drop(['rented for', 'body type', 'category', 'review date', 'fit', 'rating'], axis=1)
    .join(one_hot_rf)
    .join(one_hot_bt)
    .join(one_hot_cat)
)

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df.iloc[:train_end]
train_ratings = ratings.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
validation_ratings = ratings.iloc[train_end:validation_end]

# Fit and score ordinary least squares.
reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression:', mse, sep='\n')

MSE of linear regression:
1.9862645229992368


In [65]:
# Min-max scale each listed numeric attribute to [0, 1] using the column's own
# minimum and maximum over the whole file.
df = pd.read_csv('data/fit.csv')
attrs = ['bust size', 'cup size', 'weight', 'height', 'size', 'age', 'review length', 'number of excl']
for attr in attrs:
    lowest, highest = df[attr].min(), df[attr].max()
    scale = (df[attr] - lowest) / (highest - lowest)
    # Dropping and re-adding moves the scaled column to the end of the frame.
    df = df.drop(columns=[attr])
    df[attr] = scale
print(df)

        fit  user_id  item_id  rating     rented for          body type  \
0       fit   420272  2260466    10.0       vacation          hourglass   
1       fit   273551   153475    10.0          other  straight & narrow   
2       fit   909926   126335     8.0  formal affair               pear   
3       fit   151944   616682    10.0        wedding           athletic   
4       fit   734848   364092     8.0           date           athletic   
...     ...      ...      ...     ...            ...                ...   
146376  fit    66386  2252812    10.0           work          hourglass   
146377  fit   118398   682043    10.0           work             petite   
146378  fit    47002   683251     6.0       everyday  straight & narrow   
146379  fit   961120   126335    10.0        wedding               pear   
146380  fit   123612   127865    10.0        wedding           athletic   

        category         review date  bust size  cup size  weight    height  \
0         romper    

In [66]:
# One-hot encode the categorical attributes of the already-scaled frame and
# save the result. 'fit' and 'rating' stay in the file for later cells.
categorical = ['rented for', 'body type', 'category']
one_hot_rf, one_hot_bt, one_hot_cat = [pd.get_dummies(df[name]) for name in categorical]

# Remove the raw categoricals together with the review date.
df = df.drop(categorical + ['review date'], axis=1)

# Append the indicator columns in the same order as they were built.
for dummies in (one_hot_rf, one_hot_bt, one_hot_cat):
    df = df.join(dummies)

df.to_csv('data/fit_norm.csv', index=False)

In [67]:
# Linear regression on the normalized, one-hot encoded features.
df = pd.read_csv('data/fit_norm.csv')

# Separate the target, then remove both label columns from the features.
ratings = df['rating']
df = df.drop(columns=['fit', 'rating'])

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df.iloc[:train_end]
train_ratings = ratings.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
validation_ratings = ratings.iloc[train_end:validation_end]

reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression:', mse, sep='\n')

MSE of linear regression:
1.9862645229932285


In [70]:
# Same scaling + one-hot pipeline as for the raw file, applied to the frame
# that also carries the 'SVD' column (which is left unscaled).
df = pd.read_csv('data/fit_svd.csv')
attrs = ['bust size', 'cup size', 'weight', 'height', 'size', 'age', 'review length', 'number of excl']
for attr in attrs:
    lowest, highest = df[attr].min(), df[attr].max()
    scale = (df[attr] - lowest) / (highest - lowest)
    # Dropping and re-adding moves the scaled column to the end of the frame.
    df = df.drop(columns=[attr])
    df[attr] = scale
print(df)

# Indicator (one-hot) columns for the three categorical attributes.
categorical = ['rented for', 'body type', 'category']
one_hot_rf, one_hot_bt, one_hot_cat = [pd.get_dummies(df[name]) for name in categorical]

# Remove the raw categoricals together with the review date.
df = df.drop(categorical + ['review date'], axis=1)

# Append the indicator columns in the same order as they were built.
for dummies in (one_hot_rf, one_hot_bt, one_hot_cat):
    df = df.join(dummies)

df.to_csv('data/fit_norm_svd.csv', index=False)

        fit  user_id  item_id  rating     rented for          body type  \
0       fit   420272  2260466    10.0       vacation          hourglass   
1       fit   273551   153475    10.0          other  straight & narrow   
2       fit   909926   126335     8.0  formal affair               pear   
3       fit   151944   616682    10.0        wedding           athletic   
4       fit   734848   364092     8.0           date           athletic   
...     ...      ...      ...     ...            ...                ...   
146376  fit    66386  2252812    10.0           work          hourglass   
146377  fit   118398   682043    10.0           work             petite   
146378  fit    47002   683251     6.0       everyday  straight & narrow   
146379  fit   961120   126335    10.0        wedding               pear   
146380  fit   123612   127865    10.0        wedding           athletic   

        category         review date       SVD  bust size  cup size  weight  \
0         romper    

In [72]:
# Linear regression on the normalized features plus the 'SVD' column.
df = pd.read_csv('data/fit_norm_svd.csv')

# Separate the target, then remove both label columns from the features.
ratings = df['rating']
df = df.drop(columns=['fit', 'rating'])

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df.iloc[:train_end]
train_ratings = ratings.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
validation_ratings = ratings.iloc[train_end:validation_end]

reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression with SVD:', mse, sep='\n')

MSE of linear regression with SVD:
2.10008061487982


In [85]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor on the normalized features. The train/validation
# variables defined here are reused by the regressor cells that follow.
df = pd.read_csv('data/fit_norm.csv')

# Separate the target, then remove both label columns from the features.
ratings = df['rating']
df = df.drop(columns=['fit', 'rating'])

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df.iloc[:train_end]
train_ratings = ratings.iloc[:train_end]
validation = df.iloc[train_end:validation_end]
validation_ratings = ratings.iloc[train_end:validation_end]

# Shallow trees (depth 3), 100 of them, with a fixed seed for repeatability.
reg = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0)
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression with RF:', mse, sep='\n')

MSE of linear regression with RF:
1.9734695708749401


In [81]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron regressor with library defaults, trained and scored
# on the train/validation variables already present in the kernel.
reg = MLPRegressor().fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression with MLP:', mse, sep='\n')

MSE of linear regression with MLP:
1842.821780462671


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting regressor with library defaults, trained and scored on the
# train/validation variables already present in the kernel.
reg = GradientBoostingRegressor().fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(y_true=validation_ratings, y_pred=predictions)
print('MSE of linear regression with GDBT:', mse, sep='\n')

In [None]:
# classifier
# Predict the 'fit' label from the normalized features with a support vector
# classifier and report validation accuracy. The train/validation variables
# defined here are reused by the classifier cells that follow.
#
# accuracy_score is imported here because this cell calls it; without the
# import the cell raises NameError when the notebook is run top to bottom.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd

df = pd.read_csv('data/fit_norm.csv')
print(df)

# Keep the 'fit' label aside and remove both label columns from the features.
df = df.drop('rating', axis=1)
fit = df['fit']
df = df.drop('fit', axis=1)

# Encode the label as an integer: "small" -> -1, "fit" -> 0, and every other
# value -> 1 (presumably "large" -- confirm against the data).
label_codes = {"small": -1, "fit": 0}
y = [label_codes.get(f, 1) for f in fit]

# Positional split: first 8/10 of the rows train, the following 1/10 validate.
train = df[0: df.shape[0] // 10 * 8]
train_fit = y[0: len(y) // 10 * 8]
validation = df[df.shape[0] // 10 * 8: df.shape[0] // 10 * 9]
validation_fit = y[len(y) // 10 * 8: len(y) // 10 * 9]

clf = SVC()
clf.fit(train, train_fit)
predictions = clf.predict(validation)
accuracy = accuracy_score(validation_fit, predictions)
print(accuracy)

        fit  user_id  item_id  rating  bust size  cup size  weight    height  \
0       fit   420272  2260466    10.0        0.3  0.333333   0.348  0.583333   
1       fit   273551   153475    10.0        0.3  0.111111   0.328  0.500000   
2       fit   909926   126335     8.0        0.3  0.222222   0.340  0.458333   
3       fit   151944   616682    10.0        0.3  0.111111   0.380  0.625000   
4       fit   734848   364092     8.0        0.2  0.111111   0.352  0.583333   
...     ...      ...      ...     ...        ...       ...     ...       ...   
146376  fit    66386  2252812    10.0        0.3  0.333333   0.360  0.625000   
146377  fit   118398   682043    10.0        0.2  0.222222   0.200  0.291667   
146378  fit    47002   683251     6.0        0.4  0.000000   0.340  0.583333   
146379  fit   961120   126335    10.0        0.4  0.222222   0.460  0.500000   
146380  fit   123612   127865    10.0        0.4  0.111111   0.420  0.500000   

            size       age  ...  tank  



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest classifier with library defaults, scored by accuracy on the
# validation labels already present in the kernel.
clf = RandomForestClassifier().fit(train, train_fit)
predictions = clf.predict(validation)
accuracy = accuracy_score(y_true=validation_fit, y_pred=predictions)
print(accuracy)



0.7160814318896024


In [3]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Gradient boosting classifier with library defaults, scored by accuracy on
# the validation labels already present in the kernel.
clf = GradientBoostingClassifier().fit(train, train_fit)
predictions = clf.predict(validation)
accuracy = accuracy_score(y_true=validation_fit, y_pred=predictions)
print(accuracy)

0.7406749555950266


In [4]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, scored by accuracy on the
# validation labels already present in the kernel.
clf = LogisticRegression().fit(train, train_fit)
predictions = clf.predict(validation)
accuracy = accuracy_score(y_true=validation_fit, y_pred=predictions)
print(accuracy)



0.7395819100970078
