In [32]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from surprise import Reader, Dataset, SVD

# Load the ratings table and split it by row position (file order):
# first 8/10 -> train, next 1/10 -> validation, remainder -> test.
df = pd.read_csv('data/fit.csv')
fold = df.shape[0] // 10
train = df[0: fold * 8]
validation = df[fold * 8: fold * 9]
test = df[fold * 9:]

# SVD
# Fit a matrix-factorisation model on the training rows only.
col_names = ['user_id', 'item_id', 'rating']
reader = Reader(rating_scale=(2, 10))
data = Dataset.load_from_df(train[col_names], reader)
data = data.build_full_trainset()
# Seed the model: SVD initialises its factors randomly, so without a fixed
# random_state every run would write a different SVD column and produce
# different downstream MSE values.
algo = SVD(random_state=0)
algo.fit(data)

# Predict a rating for every row (train, validation and test) and store it as
# an extra feature column. Note that the estimates for the training rows come
# from a model that has already seen those rows' ratings.
svds = [
    algo.predict(user_id, item_id).est
    for user_id, item_id in zip(df['user_id'], df['item_id'])
]
df = df.assign(SVD=svds)
df.to_csv('data/fit_svd.csv', index=False)

In [42]:
# Build the feature matrix from the SVD-augmented table: one-hot encode the
# three categorical columns, drop the columns that are not used as features,
# and keep the rating column aside as the regression target.
df = pd.read_csv('data/fit_svd.csv')
one_hot_rf = pd.get_dummies(df['rented for'])
one_hot_bt = pd.get_dummies(df['body type'])
one_hot_cat = pd.get_dummies(df['category'])

ratings = df['rating']
df = (
    df.drop(columns=['rented for', 'body type', 'category',
                     'review date', 'fit', 'rating'])
    .join(one_hot_rf)
    .join(one_hot_bt)
    .join(one_hot_cat)
)

# Positional split: first 8/10 of the rows for training, next 1/10 for validation.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train, train_ratings = df[:train_end], ratings[:train_end]
validation = df[train_end:validation_end]
validation_ratings = ratings[train_end:validation_end]

In [43]:
# Score the raw SVD estimates against the true ratings on the validation rows.
val_start = df.shape[0] // 10 * 8
val_stop = df.shape[0] // 10 * 9
svd_validation = df['SVD'][val_start:val_stop]
mse = mean_squared_error(validation_ratings, svd_validation)
print('MSE of SVD:', mse, sep='\n')

MSE of SVD:
1.977126697188415


In [44]:
# Fit ordinary least squares on the training features (which include the SVD
# column) and report the validation error.
reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(validation_ratings, predictions)
print('MSE of combining SVD and linear regression:', mse, sep='\n')

MSE of combining SVD and linear regression:
2.100080614895229


In [45]:
# Baseline: linear regression on the original table (no SVD column).
df = pd.read_csv('data/fit.csv')

# One-hot encode each categorical column separately.
categorical = ['rented for', 'body type', 'category']
one_hot_rf, one_hot_bt, one_hot_cat = (
    pd.get_dummies(df[name]) for name in categorical
)

# Keep the target aside, then remove everything that is not a numeric feature.
ratings = df['rating']
df = df.drop(columns=categorical + ['review date', 'fit', 'rating'])
for encoded in (one_hot_rf, one_hot_bt, one_hot_cat):
    df = df.join(encoded)

# Positional split: first 8/10 of the rows for training, next 1/10 for validation.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df[:train_end]
train_ratings = ratings[:train_end]
validation = df[train_end:validation_end]
validation_ratings = ratings[train_end:validation_end]

reg = LinearRegression()
reg.fit(train, train_ratings)
predictions = reg.predict(validation)
mse = mean_squared_error(validation_ratings, predictions)
print('MSE of linear regression:', mse, sep='\n')

MSE of linear regression:
1.9862645229992368


In [50]:
# Min-max scale the numeric attributes (including the rating target) to [0, 1].
df = pd.read_csv('data/fit.csv')
attrs = ['bust size', 'cup size', 'weight', 'rating', 'height', 'size', 'age', 'review length', 'number of excl']
numeric = df[attrs]
scaled = (numeric - numeric.min()) / (numeric.max() - numeric.min())
# The scaled columns are re-attached after the untouched ones, in `attrs` order.
df = df.drop(columns=attrs).join(scaled)
print(df)

        fit  user_id  item_id     rented for          body type  category  \
0       fit   420272  2260466       vacation          hourglass    romper   
1       fit   273551   153475          other  straight & narrow      gown   
2       fit   909926   126335  formal affair               pear     dress   
3       fit   151944   616682        wedding           athletic      gown   
4       fit   734848   364092           date           athletic     dress   
...     ...      ...      ...            ...                ...       ...   
146376  fit    66386  2252812           work          hourglass  jumpsuit   
146377  fit   118398   682043           work             petite     dress   
146378  fit    47002   683251       everyday  straight & narrow     dress   
146379  fit   961120   126335        wedding               pear     dress   
146380  fit   123612   127865        wedding           athletic      gown   

               review date  bust size  cup size  weight  rating    height  

In [51]:
# One-hot encode the categorical columns of the current `df`, drop the
# original categorical columns and the review date, and export the result.
one_hot_rf = pd.get_dummies(df['rented for'])
one_hot_bt = pd.get_dummies(df['body type'])
one_hot_cat = pd.get_dummies(df['category'])

df = df.drop(columns=['rented for', 'body type', 'category', 'review date'])

df = df.join(one_hot_rf)
df = df.join(one_hot_bt)
df = df.join(one_hot_cat)

# index=False: by default to_csv also writes the row index as an unnamed first
# column, which would come back from read_csv as an extra feature column.
df.to_csv('data/fit_norm.csv', index=False)

In [52]:
# Persist the current `df` to CSV; index=False keeps the row index out of the
# file so that reading it back yields only the data columns.
df.to_csv('data/fit_norm.csv', index=False)

In [53]:
# Linear regression on the normalised, one-hot-encoded table.
# read_csv requires a path; 'data/fit_norm.csv' is the normalised file written
# by the export cells of this notebook and is the only normalised file that is
# already one-hot encoded (it has no SVD column).
df = pd.read_csv('data/fit_norm.csv')
df = df.drop('fit', axis=1)
ratings = df['rating']
df = df.drop('rating', axis=1)

# Positional split: first 8/10 of the rows for training, next 1/10 for validation.
train_end = df.shape[0] // 10 * 8
validation_end = df.shape[0] // 10 * 9
train = df[0: train_end]
train_ratings = ratings[0: train_end]
validation = df[train_end: validation_end]
validation_ratings = ratings[train_end: validation_end]

reg = LinearRegression().fit(train, train_ratings)
predictions = reg.predict(validation)
# NOTE: 'rating' is one of the min-max scaled columns, so this MSE is in
# normalised units and is not directly comparable with MSEs computed on the
# raw rating scale.
mse = mean_squared_error(validation_ratings, predictions)
# The features here contain no SVD column, so the label must not claim one.
print('MSE of linear regression:')
print(mse)

MSE of linear regression:
0.031035383171769192


In [54]:
# Min-max scale the numeric attributes of the SVD-augmented table and save it.
# The SVD column itself is not in `attrs` and is left unscaled.
df = pd.read_csv('data/fit_svd.csv')
attrs = ['bust size', 'cup size', 'weight', 'rating', 'height', 'size', 'age', 'review length', 'number of excl']
lowest = df[attrs].min()
span = df[attrs].max() - lowest
normalized = (df[attrs] - lowest) / span
# Scaled columns go after the untouched ones, in `attrs` order.
df = df.drop(columns=attrs).join(normalized)
print(df)
df.to_csv('data/fit_norm_svd.csv', index=False)

        fit  user_id  item_id     rented for          body type  category  \
0       fit   420272  2260466       vacation          hourglass    romper   
1       fit   273551   153475          other  straight & narrow      gown   
2       fit   909926   126335  formal affair               pear     dress   
3       fit   151944   616682        wedding           athletic      gown   
4       fit   734848   364092           date           athletic     dress   
...     ...      ...      ...            ...                ...       ...   
146376  fit    66386  2252812           work          hourglass  jumpsuit   
146377  fit   118398   682043           work             petite     dress   
146378  fit    47002   683251       everyday  straight & narrow     dress   
146379  fit   961120   126335        wedding               pear     dress   
146380  fit   123612   127865        wedding           athletic      gown   

               review date       SVD  bust size  cup size  weight  rating  