In [1]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import seaborn as sns

### Question 1 : How can we know if we had bought a great deal (Our team will create a model to predict price of watch) 
Have you ever wanted to buy a watch but you do not know that the price of the thing that you found is cheap or expensive. That is the reason why we want to ask this question is that our team was in this situations multiple times. We find that it is really time consuming when you have to find the things which we want to buy at a good price. Therefore, we will create a model that can help us and other people who want to buy Rolex save time from searching for a great deal. And if we can solve this problem, we can use this model to predict price of various things not only Rolex watches.

Class to preprocess data

In [2]:
class preprocess(BaseEstimator, TransformerMixin):
    def fit(self, X_df, y=None):
        return self
        
    def transform (self, X_df):
        out_df = X_df.copy()
    
        #Remove watch's size in model name
        out_df['model'].replace(regex = True,to_replace = r"[0-9]",value = '',inplace= True)

        # #Get true size of case
        out_df['case diameter'] = out_df['case diameter'].str.extract(r'(^[\d][\d])')

        #Preprocess ref number because some ref num are in wrong format (including characters,etc) 

        tmp = out_df['reference number'].str.extract(r'(\d+[-]\d+)|(\d+)')
        tmp[0].fillna(tmp[1],inplace=True)
        out_df['reference number'] = tmp[0]

        #Replace all Nan with Unknown
        out_df['reference number'].replace(regex = True,to_replace = "",value = 'Unknown',inplace= True)

        out_df['year of production'] = out_df['year of production'].astype(np.number)
        out_df.loc[out_df['year of production'] < 1905, 'year of production' ] = np.nan
        out_df['year of production'] = out_df['year of production'].astype('object')

        model_list = out_df['model'].unique()
        for i in model_list:
            try:
                out_df[out_df['model'] == i]['price'].fillna(value = out_df[out_df['model'] == i]['price'].mean())
            except:
                pass
            try :
                out_df[out_df['model'] == i] = out_df[out_df['model'] == i].fillna(out_df[out_df['model'] == i].mode().iloc[0])
            except:
                pass
        return out_df

Split data

In [3]:
# Load the initial state of rolex_df with the same state as the one in the explore notebook
rolex_df = pd.read_csv('../rolex_scaper_clean.csv')
rolex_df.drop_duplicates(inplace=True)
rolex_df['year of production'] = rolex_df['year of production'].astype('object')
rolex_df.drop(columns = ['ad name'],inplace=True)
rolex_df = rolex_df[rolex_df['model'] != 'Rolex']
rolex_df.reset_index(drop=True,inplace = True)

In [4]:
tmp  = rolex_df[['model', 'reference number', 'price',
       'movement', 'case material', 'case diameter', 'year of production',
       'condition', 'scope of delivery']].copy()

X_train, X_test= train_test_split(tmp, test_size=0.2, random_state=0)
X_train = preprocess().transform(X_train)
Y_train = X_train['price']
X_train.drop("price", axis=1,inplace=True)

X_test = preprocess().transform(X_test)
Y_test = X_test['price']
X_test.drop("price", axis=1,inplace=True)

Create pipleline 
- For categorical columns, we use one hot encoding to convert it to numerical form and then scale it by using Standard Scaler.
- For numeric columns, we use Standard Scaler to normalize.

In [5]:
categorical_cols = X_train.select_dtypes(exclude=np.number).columns
numerical_cols = X_train.select_dtypes(include = np.number).columns

ohe = OneHotEncoder(handle_unknown='ignore',sparse=False)
scaler = StandardScaler()

numerics_pipeline = make_pipeline(scaler)
categorical_pipeline = make_pipeline(ohe,scaler)

col_transformer = make_column_transformer(
    (numerics_pipeline,numerical_cols),
    (categorical_pipeline,categorical_cols),
    remainder='passthrough')
preprocess_pipeline = make_pipeline(preprocess(), col_transformer)


Predict

After using hyper tunning to choose best parameters we go with this 

In [6]:
full_pipeline = make_pipeline(preprocess_pipeline,RandomForestRegressor(n_jobs = -1,max_features = 'sqrt',max_depth = 60,verbose = True))
clf = full_pipeline.fit(X_train,Y_train)
predict_y = clf.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   30.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


Accuracy

RMSE

In [7]:
from sklearn.metrics import mean_squared_error,r2_score
from math import sqrt
sqrt(mean_squared_error(predict_y, Y_test))

17236.921112302633

R^2 score

In [8]:
r2_score(Y_test,predict_y)


0.6770987195526994

Test accuracy

In [9]:
clf.score(X_test,Y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


0.6770987195526994

Train accuracy

In [10]:
clf.score(X_train,Y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished


0.8600063449729457

Test model

Price of this watch is 12825$

In [11]:
Rolex_test = pd.read_csv('../test_model.csv')


In [12]:
df3 = pd.concat([Rolex_test, X_test], ignore_index = True)

In [13]:
clf.predict(df3)[0]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


13077.723849904036

#### Conclusion

- The accuracy on test set is about **68% and on the training set is approximately 87%**. The model still gives an acceptable prediction on the price of the test watch, although the accuracy of our model on test set is not really high. The true price is `12825 USD` and the predicted price is about `12850 USD`, i think we can count on this model for the next time when we want to but a new Rolex. Basing on this model, when you buy a Rolex, you can know that if you are buying that watch for an acceptable price. Just like for the above watch, if you see price on its tag which is about 14000$ then you can know that its not worth and you can buy it for a better price (about blow `12800 USD`) in other stores. If you own the above watch for a price which is about below `12500 USD`, then congratulations you have an awesome deal.
- We also tried to stack various models in order to increase the accuracy, but it only enhanced about 2-3% and took too long to run (about 2-3 hours running on Google Colab). That is the reason why on this report we only use the RandomForestRegressor.