In [1]:
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd
import numpy as np
import csv

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn import set_config

from sklearn.feature_selection import SelectKBest, f_regression

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import seaborn as sns

---
# Collecting data
Our data is about Rolex watches which are sold on `chrono24.com`.

Yes, He allows us to freely use the dataset.

He collected this data by scraping with Selenium

---
# Exploring data

In [2]:
rolex_df = pd.read_csv('rolex_scaper_clean.csv')

#### Number of rows and columns

In [3]:
num_rows,num_cols = rolex_df.shape
rolex_df.shape

(87117, 12)

#### Meaning of each row
We can see that each row has information like model, price, ... about the watch.

In [4]:
rolex_df.head()

Unnamed: 0,model,reference number,price,aditional shipping price,ad name,movement,case material,case diameter,year of production,condition,scope of delivery,location
0,Rolex Lady-Datejust,179161,9080.0,140.0,Steel Rose Gold Black Roman Dial Ladies Watch ...,Automatic,Steel,26 mm,2014.0,Very good,"Original box, original papers","United States of America, Georgia"
1,Rolex Chronograph,2917,16202.0,216.0,REF. 2917,Manual winding,Steel,33 mm,1934.0,Very good,"Original box, no original papers","Italy, Roma"
2,Rolex Daytona,116519G,41567.0,0.0,保証書付き ROLEX ロレックス デイトナ コスモグラフ K18WG 8Pダイヤ ランダム...,Automatic,White gold,39 mm,,Fair,"Original papers, no original box","Japan, Nagoya City"
3,Rolex Submariner Date,116613,19795.0,235.0,New Submariner 116613 Yellow Steel Gold Cerami...,Automatic,Steel,40 mm,2020.0,Unworn,"Original box, original papers","United States of America, Florida, Miami"
4,Rolex Submariner Date,16610,10674.0,145.0,1990s ROLEX SUBMARINER 16610 Tritium vintage G...,Automatic,,40 mm,1990.0,Good,"Original box, original papers","Japan, Ehime yawatahama"


#### Are there duplicated rows ?

In [5]:
rolex_df.duplicated(keep='first').sum()

21898

Drop duplicated rows

In [6]:
rolex_df.drop_duplicates(inplace=True)

#### Meaning of each columns ?

model: the watch model name

reference number: the number to identify the watch model as a whole

price: the price on the listing (price made by the reseller)

additional shipping price: the price for shipping (0 = free shipping)

ad name: the name of the listing on the site

movement: the engine of a watch to make the watch and its functions operate

case material: the material of the external watch case

case diameter: the dimensions of the watch

year of production: the year when Rolex fabricate that particular watch

condition: the general condition of the watch

scope of delivery: the additional things like warranty, or box that could come with the watch

location: the location of the reseller


#### Type of each colum 
Luckily, all columns are in their right data types except for year of production. It should be categorical (object) instead of float64.

In [7]:
rolex_df.dtypes 

model                        object
reference number             object
price                       float64
aditional shipping price    float64
ad name                      object
movement                     object
case material                object
case diameter                object
year of production          float64
condition                    object
scope of delivery            object
location                     object
dtype: object

Change data types of year of production column

In [8]:
rolex_df['year of production'] = rolex_df['year of production'].astype('object')
rolex_df.dtypes

model                        object
reference number             object
price                       float64
aditional shipping price    float64
ad name                      object
movement                     object
case material                object
case diameter                object
year of production           object
condition                    object
scope of delivery            object
location                     object
dtype: object

#### With each numerical column, how are values distributed?

In [9]:
numerical_cols = rolex_df.select_dtypes(include=np.number).copy()
numerical_cols.head()

Unnamed: 0,price,aditional shipping price
0,9080.0,140.0
1,16202.0,216.0
2,41567.0,0.0
3,19795.0,235.0
4,10674.0,145.0


Proportion of missing values

In [10]:
num_missing_percentages = (numerical_cols.isna()).sum() / rolex_df.shape[0] * 100
num_missing_percentages

price                       6.271179
aditional shipping price    0.000000
dtype: float64

Describe numerical/categorical columns

In [11]:
def describe__(df):
    res_df = pd.DataFrame(columns=[df.columns])

    try:
        res_df.loc['missing_ratio'] = np.asarray(missing_percentages[df.columns])
    except:
        res_df.loc['missing_ratio'] = np.asarray((df.isna()).sum() / df.shape[0] * 100)
        
    res_df.loc['num_unique'] = np.asarray(df.nunique())
    res_df.loc['unique'] = np.asarray([df[col].unique() for col in df], dtype=object)

    return res_df

Describe

In [12]:
numerical_cols.describe()

Unnamed: 0,price,aditional shipping price
count,61129.0,65219.0
mean,22764.85,134.830234
std,30596.92,2580.04271
min,315.0,0.0
25%,9157.0,31.0
50%,15000.0,99.0
75%,24833.0,160.0
max,1152102.0,656389.0


In [13]:
describe__(numerical_cols)

Unnamed: 0,price,aditional shipping price
missing_ratio,6.271179,0.0
num_unique,15492.0,299.0
unique,"[9080.0, 16202.0, 41567.0, 19795.0, 10674.0, 2...","[140.0, 216.0, 0.0, 235.0, 145.0, 75.0, 189.0,..."


#### With each categorical column, how are values distributed?

In [14]:
categorical_cols = rolex_df.select_dtypes(exclude=np.number).copy()
categorical_cols.head()

Unnamed: 0,model,reference number,ad name,movement,case material,case diameter,year of production,condition,scope of delivery,location
0,Rolex Lady-Datejust,179161,Steel Rose Gold Black Roman Dial Ladies Watch ...,Automatic,Steel,26 mm,2014.0,Very good,"Original box, original papers","United States of America, Georgia"
1,Rolex Chronograph,2917,REF. 2917,Manual winding,Steel,33 mm,1934.0,Very good,"Original box, no original papers","Italy, Roma"
2,Rolex Daytona,116519G,保証書付き ROLEX ロレックス デイトナ コスモグラフ K18WG 8Pダイヤ ランダム...,Automatic,White gold,39 mm,,Fair,"Original papers, no original box","Japan, Nagoya City"
3,Rolex Submariner Date,116613,New Submariner 116613 Yellow Steel Gold Cerami...,Automatic,Steel,40 mm,2020.0,Unworn,"Original box, original papers","United States of America, Florida, Miami"
4,Rolex Submariner Date,16610,1990s ROLEX SUBMARINER 16610 Tritium vintage G...,Automatic,,40 mm,1990.0,Good,"Original box, original papers","Japan, Ehime yawatahama"


In [15]:
categorical_cols.describe()

Unnamed: 0,model,reference number,ad name,movement,case material,case diameter,year of production,condition,scope of delivery,location
count,65219,63127,65161,62855,61721,62312,48335.0,64301,65219,65219
unique,58,4863,44292,3,13,643,111.0,7,4,4097
top,Rolex Datejust 36,126334,126334,Automatic,Steel,40 mm,2022.0,Very good,"Original box, original papers","United States of America, New York, New York"
freq,8026,1551,333,60696,35519,21627,12016.0,33768,41707,5303


In [16]:
describe__(categorical_cols)

Unnamed: 0,model,reference number,ad name,movement,case material,case diameter,year of production,condition,scope of delivery,location
missing_ratio,0.0,3.207654,0.088931,3.624711,5.363468,4.45729,25.888161,1.407565,0.0,0.0
num_unique,58.0,4863.0,44292.0,3.0,13.0,643.0,111.0,7.0,4.0,4097.0
unique,"[Rolex Lady-Datejust, Rolex Chronograph, Rolex...","[179161, 2917, 116519G, 116613, 16610, 126331,...",[Steel Rose Gold Black Roman Dial Ladies Watch...,"[Automatic, Manual winding, nan, Quartz]","[Steel, White gold, nan, Gold/Steel, Yellow go...","[26 mm, 33 mm, 39 mm, 40 mm, 41 mm, 36 mm, 31 ...","[2014.0, 1934.0, nan, 2020.0, 1990.0, 2022.0, ...","[Very good, Fair, Unworn, Good, New, nan, Poor...","[Original box, original papers, Original box, ...","[United States of America, Georgia, Italy, Rom..."


Percentage of missing values

In [17]:
cate_missing_percentages = (categorical_cols.isna()).sum() / rolex_df.shape[0] * 100
cate_missing_percentages

model                  0.000000
reference number       3.207654
ad name                0.088931
movement               3.624711
case material          5.363468
case diameter          4.457290
year of production    25.888161
condition              1.407565
scope of delivery      0.000000
location               0.000000
dtype: float64

Number of different values

In [18]:
categorical_cols.nunique()

model                    58
reference number       4863
ad name               44292
movement                  3
case material            13
case diameter           643
year of production      111
condition                 7
scope of delivery         4
location               4097
dtype: int64

---

# Asking meaningful questions 

In [19]:
rolex_df.drop(columns = ['ad name'],inplace=True)
rolex_df = rolex_df[rolex_df['model'] != 'Rolex']

In [20]:
rolex_df.reset_index(drop=True,inplace = True)
rolex_df.head()

Unnamed: 0,model,reference number,price,aditional shipping price,movement,case material,case diameter,year of production,condition,scope of delivery,location
0,Rolex Lady-Datejust,179161,9080.0,140.0,Automatic,Steel,26 mm,2014.0,Very good,"Original box, original papers","United States of America, Georgia"
1,Rolex Chronograph,2917,16202.0,216.0,Manual winding,Steel,33 mm,1934.0,Very good,"Original box, no original papers","Italy, Roma"
2,Rolex Daytona,116519G,41567.0,0.0,Automatic,White gold,39 mm,,Fair,"Original papers, no original box","Japan, Nagoya City"
3,Rolex Submariner Date,116613,19795.0,235.0,Automatic,Steel,40 mm,2020.0,Unworn,"Original box, original papers","United States of America, Florida, Miami"
4,Rolex Submariner Date,16610,10674.0,145.0,Automatic,,40 mm,1990.0,Good,"Original box, original papers","Japan, Ehime yawatahama"


Check if dataset still consists missing values

In [21]:
missing_percentages = rolex_df.isna().sum()/rolex_df.shape[0]*100
missing_percentages

model                        0.000000
reference number             2.672060
price                        6.321740
aditional shipping price     0.000000
movement                     3.574719
case material                5.241048
case diameter                4.289975
year of production          25.630534
condition                    1.361798
scope of delivery            0.000000
location                     0.000000
dtype: float64

In [22]:
# remove ad name from categorical columns
new_cat_cols = pd.DataFrame(categorical_cols.drop(columns = ['ad name'],inplace=False))
new_cat_cols.head(3)

Unnamed: 0,model,reference number,movement,case material,case diameter,year of production,condition,scope of delivery,location
0,Rolex Lady-Datejust,179161,Automatic,Steel,26 mm,2014.0,Very good,"Original box, original papers","United States of America, Georgia"
1,Rolex Chronograph,2917,Manual winding,Steel,33 mm,1934.0,Very good,"Original box, no original papers","Italy, Roma"
2,Rolex Daytona,116519G,Automatic,White gold,39 mm,,Fair,"Original papers, no original box","Japan, Nagoya City"


Take a look at categorical/numerical columns missing ratio and unique values 

In [23]:
describe__(numerical_cols)

Unnamed: 0,price,aditional shipping price
missing_ratio,6.32174,0.0
num_unique,15492.0,299.0
unique,"[9080.0, 16202.0, 41567.0, 19795.0, 10674.0, 2...","[140.0, 216.0, 0.0, 235.0, 145.0, 75.0, 189.0,..."


In [24]:
describe__(new_cat_cols)

Unnamed: 0,model,reference number,movement,case material,case diameter,year of production,condition,scope of delivery,location
missing_ratio,0.0,2.67206,3.574719,5.241048,4.289975,25.630534,1.361798,0.0,0.0
num_unique,58.0,4863.0,3.0,13.0,643.0,111.0,7.0,4.0,4097.0
unique,"[Rolex Lady-Datejust, Rolex Chronograph, Rolex...","[179161, 2917, 116519G, 116613, 16610, 126331,...","[Automatic, Manual winding, nan, Quartz]","[Steel, White gold, nan, Gold/Steel, Yellow go...","[26 mm, 33 mm, 39 mm, 40 mm, 41 mm, 36 mm, 31 ...","[2014.0, 1934.0, nan, 2020.0, 1990.0, 2022.0, ...","[Very good, Fair, Unworn, Good, New, nan, Poor...","[Original box, original papers, Original box, ...","[United States of America, Georgia, Italy, Rom..."
