In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns

In [130]:
df = pd.read_csv("data/raw_csw.csv")

In [131]:
df.columns

Index(['Unnamed: 0', 'name', 'price', 'Brand', 'Collection', 'Series',
       'Model No', 'Features', 'Movement', 'Calibre', 'Case Size',
       'Case Thickness', 'Case Shape', 'Case Material', 'Case Back',
       'Glass Material', 'Luminosity', 'Dial Colour', 'Hands', 'Indexes',
       'Strap Material', 'Strap Colour', 'Clasp Type', 'Buckle/Clasp Material',
       'EAN', 'Gender', 'Water Resistance (M)', 'Warranty Period',
       'Country of Origin', 'Reference', 'Display', 'Power Reserve', 'Jewels',
       'Frequency', 'Precious Stone', 'Interchangeable Strap', 'Bezel',
       'Lug Width', 'Limited Edition', 'Diameter', 'Date', 'Chronograph',
       'Base', 'Frequency (bph)', 'Power Reserve (hours)', 'jewels',
       'Dial Type'],
      dtype='object')

In [132]:
df = df.drop(columns=['Unnamed: 0', 'name', 'Date', 'Chronograph', 'jewels', 'Lug Width', 'Diameter', 'Dial Type', 'Reference', 'Model No', 'EAN',
                'Case Back', 'Frequency', 'Base', 'Power Reserve (hours)'])

In [133]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5749 entries, 0 to 5748
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price                  5749 non-null   int64  
 1   Brand                  5749 non-null   object 
 2   Collection             5749 non-null   object 
 3   Series                 2961 non-null   object 
 4   Features               4737 non-null   object 
 5   Movement               5742 non-null   object 
 6   Calibre                4303 non-null   object 
 7   Case Size              5749 non-null   float64
 8   Case Thickness         4230 non-null   float64
 9   Case Shape             5749 non-null   object 
 10  Case Material          5747 non-null   object 
 11  Glass Material         5741 non-null   object 
 12  Luminosity             3448 non-null   object 
 13  Dial Colour            5749 non-null   object 
 14  Hands                  3133 non-null   object 
 15  Inde

In [150]:
def count_median(feature):
    counts = df[feature].value_counts()

    median_prices = df.groupby(feature)['price'].median().sort_values(ascending=False)

    result_df = pd.DataFrame({
        'Count': counts,
        'Median Price': median_prices
    }).sort_values(by='Median Price', ascending=False)

    return result_df

In [134]:
brand_counts = df['Brand'].value_counts()
popular_brands = brand_counts[brand_counts >= 25].index 
df['Brand'] = df['Brand'].apply(lambda x: x if x in popular_brands else "Others")

In [151]:
count_median('Dial Colour')

Unnamed: 0_level_0,Count,Median Price
Dial Colour,Unnamed: 1_level_1,Unnamed: 2_level_1
Diamond pavé,2,12524500.0
Vantablack®,3,3000000.0
Blue MOP,8,1750000.0
Black & Grey,1,1709500.0
Skeleton,328,1122000.0
...,...,...
Brown MOP,2,149600.0
Blue & Silver,6,102000.0
Silver & Black,3,90000.0
Light Blue,1,76000.0


We can drop the rows with less frquent colours (<10)

In [136]:
df['Water Resistance (M)'] = df['Water Resistance (M)'].replace('Splash Resistant', 30).astype(float)

In [137]:
df['Power Reserve'] = df['Power Reserve'].str.extract('(\d+)', expand=False).astype(float)

In [138]:
df['Jewels'] = df['Jewels'].fillna(0)

In [152]:
count_median('Glass Material')

Unnamed: 0_level_0,Count,Median Price
Glass Material,Unnamed: 1_level_1,Unnamed: 2_level_1
Sapphire Crystal,5400,322100.0
Hardlex Crystal,50,33000.0
Others,26,29600.0
Mineral Crystal,273,26400.0


In [140]:
to_keep = counts[counts >= 25].index 
df['Glass Material'] = df['Glass Material'].apply(lambda x: x if x in to_keep else "Others")

In [141]:
df['Warranty Period'] = df['Warranty Period'].str.get(0).astype(float)

In [145]:
df['Bezel'] = df['Bezel'].fillna('No Bezel')

In [153]:
count_median('Precious Stone')

Unnamed: 0_level_0,Count,Median Price
Precious Stone,Unnamed: 1_level_1,Unnamed: 2_level_1
"On Case , Dial & Bracelet",1,18949000.0
"On Case , Crown & Clasp",4,9700000.0
"On Case, Dial",2,7560000.0
"On Bezel, Lugs & Crown",1,6900000.0
"On Case, Dial & Buckle",1,6100000.0
"On Case, Dial & Lugs",2,5750000.0
On Bezel & Bracelet,2,5274000.0
"On Bezel, Dial & Crown",6,4800000.0
On Bezel & Lugs,2,3120000.0
"On Bezel, Dial & Lugs",2,2790000.0


In [159]:
precious_stones_df = pd.DataFrame()
parts = ["Case", "Dial", "Bracelet", "Crown", "Clasp", "Bezel", "Lugs", "Buckle"]

for part in parts:
    precious_stones_df[f"precious_stone_on_{part}"] = df['Precious Stone'].fillna('').apply(lambda x: 1 if part in str(x) else 0)

In [163]:
precious_stones_df.head()

Unnamed: 0,precious_stone_on_Case,precious_stone_on_Dial,precious_stone_on_Bracelet,precious_stone_on_Crown,precious_stone_on_Clasp,precious_stone_on_Bezel,precious_stone_on_Lugs,precious_stone_on_Buckle
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0


In [164]:
df['Features']

0                        Date
1                        Date
2         Date, Small Seconds
3                        Date
4                        Date
                ...          
5744                      NaN
5745                      NaN
5746    Chronograph, Day-Date
5747                     Date
5748                     Date
Name: Features, Length: 5749, dtype: object

In [165]:
string = ', '.join(list(df['Features'].dropna().unique()))
features = list(set(string.split(', ')))

In [166]:
features

['Day-Date',
 'Notifications',
 'Year',
 'Second Time Zone',
 'UTC',
 'Incoming Calls',
 'Smart Watch',
 'Triple Time-zone',
 'Gyroscope',
 'Equation Of Time',
 'Retrograde Date',
 'Amold Touch Screen',
 'Retrograde Day',
 'Countdown Indicator',
 'Week',
 'Multifunction',
 'Retrograde',
 'Depthmeter',
 'Bluetooth',
 'Day-Night Indicator',
 'Link With Smart Phone',
 'Perpetual Calendar',
 'Power Reserve Indicator',
 'Double Moon Phase',
 'Flyback Chronograph',
 'Day-Month',
 'Speedometer',
 'Compass',
 'Retrograde Minutes',
 'Distance',
 'GMT',
 'Heart Beat',
 'Flying Tourbillon',
 '1/100th Seconds',
 'Activity Tracker',
 'Retrograde Jumping Hours',
 'Power Saving',
 'Open Balance Wheel',
 'Tourbillon',
 '1/10th Seconds',
 'Calories',
 'Small 180 Seconds',
 'Jumping Minutes',
 'Small Hacking Seconds',
 'Activity Monitor',
 'Rotating Bezel',
 '1/5th Seconds',
 'Altimeter',
 'GPS',
 '60 Seconds Tourbillon',
 'Annual Calendar',
 'Tachymeter',
 'Oil Temperature',
 'Alarm',
 'EOL Indicator',

In [173]:
df['Features']

0                        Date
1                        Date
2         Date, Small Seconds
3                        Date
4                        Date
                ...          
5744                      NaN
5745                      NaN
5746    Chronograph, Day-Date
5747                     Date
5748                     Date
Name: Features, Length: 5749, dtype: object

In [174]:
features_df = pd.DataFrame()

for feature in features:
    features_df[f"feature_{feature}"] = df['Features'].fillna('').apply(lambda x: 1 if feature in x else 0)

In [178]:
features_df = features_df.loc[:, features_df.apply(lambda col: col.sum() >= 10)]

In [182]:
features_df

Unnamed: 0,feature_Day-Date,feature_Notifications,feature_Year,feature_Incoming Calls,feature_Triple Time-zone,feature_Amold Touch Screen,feature_Retrograde,feature_Day-Night Indicator,feature_Perpetual Calendar,feature_Power Reserve Indicator,...,feature_Moon Phase,feature_Helium Escape Valve,feature_Month,feature_Telemeter,feature_Anti-Magnetic,feature_Chronograph,feature_24 Hour Indicator,feature_Date,feature_Small Seconds,feature_1/4th seconds
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5744,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5745,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5746,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
5747,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [179]:
for i in features_df.columns:
    print("Value count of " + i)
    print(features_df[i].value_counts())
    print("-----")

Value count of feature_Day-Date
feature_Day-Date
0    5422
1     327
Name: count, dtype: int64
-----
Value count of feature_Notifications
feature_Notifications
0    5737
1      12
Name: count, dtype: int64
-----
Value count of feature_Year
feature_Year
0    5735
1      14
Name: count, dtype: int64
-----
Value count of feature_Incoming Calls
feature_Incoming Calls
0    5737
1      12
Name: count, dtype: int64
-----
Value count of feature_Triple Time-zone
feature_Triple Time-zone
0    5738
1      11
Name: count, dtype: int64
-----
Value count of feature_Amold Touch Screen
feature_Amold Touch Screen
0    5737
1      12
Name: count, dtype: int64
-----
Value count of feature_Retrograde
feature_Retrograde
0    5715
1      34
Name: count, dtype: int64
-----
Value count of feature_Day-Night Indicator
feature_Day-Night Indicator
0    5712
1      37
Name: count, dtype: int64
-----
Value count of feature_Perpetual Calendar
feature_Perpetual Calendar
0    5711
1      38
Name: count, dtype: int64
-

In [181]:
df['Luminosity'].unique()

array(['On Hands, Hour Markers', nan, 'On Hands, Hour Markers & Bezel',
       'On Hands', 'On Hands & Dial', 'On Moon Phase',
       'On Hands & Hour Markers', 'Hands, Hour Markers & On Dial',
       'On Dial', 'On Hands, Moon Phase', 'Full Luminova Dial',
       'On Hands, Hour makers, On Dial, & Bezel',
       'On Hands, Hour Markers & Case'], dtype=object)

In [186]:
luminosity_df = pd.DataFrame()
parts = ['Hands', 'Hour', 'Bezel', 'Dial']

for part in parts:
    luminosity_df[f"luminosity_on_{part}"] = df['Luminosity'].fillna('').apply(lambda x: 1 if part in x else 0)

In [187]:
for i in luminosity_df.columns:
    print("Value count of " + i)
    print(luminosity_df[i].value_counts())
    print("-----")

Value count of luminosity_on_Hands
luminosity_on_Hands
1    3421
0    2328
Name: count, dtype: int64
-----
Value count of luminosity_on_Hour
luminosity_on_Hour
1    2919
0    2830
Name: count, dtype: int64
-----
Value count of luminosity_on_Bezel
luminosity_on_Bezel
0    4944
1     805
Name: count, dtype: int64
-----
Value count of luminosity_on_Dial
luminosity_on_Dial
0    5686
1      63
Name: count, dtype: int64
-----


In [189]:
df = df.drop(columns=['Luminosity', 'Features', 'Precious Stone'])

In [191]:
df.columns

Index(['price', 'Brand', 'Collection', 'Series', 'Movement', 'Calibre',
       'Case Size', 'Case Thickness', 'Case Shape', 'Case Material',
       'Glass Material', 'Dial Colour', 'Hands', 'Indexes', 'Strap Material',
       'Strap Colour', 'Clasp Type', 'Buckle/Clasp Material', 'Gender',
       'Water Resistance (M)', 'Warranty Period', 'Country of Origin',
       'Display', 'Power Reserve', 'Jewels', 'Interchangeable Strap', 'Bezel',
       'Limited Edition', 'Frequency (bph)'],
      dtype='object')