### Examinate Train Data

In [1]:
import pandas as pd

In [21]:
train_df = pd.read_csv("inputs/train.csv")
train_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


In [9]:
print(train_df.shape)
print(train_df.dtypes)

(40455, 11)
id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object


In [12]:
train_df.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [14]:
print(train_df.cut.unique())
print(train_df.color.unique())
print(train_df.clarity.unique())

['Very Good' 'Ideal' 'Premium' 'Good' 'Fair']
['F' 'E' 'D' 'H' 'G' 'J' 'I']
['VS2' 'SI1' 'VS1' 'SI2' 'VVS2' 'VVS1' 'IF' 'I1']


In [20]:
train_df_dummies = pd.get_dummies(train_df)
print(train_df_dummies.shape)
train_df_dummies.head()

(40455, 28)


Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_Fair,cut_Good,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0.3,62.8,56.0,4.29,4.31,2.7,605,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0.34,62.6,55.0,4.46,4.49,2.8,565,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,0.4,60.3,62.0,4.7,4.75,2.85,720,0,0,...,0,0,0,0,1,0,0,0,0,0
3,3,0.4,61.8,59.2,4.72,4.74,2.92,793,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0.9,61.0,63.0,6.1,6.13,3.73,4381,0,0,...,0,0,0,0,1,0,0,0,0,0


# 1. Enumerate cleaning

## * Clean Train Data *

### Change categorical to numeric variables

In [46]:
cut_list = train_df.cut.unique().tolist()
color_list = train_df.cut.unique().tolist()

In [47]:
for e,i in enumerate(cut_list):
    train_df.cut = train_df.cut.replace(i,e)

In [48]:
for e,i in enumerate(color_list):
    train_df.color = train_df.color.replace(i,e)

In [51]:
#From worst to best
clarity_list = ["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]
for e,i in enumerate(clarity_list):
    train_df.clarity = train_df.clarity.replace(i,e)

In [53]:
print(train_df.cut.unique())
print(train_df.color.unique())
print(train_df.clarity.unique())
print(train_df.dtypes)

[0 1 2 3 4]
[0 1 2 3 4 5 6]
[3 2 4 1 5 6 7 0]
id           int64
carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object


In [64]:
train_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,0,0,3,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,1,1,2,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,0,2,2,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,2,3,4,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,0,2,2,61.0,63.0,6.1,6.13,3.73,4381


### Save file in *Inputs* folder

In [54]:
train_df.to_csv("inputs/cleantrain.csv", header=True, index=False)

## * Clean Predict Data *

In [59]:
predict_df = pd.read_csv("inputs/predict.csv")
predict_df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16


In [60]:
cut_list = predict_df.cut.unique().tolist()
color_list = predict_df.cut.unique().tolist()
clarity_list = ["I1","SI2","SI1","VS2","VS1","VVS2","VVS1","IF"]

In [61]:
for e,i in enumerate(cut_list):
    predict_df.cut = train_df.cut.replace(i,e)
for e,i in enumerate(color_list):
    predict_df.color = train_df.color.replace(i,e)
for e,i in enumerate(clarity_list):
    predict_df.clarity = train_df.clarity.replace(i,e)

In [62]:
print(predict_df.cut.unique())
print(predict_df.color.unique())
print(predict_df.clarity.unique())
print(predict_df.dtypes)

[0 1 2 3 4]
[0 1 2 3 4 5 6]
[3 2 4 1 5 6 7 0]
id           int64
carat      float64
cut          int64
color        int64
clarity      int64
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object


In [63]:
predict_df.to_csv("inputs/cleanpredict.csv", header=True, index=False)