In [1]:
import pandas as pd
import numpy as np

#### Importing predict CSV

In [2]:
# Read the prediction set into a DataFrame; the path is relative to the notebook's working directory.
diamonds_predict = pd.read_csv("data/predict.csv")

In [9]:
diamonds_predict.shape

(13485, 10)

In [3]:
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,Ideal,I,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,Ideal,H,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,Ideal,I,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,Ideal,I,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,Fair,H,VS2,68.8,56.0,6.11,5.98,4.16


In [4]:
diamonds_predict.describe()

Unnamed: 0,id,carat,depth,table,x,y,z
count,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0,13485.0
mean,6742.0,0.800309,61.757783,57.468053,5.736456,5.738452,3.542003
std,3892.928525,0.477423,1.437478,2.231002,1.123217,1.114912,0.69401
min,0.0,0.2,44.0,51.0,0.0,0.0,0.0
25%,3371.0,0.4,61.0,56.0,4.72,4.73,2.92
50%,6742.0,0.7,61.8,57.0,5.7,5.71,3.53
75%,10113.0,1.04,62.5,59.0,6.53,6.53,4.03
max,13484.0,5.01,73.6,79.0,10.74,10.54,6.98


### Features
price: price in USD (the target to predict; this column is not present in the prediction set)

carat: weight of the diamond

cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color: diamond colour, from J (worst) to D (best)

clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x: length in mm

y: width in mm

z: depth in mm

depth: total depth percentage = 100 * z / mean(x, y) = 200 * z / (x + y) (range 43--79)

table: width of top of diamond relative to widest point (43--95)

In [5]:
diamonds_predict.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [6]:
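# No missing values: every column has a null count of 0.
# Note: describe() above shows a minimum of 0.0 for x, y and z, which cannot be real dimensions, so those zeros likely stand in for missing measurements.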

In [7]:
diamonds_predict.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object

#### From the training data we know that each categorical feature takes the same set of values here. Now let's convert the object columns to numeric codes with the same function used for the training set

In [10]:
def toNumber(x, mapping=None):
    """Translate a categorical label into its numeric code.

    Parameters
    ----------
    x : object
        Value to translate (e.g. one cell of a DataFrame column).
    mapping : dict, optional
        Label -> code lookup table. When omitted, the module-level
        ``dictionary`` that is current at call time is used, which keeps
        ``Series.apply(toNumber)`` working unchanged. Passing the table
        explicitly removes the dependency on cell execution order.

    Returns
    -------
    object
        ``mapping[x]`` if ``x`` is a key of the mapping, otherwise ``x``
        itself, unchanged.
    """
    if mapping is None:
        # Resolved at call time, so the function follows whichever
        # `dictionary` the notebook has most recently assigned.
        mapping = dictionary
    try:
        # Direct key lookup instead of scanning every item of the mapping.
        return mapping.get(x, x)
    except TypeError:
        # Unhashable values (e.g. lists) can never be dict keys: pass through.
        return x

### For Cut

In [11]:
# Numeric codes for `cut`: labels are numbered in list order, 1 (Ideal) through 5 (Fair).
dictionary = {
    grade: code
    for code, grade in enumerate(
        ["Ideal", "Premium", "Very Good", "Good", "Fair"], start=1
    )
}

In [12]:
# Replace each `cut` label with its numeric code; toNumber reads the current `dictionary`.
diamonds_predict["cut"] = diamonds_predict["cut"].apply(toNumber)
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,1,I,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,1,H,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,1,I,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,1,I,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,5,H,VS2,68.8,56.0,6.11,5.98,4.16


### For Color

In [13]:
# Numeric codes for `color`: letters are numbered in list order, 1 (D) through 7 (J).
dictionary = {
    letter: code
    for code, letter in enumerate(["D", "E", "F", "G", "H", "I", "J"], start=1)
}

In [14]:
# Replace each `color` label with its numeric code; toNumber reads the current `dictionary`.
diamonds_predict["color"] = diamonds_predict["color"].apply(toNumber)
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,1,6,SI2,60.8,54.0,8.68,8.57,5.24
1,1,2.04,1,5,SI2,62.0,56.0,8.18,8.23,5.09
2,2,0.51,1,6,SI1,61.7,54.0,5.18,5.19,3.2
3,3,0.3,1,6,SI1,61.3,56.0,4.32,4.33,2.65
4,4,0.96,5,5,VS2,68.8,56.0,6.11,5.98,4.16


### For Clarity

In [15]:
# Numeric codes for `clarity`: grades are numbered in list order, 1 (IF) through 8 (I1).
dictionary = {
    grade: code
    for code, grade in enumerate(
        ["IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"], start=1
    )
}

In [16]:
# Replace each `clarity` label with its numeric code; toNumber reads the current `dictionary`.
diamonds_predict["clarity"] = diamonds_predict["clarity"].apply(toNumber)
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.36,1,6,7,60.8,54.0,8.68,8.57,5.24
1,1,2.04,1,5,7,62.0,56.0,8.18,8.23,5.09
2,2,0.51,1,6,6,61.7,54.0,5.18,5.19,3.2
3,3,0.3,1,6,6,61.3,56.0,4.32,4.33,2.65
4,4,0.96,5,5,5,68.8,56.0,6.11,5.98,4.16


#### Now that we have only numerical data, let's export it to a CSV file

In [17]:
# Write the encoded frame (x, y and z still included) to disk.
# NOTE(review): `index` is left at its default, so the row index is written as an extra
# unnamed first column next to `id` -- confirm downstream readers expect that, otherwise
# pass index=False. to_csv does not create missing directories, so `clean_data/` must exist.
diamonds_predict.to_csv('clean_data/clean_predict.csv')

Same as before, I will also export a CSV without the x, y and z columns, to try modelling without those features and see how it performs.

In [18]:
# Remove the three dimension columns in place; every later cell sees the reduced frame.
diamonds_predict.drop(columns=['x', 'y', 'z'], inplace=True)

In [19]:
diamonds_predict.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table
0,0,2.36,1,6,7,60.8,54.0
1,1,2.04,1,5,7,62.0,56.0
2,2,0.51,1,6,6,61.7,54.0
3,3,0.3,1,6,6,61.3,56.0
4,4,0.96,5,5,5,68.8,56.0


In [20]:
# Write the reduced frame (x, y and z already dropped in place above) to a second file.
# NOTE(review): `index` is left at its default, so the row index is written as an extra
# unnamed first column next to `id` -- confirm downstream readers expect that, otherwise
# pass index=False. to_csv does not create missing directories, so `clean_data/` must exist.
diamonds_predict.to_csv('clean_data/clean_predict_final.csv')