In [1]:
import os

import numpy as np
import pandas as pd

#### Reading CSV

In [2]:
# Load data/train.csv into a DataFrame; the path is relative to the
# notebook's working directory.
diamonds_train = pd.read_csv("data/train.csv")

In [3]:

# (rows, columns) of the loaded frame.
diamonds_train.shape

(40455, 11)

### Features
price: price in USD

carat: weight of the diamond

cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color: diamond colour, from J (worst) to D (best)

clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x: length in mm

y: width in mm

z: depth in mm

depth: total depth percentage = 100 * z / mean(x, y) = 200 * z / (x + y) (43--79)

table: width of top of diamond relative to widest point (43--95)

In [4]:
# Preview the first five rows to see the columns and some sample values.
diamonds_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): the output below shows a minimum of 0.0 for x, y and z, and
# maxima of 58.9 (y) and 31.8 (z) that sit far above the 75% quartiles. These
# look like bad measurements and are not filtered anywhere before the export;
# confirm whether that is intended.
diamonds_train.describe()

Unnamed: 0,id,carat,depth,table,x,y,z,price
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,20227.0,0.79715,61.746612,57.453561,5.729391,5.733217,3.537644,3928.715264
std,11678.496907,0.472872,1.431006,2.235668,1.121283,1.151076,0.709557,3985.070609
min,0.0,0.2,43.0,43.0,0.0,0.0,0.0,326.0
25%,10113.5,0.4,61.0,56.0,4.71,4.72,2.91,946.5
50%,20227.0,0.7,61.8,57.0,5.7,5.71,3.53,2398.0
75%,30340.5,1.04,62.5,59.0,6.54,6.54,4.04,5328.5
max,40454.0,4.5,79.0,95.0,10.23,58.9,31.8,18818.0


Checking for NaN values

In [6]:
# Count the missing values in each column.
diamonds_train.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

In [7]:
# There are no missing values in any column.

In [8]:
# Inspect the column dtypes to find the non-numeric columns that need encoding.
diamonds_train.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [9]:
# Frequency of each colour grade, most common first.
diamonds_train["color"].value_counts()

G    8469
E    7375
F    7179
H    6229
D    5012
I    4088
J    2103
Name: color, dtype: int64

In [10]:
# Frequency of each cut grade, most common first.
diamonds_train["cut"].value_counts()

Ideal        16175
Premium      10272
Very Good     9146
Good          3676
Fair          1186
Name: cut, dtype: int64

In [11]:
# Frequency of each clarity grade, most common first.
diamonds_train["clarity"].value_counts()

SI1     9813
VS2     9217
SI2     6844
VS1     6150
VVS2    3784
VVS1    2780
IF      1326
I1       541
Name: clarity, dtype: int64

Now let's create a function that assigns numbers to the categorical values stored as objects. We can do this because each of these features has only a few well-defined possible values.

In [12]:
def toNumber(x, mapping=None):
    """Translate a categorical label into its numeric code.

    Parameters
    ----------
    x : object
        Value to translate, typically one cell of a DataFrame column.
    mapping : dict, optional
        Label -> code lookup table. When omitted, the global ``dictionary``
        bound at call time is used, so ``Series.apply(toNumber)`` keeps
        working unchanged. Passing the table explicitly (for example
        ``Series.apply(toNumber, args=(table,))``) avoids depending on which
        cell last rebound that global.

    Returns
    -------
    object
        ``mapping[x]`` when ``x`` is a key of ``mapping``; otherwise ``x``
        itself, unchanged.
    """
    if mapping is None:
        # Backward-compatible fallback to the notebook-level lookup table.
        mapping = dictionary
    try:
        # Direct hash lookup replaces the former key-by-key scan.
        return mapping.get(x, x)
    except TypeError:
        # Unhashable values (e.g. lists) can never be dict keys: pass through.
        return x

### For Cut

In [13]:
# Ordinal codes for cut: the 1-based position in the tuple is the code,
# so "Ideal" -> 1 through "Fair" -> 5.
# `dictionary` is the global lookup table that toNumber reads; it is rebound
# for every feature, so run this cell right before the cut encoding cell.
dictionary = {
    grade: code
    for code, grade in enumerate(
        ("Ideal", "Premium", "Very Good", "Good", "Fair"), start=1
    )
}

In [14]:
# Replace every cut label with its numeric code; toNumber leaves any value
# that is not a key of the current `dictionary` untouched.
diamonds_train["cut"] = diamonds_train["cut"].apply(toNumber)
diamonds_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,3,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,1,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,3,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,2,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,3,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


### For Color

In [15]:
# Ordinal codes for colour: the 1-based position in the tuple is the code,
# so "D" -> 1 through "J" -> 7.
# `dictionary` is the global lookup table that toNumber reads; it is rebound
# for every feature, so run this cell right before the colour encoding cell.
dictionary = {
    grade: code
    for code, grade in enumerate(("D", "E", "F", "G", "H", "I", "J"), start=1)
}

In [16]:
# Replace every colour label with its numeric code; toNumber leaves any value
# that is not a key of the current `dictionary` untouched.
diamonds_train["color"] = diamonds_train["color"].apply(toNumber)
diamonds_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,3,3,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,1,2,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,3,1,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,2,5,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,3,1,SI1,61.0,63.0,6.1,6.13,3.73,4381


### For Clarity

In [17]:
# Ordinal codes for clarity: the 1-based position in the tuple is the code,
# so "IF" -> 1 through "I1" -> 8.
# `dictionary` is the global lookup table that toNumber reads; it is rebound
# for every feature, so run this cell right before the clarity encoding cell.
dictionary = {
    grade: code
    for code, grade in enumerate(
        ("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1"), start=1
    )
}

In [18]:
# Replace every clarity label with its numeric code; toNumber leaves any value
# that is not a key of the current `dictionary` untouched.
diamonds_train["clarity"] = diamonds_train["clarity"].apply(toNumber)
diamonds_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,3,3,5,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,1,2,6,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,3,1,6,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,2,5,4,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,3,1,6,61.0,63.0,6.1,6.13,3.73,4381


Now every column holds numerical values.

In [19]:
## Let's Export the Clean dataset

In [21]:
# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs('clean_data', exist_ok=True)
# Export the encoded frame, still including x, y and z. The DataFrame index is
# written as an unnamed first column (to_csv's default), so read the file back
# with index_col=0 to avoid an extra "Unnamed: 0" column.
diamonds_train.to_csv('clean_data/clean_training.csv')

#### Also export a second CSV without the x, y, z columns: I want to try modelling without the raw measurements, since other features can describe the same information

In [24]:
# Remove the three dimension columns. Like the in-place form, this raises
# KeyError if the columns are already gone, so run it once per kernel session.
diamonds_train = diamonds_train.drop(columns=['x', 'y', 'z'])

In [25]:
# Preview the frame after removing x, y and z.
diamonds_train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price
0,0,0.3,3,3,5,62.8,56.0,605
1,1,0.34,1,2,6,62.6,55.0,565
2,2,0.4,3,1,6,60.3,62.0,720
3,3,0.4,2,5,4,61.8,59.2,793
4,4,0.9,3,1,6,61.0,63.0,4381


In [26]:
# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs('clean_data', exist_ok=True)
# Export the encoded frame without x, y and z. The DataFrame index is written
# as an unnamed first column (to_csv's default), so read the file back with
# index_col=0 to avoid an extra "Unnamed: 0" column.
diamonds_train.to_csv('clean_data/clean_train_final.csv')