In [26]:
import numpy as np
import pandas as pd
import matplotlib as mpl
# Load the dataset from 'diamonds.csv' (path is relative to the notebook's working directory).
diamonds = pd.read_csv('diamonds.csv')


In [27]:
def outliers(var, data=None):
    """Return one column's values with IQR outliers replaced by NaN.

    A value is an outlier when it lies strictly above ``Q3 + 1.5 * IQR`` or
    strictly below ``Q1 - 1.5 * IQR``; values equal to a limit are kept.
    Values that are already NaN stay NaN.

    Parameters
    ----------
    var : str
        Name of the column to process.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``diamonds`` frame, which is what existing ``outliers(col)`` calls use.

    Returns
    -------
    list
        The column's values in their original order, with outliers set to NaN.
    """
    if data is None:
        # Backward-compatible default: the frame loaded at the top of the notebook.
        data = diamonds
    column = data[var]

    q1 = column.quantile(.25)
    q3 = column.quantile(.75)
    iqr = q3 - q1
    ulim = float(q3 + (1.5 * iqr))
    llim = float(q1 - (1.5 * iqr))

    # `between` is inclusive at both ends, so only values strictly outside
    # [llim, ulim] are masked; `where` fills the masked positions with NaN.
    inside = column.between(llim, ulim)
    return column.where(inside).tolist()

# Blank out (as NaN) the IQR outliers of every non-object column, one column at a time.
non_object_columns = diamonds.select_dtypes(exclude='object').columns
for col in non_object_columns:
    cleaned_values = outliers(col)
    diamonds[col] = cleaned_values

In [28]:
# Fill every missing value in the non-object columns with the mean of its own column.
numeric_columns = diamonds.select_dtypes(exclude='object').columns
for i in numeric_columns:
    column_mean = diamonds[i].mean()
    diamonds[i] = diamonds[i].fillna(column_mean)

In [29]:
# Derived feature: product of the three dimension columns x, y and z.
diamonds['volume'] = diamonds['x'].mul(diamonds['y']).mul(diamonds['z'])
diamonds.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,1,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43,38.20203
1,2,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31,34.505856
2,3,0.23,Good,E,VS1,61.776373,57.373404,327.0,4.05,4.07,2.31,38.076885
3,4,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63,46.72458
4,5,0.31,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75,51.91725


In [30]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

# Replace each categorical column with integer codes. The single encoder is
# re-fitted per column, so afterwards it only holds the mapping for 'clarity'.
for categorical in ('cut', 'color', 'clarity'):
    diamonds[categorical] = label_encoder.fit_transform(diamonds[categorical])
diamonds.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z,volume
0,1,0.23,2,1,3,61.5,55.0,326.0,3.95,3.98,2.43,38.20203
1,2,0.21,3,1,2,59.8,61.0,326.0,3.89,3.84,2.31,34.505856
2,3,0.23,1,1,4,61.776373,57.373404,327.0,4.05,4.07,2.31,38.076885
3,4,0.29,3,5,5,62.4,58.0,334.0,4.2,4.23,2.63,46.72458
4,5,0.31,1,6,3,63.3,58.0,335.0,4.34,4.35,2.75,51.91725


In [31]:
# Feature matrix: the ten predictor columns; target: the 'price' column.
feature_columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
df1 = pd.DataFrame(diamonds, columns=feature_columns)

y = diamonds['price']
X = df1
print(df1.dtypes)

carat      float64
cut          int32
color        int32
clarity      int32
depth      float64
table      float64
x          float64
y          float64
z          float64
volume     float64
dtype: object


In [32]:
class Node:
    """One node of a regression tree grown by exhaustive threshold search.

    Building a node immediately searches every column and every observed
    value for the best split and, if one exists, recursively builds the two
    child nodes (``lhs`` for ``<=`` the threshold, ``rhs`` for ``>``).

    Parameters
    ----------
    x : object exposing ``.values`` and ``.shape`` (e.g. a pandas DataFrame)
        Full feature table; rows are addressed positionally through
        ``x.values[idxs, column]``.
    y : array-like indexable by an integer array (e.g. a pandas Series)
        Full target vector, indexed as ``y[idxs]``.
    idxs : numpy.ndarray of int
        Rows of ``x`` / ``y`` that belong to this node.
    min_leaf : int, default 5
        Minimum number of rows each side of a split must contain.
    """

    def __init__(self, x, y, idxs, min_leaf=5):
        self.x = x
        self.y = y
        self.idxs = idxs
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        # Prediction of this node: mean target over its rows.
        self.val = np.mean(y[idxs])
        # Stays infinite when no admissible split is found, which marks a leaf.
        self.score = float('inf')
        self.find_varsplit()

    def find_varsplit(self):
        """Pick the best split over all columns and grow both children."""
        for c in range(self.col_count):
            self.find_better_split(c)
        if self.is_leaf:
            return
        x = self.split_col
        # Positions are relative to self.idxs, so map them back through it.
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[rhs], self.min_leaf)

    def find_better_split(self, var_idx):
        """Try every observed value of column ``var_idx`` as a threshold.

        Updates ``var_idx``, ``score`` and ``split`` whenever a candidate
        scores strictly lower than the best split found so far.
        """
        x = self.x.values[self.idxs, var_idx]
        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            # Skip thresholds that would leave either side with too few rows.
            if rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf:
                continue

            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score:
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]

    def find_score(self, lhs, rhs):
        """Return the size-weighted sum of the target's std on both sides.

        ``lhs`` / ``rhs`` are boolean masks over this node's rows. The std is
        whatever ``y``'s own ``.std()`` computes.
        """
        y = self.y[self.idxs]
        lhs_std = y[lhs].std()
        rhs_std = y[rhs].std()
        return lhs_std * lhs.sum() + rhs_std * rhs.sum()

    # NOTE: `split_col` and `is_leaf` must be properties. They are read as
    # attributes (`if self.is_leaf`, `x = self.split_col`); as plain methods
    # the bound-method object is always truthy, so every node would be
    # treated as a leaf and the tree would never split.
    @property
    def split_col(self):
        """Values of the chosen split column for this node's rows."""
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        """True when no admissible split was found for this node."""
        return self.score == float('inf')

    def predict(self, x):
        """Predict one value per row of the 2-D array ``x``."""
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        """Walk down the tree for a single feature row and return the leaf mean."""
        if self.is_leaf:
            return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)


In [33]:
class DecisionTreeRegressor:
  """Thin estimator-style wrapper around a root `Node`."""

  def fit(self, X, y, min_leaf = 5):
    """Grow the tree on all rows of X / y and return this estimator."""
    every_row = np.array(np.arange(len(y)))
    self.dtree = Node(X, y, every_row, min_leaf)
    return self

  def predict(self, X):
    """Predict one value per row of X, which must expose `.values`."""
    feature_matrix = X.values
    return self.dtree.predict(feature_matrix)
    

In [34]:
# Fit the tree on the full feature matrix, then predict those same rows.
regressor = DecisionTreeRegressor()
regressor = regressor.fit(X, y)
preds = regressor.predict(X)


In [35]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(h, y):
  """Return the root-mean-squared error between `h` and `y`."""
  mse = mean_squared_error(h, y)
  return sqrt(mse)

In [36]:
import sklearn.metrics as metrics
# R^2 of the tree's predictions against y. `preds` was computed on the same X
# that the tree was fitted on, so this is a training-set score.
metrics.r2_score(y, preds)

0.9555492557388178

In [37]:

# Root-mean-squared error of the predictions `preds` against the target `y`.
rmse(preds, y)

563.4340242956168

In [40]:
# Rebuild the feature values stored under index label 6506 as a one-row frame
# and predict it. The mapping is named `sample_values` rather than `dict` so the
# builtin `dict` type is not shadowed for the rest of the kernel session.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
sample_values = {name: [diamonds[name][6506]] for name in feature_names}
sample = pd.DataFrame(sample_values, columns=feature_names)
prediction = regressor.predict(sample) #4064
print(prediction)

[4270.]


In [41]:
# One-row frame built from the feature values stored under index label 41715.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict2 = {name: [diamonds[name][41715]] for name in feature_names}
sample = pd.DataFrame(dict2, columns=feature_names)
prediction = regressor.predict(sample) #1243
print(prediction)

[1293.25]


In [44]:
# One-row frame built from the feature values stored under index label 53810.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict3 = {name: [diamonds[name][53810]] for name in feature_names}
sample = pd.DataFrame(dict3, columns=feature_names)
prediction = regressor.predict(sample) #2733
print(prediction)

[2561.6]


In [47]:
# One-row frame built from the feature values stored under index label 10.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict4 = {name: [diamonds[name][10]] for name in feature_names}
sample = pd.DataFrame(dict4, columns=feature_names)
prediction = regressor.predict(sample) #338
print(prediction)

[355.66666667]


In [52]:
# One-row frame built from the feature values stored under index label 2000.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict5 = {name: [diamonds[name][2000]] for name in feature_names}
sample = pd.DataFrame(dict5, columns=feature_names)
prediction = regressor.predict(sample) #3099
print(prediction)

[3262.42857143]


In [53]:
# One-row frame built from the feature values stored under index label 5000.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict6 = {name: [diamonds[name][5000]] for name in feature_names}
sample = pd.DataFrame(dict6, columns=feature_names)
prediction = regressor.predict(sample) #3742
print(prediction)

[3725.66666667]


In [54]:
# One-row frame built from the feature values stored under index label 7000.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict7 = {name: [diamonds[name][7000]] for name in feature_names}
sample = pd.DataFrame(dict7, columns=feature_names)
prediction = regressor.predict(sample) #4155
print(prediction)

[4285.16666667]


In [55]:
# One-row frame built from the feature values stored under index label 10886.
feature_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'volume']
dict8 = {name: [diamonds[name][10886]] for name in feature_names}
sample = pd.DataFrame(dict8, columns=feature_names)
prediction = regressor.predict(sample) #4879
print(prediction)

[4914.4]
