In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.svm import OneClassSVM

In [2]:
train = pd.read_csv("data/train.csv")

In [3]:
predict = pd.read_csv("data/predict.csv")

In [4]:
#Functions: 
def encode(csv):
    cut = csv.cut
    color = csv.color
    clarity = csv.clarity

    le_cut = preprocessing.LabelEncoder()
    le_color = preprocessing.LabelEncoder()
    le_clarity = preprocessing.LabelEncoder()

    cut_numeric = le_cut.fit_transform(cut)
    color_numeric = le_color.fit_transform(color)
    clarity_numeric = le_clarity.fit_transform(clarity)
    clean = csv.copy()
    clean["cut_numeric"] = cut_numeric
    clean["color_numeric"] = color_numeric
    clean["clarity_numeric"] = clarity_numeric
    clean.drop(columns=["cut", "color", "clarity"], inplace = True)
    return clean

In [5]:
def printMetrics(y_test,y_pred):
    printMetric = lambda label,value:print(f"\t {label}: {round(value,3)}")
    printMetric("Accuracy",accuracy_score(y_test, y_pred))
    printMetric("Precision",precision_score(y_test, y_pred, average = "weighted"))
    printMetric("Recall",recall_score(y_test, y_pred, average = "weighted"))
    printMetric("F1Score",f1_score(y_test, y_pred, average = "weighted"))
    printMetric("Mean Square Error", mean_squared_error(y_test,y_pred))
    printMetric("Root Mean Square Error", mean_squared_error(y_test,y_pred, squared = False))

# Inspecting the Data

In [6]:
train.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Very Good,F,VS2,62.8,56.0,4.29,4.31,2.7,605
1,1,0.34,Ideal,E,SI1,62.6,55.0,4.46,4.49,2.8,565
2,2,0.4,Very Good,D,SI1,60.3,62.0,4.7,4.75,2.85,720
3,3,0.4,Premium,H,VS1,61.8,59.2,4.72,4.74,2.92,793
4,4,0.9,Very Good,D,SI1,61.0,63.0,6.1,6.13,3.73,4381


In [7]:
train.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
price      0
dtype: int64

## Non numeric values
### Cut | Color | Clarity 

In [8]:
train.dtypes

id           int64
carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
price        int64
dtype: object

In [9]:
train.corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price
id,1.0,0.002653,0.002349,-0.006281,0.001529,0.001976,0.00403,0.006826
carat,0.002653,1.0,0.032379,0.181791,0.974898,0.944513,0.947538,0.922567
depth,0.002349,0.032379,1.0,-0.297164,-0.020876,-0.025105,0.099375,-0.005105
table,-0.006281,0.181791,-0.297164,1.0,0.194763,0.181585,0.149242,0.129777
x,0.001529,0.974898,-0.020876,0.194763,1.0,0.967114,0.965114,0.884089
y,0.001976,0.944513,-0.025105,0.181585,0.967114,1.0,0.940678,0.858536
z,0.00403,0.947538,0.099375,0.149242,0.965114,0.940678,1.0,0.855923
price,0.006826,0.922567,-0.005105,0.129777,0.884089,0.858536,0.855923,1.0


In [10]:
print(train.color.value_counts(),"\n\n",
train.cut.value_counts(),"\n\n",
train.clarity.value_counts())

G    8469
E    7375
F    7179
H    6229
D    5012
I    4088
J    2103
Name: color, dtype: int64 

 Ideal        16175
Premium      10272
Very Good     9146
Good          3676
Fair          1186
Name: cut, dtype: int64 

 SI1     9813
VS2     9217
SI2     6844
VS1     6150
VVS2    3784
VVS1    2780
IF      1326
I1       541
Name: clarity, dtype: int64


In [11]:
from sklearn.model_selection import train_test_split

## Encoding...   
### Cut | Color | Clarity 

In [12]:
from sklearn import preprocessing

cut = train.cut
color = train.color
clarity = train.clarity

le_cut = preprocessing.LabelEncoder()
le_color = preprocessing.LabelEncoder()
le_clarity = preprocessing.LabelEncoder()

cut_numeric = le_cut.fit_transform(cut)
color_numeric = le_color.fit_transform(color)
clarity_numeric = le_clarity.fit_transform(clarity)

In [13]:
train_clean = train
train_clean.drop(columns=["cut", "color", "clarity"], inplace = True)

In [14]:
train_clean["cut_numeric"] = cut_numeric
train_clean["color_numeric"] = color_numeric
train_clean["clarity_numeric"] = clarity_numeric

In [15]:
train_clean.dtypes

id                   int64
carat              float64
depth              float64
table              float64
x                  float64
y                  float64
z                  float64
price                int64
cut_numeric          int64
color_numeric        int64
clarity_numeric      int64
dtype: object

In [16]:
train_clean.describe()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_numeric,color_numeric,clarity_numeric
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,20227.0,0.79715,61.746612,57.453561,5.729391,5.733217,3.537644,3928.715264,2.556569,2.598294,3.839748
std,11678.496907,0.472872,1.431006,2.235668,1.121283,1.151076,0.709557,3985.070609,1.027918,1.698944,1.722592
min,0.0,0.2,43.0,43.0,0.0,0.0,0.0,326.0,0.0,0.0,0.0
25%,10113.5,0.4,61.0,56.0,4.71,4.72,2.91,946.5,2.0,1.0,2.0
50%,20227.0,0.7,61.8,57.0,5.7,5.71,3.53,2398.0,2.0,3.0,4.0
75%,30340.5,1.04,62.5,59.0,6.54,6.54,4.04,5328.5,3.0,4.0,5.0
max,40454.0,4.5,79.0,95.0,10.23,58.9,31.8,18818.0,4.0,6.0,7.0


# Attempt 1
## Tree . Extra Tree Classifier

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeClassifier

In [18]:
X = train_clean.loc[:, train_clean.columns != "price"]
y = train_clean["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32364, 10) (8091, 10) (32364,) (8091,)


In [19]:
auto_model = ExtraTreeClassifier()  
auto_model.fit(X_train, y_train)

ExtraTreeClassifier()

In [20]:
y_pred = auto_model.predict(X_test)

In [21]:
results = pd.DataFrame(y_pred, columns=["Prediction"])
results["Ground Truth"] = list(y_test)

**The mean_squared_error function computes mean square error,   
a risk metric corresponding to the expected value of the squared   
(quadratic) error or loss**


In [22]:
printMetrics(y_test,y_pred)

	 Accuracy: 0.061
	 Precision: 0.073
	 Recall: 0.061
	 F1Score: 0.062
	 Mean Square Error: 1670750.988
	 Root Mean Square Error: 1292.575


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Prediction

In [23]:
#from encode import *
m_one = encode(predict)
m_one

Unnamed: 0,id,carat,depth,table,x,y,z,cut_numeric,color_numeric,clarity_numeric
0,0,2.36,60.8,54.0,8.68,8.57,5.24,2,5,3
1,1,2.04,62.0,56.0,8.18,8.23,5.09,2,4,3
2,2,0.51,61.7,54.0,5.18,5.19,3.20,2,5,2
3,3,0.30,61.3,56.0,4.32,4.33,2.65,2,5,2
4,4,0.96,68.8,56.0,6.11,5.98,4.16,0,4,5
...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.53,60.8,59.0,5.23,5.17,3.16,3,1,6
13481,13481,1.01,62.8,60.0,6.32,6.38,3.99,4,2,2
13482,13482,0.92,60.7,58.0,6.29,6.26,3.81,3,2,0
13483,13483,0.39,62.0,54.0,4.70,4.72,2.92,2,2,7


In [24]:
s_o = auto_model.predict(m_one)

In [25]:
s_o = pd.DataFrame(s_o, columns=["price"])

In [26]:
s_o.rename(index={'': 'id'}, inplace=True)


In [27]:
#s_o.to_csv("outputs/1.1")

# Cleaning the data

### Outliers

In [28]:
train.corr()

Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_numeric,color_numeric,clarity_numeric
id,1.0,0.002653,0.002349,-0.006281,0.001529,0.001976,0.00403,0.006826,0.001979,-0.007942,0.002959
carat,0.002653,1.0,0.032379,0.181791,0.974898,0.944513,0.947538,0.922567,0.020662,0.28915,-0.212821
depth,0.002349,0.032379,1.0,-0.297164,-0.020876,-0.025105,0.099375,-0.005105,-0.188766,0.04642,-0.054104
table,-0.006281,0.181791,-0.297164,1.0,0.194763,0.181585,0.149242,0.129777,0.149559,0.026387,-0.084703
x,0.001529,0.974898,-0.020876,0.194763,1.0,0.967114,0.965114,0.884089,0.025532,0.267955,-0.224169
y,0.001976,0.944513,-0.025105,0.181585,0.967114,1.0,0.940678,0.858536,0.030438,0.259464,-0.214578
z,0.00403,0.947538,0.099375,0.149242,0.965114,0.940678,1.0,0.855923,0.005945,0.263969,-0.221445
price,0.006826,0.922567,-0.005105,0.129777,0.884089,0.858536,0.855923,1.0,0.040733,0.171269,-0.072106
cut_numeric,0.001979,0.020662,-0.188766,0.149559,0.025532,0.030438,0.005945,0.040733,1.0,0.002828,0.025718
color_numeric,-0.007942,0.28915,0.04642,0.026387,0.267955,0.259464,0.263969,0.171269,0.002828,1.0,-0.025578


In [29]:
#getting rid of id, depth
features_1 = ['carat','table','x','y','z','price','color_numeric','cut_numeric', 'clarity_numeric']

#getting rid of id, depth, cut_numeric, clarity_numeric
features_2 = ['carat','table','x','y','z','price','color_numeric']

## Looping Models

In [36]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [31]:
models = {
    "decision_tree": DecisionTreeClassifier(random_state=0),
    "kneighbors": KNeighborsClassifier(n_neighbors=3),
}

In [None]:
make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(X_train, y_train)

In [32]:
"""for name,m  in models.items():
    print(f"Training {name}...")
    name=m.fit(X_train, y_train)"""

'for name,m  in models.items():\n    print(f"Training {name}...")\n    name=m.fit(X_train, y_train)'

In [33]:
"""for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Evaluating model {name}")
    printMetric("Accuracy",accuracy_score(y_test, y_pred))
    printMetric("Precision",precision_score(y_test, y_pred))
    printMetric("Recall",recall_score(y_test, y_pred))
    printMetric("F1Score",f1_score(y_test, y_pred))"""

'for name, model in models.items():\n    y_pred = model.predict(X_test)\n    print(f"Evaluating model {name}")\n    printMetric("Accuracy",accuracy_score(y_test, y_pred))\n    printMetric("Precision",precision_score(y_test, y_pred))\n    printMetric("Recall",recall_score(y_test, y_pred))\n    printMetric("F1Score",f1_score(y_test, y_pred))'