In [113]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [114]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
data = pd.read_csv('../ods/data/diam3.csv')

In [115]:
# Summarise column dtypes and non-null counts to spot missing values early.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
carat      53940 non-null float64
cut        53940 non-null object
color      53940 non-null object
clarity    53940 non-null object
depth      53940 non-null float64
table      53940 non-null float64
x          53940 non-null float64
y          53940 non-null float64
z          53940 non-null float64
target     53940 non-null object
dtypes: float64(6), object(4)
memory usage: 4.1+ MB


In [116]:
# Peek at the first rows to see what the raw columns look like.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo


In [117]:
# Peek at the last rows as well, to check the end of the file parsed cleanly.
data.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
53935,0.72,Ideal,D,SI1,60.8,57.0,5.75,5.76,3.5,dewevo
53936,0.72,Good,D,SI1,63.1,55.0,5.69,5.75,3.61,dewevo
53937,0.7,Very Good,D,SI1,62.8,60.0,5.66,5.68,3.56,dewevo
53938,0.86,Premium,H,SI2,61.0,58.0,6.15,6.12,3.74,dewevo
53939,0.75,Ideal,D,SI2,62.2,55.0,5.83,5.87,3.64,dewevo


In [118]:
# Class balance of the label column.
data['target'].value_counts()

dewevo    34283
dorogo    19657
Name: target, dtype: int64

In [119]:
# NOTE(review): this repeats the earlier `data.head()` cell with no change to
# `data` in between; candidate for removal.
data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo


In [120]:
# Remove the `color` and `clarity` columns; they are not used as features below.
# Rebinding `data` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: running it again after the columns are already
# gone is a no-op rather than a KeyError.
data = data.drop(columns=['color', 'clarity'], errors='ignore')

In [121]:
# Inspect the distinct `cut` labels and their frequencies before encoding them.
data['cut'].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [122]:
# Encode the `cut` labels as integers 1 ('Ideal') through 5 ('Fair').
cut_codes = {'Ideal': 1, 'Premium': 2, 'Very Good': 3, 'Good': 4, 'Fair': 5}

# Only map while the column still holds the string labels. Re-running `.map`
# on the already-encoded integers would find no matching keys and silently
# turn the whole column into NaN.
if not pd.api.types.is_numeric_dtype(data['cut']):
    data['cut'] = data['cut'].map(cut_codes)

# `.map` yields NaN for any label missing from the dict; fail loudly here
# instead of passing NaN on to the model.
assert data['cut'].notna().all(), "unmapped label in 'cut'"

In [123]:
# Binarise the label: 'dorogo' -> 1, 'dewevo' -> 0.
# (Presumably transliterated Russian for "expensive" / "cheap" -- TODO confirm.)
# The dtype guard makes the cell safe to re-run: mapping the already-encoded
# 0/1 values again would match no keys and produce an all-NaN column.
if not pd.api.types.is_numeric_dtype(data['target']):
    data['target'] = data['target'].map({'dorogo': 1, 'dewevo': 0})

# Any label outside the mapping becomes NaN; surface that here rather than as
# a less obvious failure in the later integer cast.
assert data['target'].notna().all(), "unmapped label in 'target'"

In [124]:
# Label vector: the encoded target column cast to an integer dtype.
y = data['target'].astype(int)

In [125]:
# Feature matrix: every remaining column except the label.
X = data.drop(columns='target')

In [126]:
# Sanity check: features and labels must have the same number of rows.
X.shape, y.shape

((53940, 7), (53940,))

In [127]:
from sklearn.model_selection import train_test_split, cross_val_score

In [128]:
# Hold out 30% of the rows for the final evaluation; the fixed seed makes the
# split reproducible.
# NOTE(review): the two classes are roughly 64% / 36%; consider `stratify=y`
# so both splits keep that ratio (this would change the numbers below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=17
)

In [129]:
# Confirm the sizes of the training and held-out feature sets.
X_train.shape, X_test.shape

((37758, 7), (16182, 7))

In [130]:
# Baseline decision tree: no hyperparameters set apart from a fixed seed.
first_tree = DecisionTreeClassifier(random_state=17)

In [131]:
# Mean score of the untuned tree over 5 cross-validation folds of the training set.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()

0.9310079554383208

In [132]:
# Search grid: tree depths 1 through 10 inclusive.
tree_params = dict(max_depth=np.arange(1, 11))

In [133]:
# 5-fold grid search over `tree_params`, using all available CPU cores.
tree_grid = GridSearchCV(
    estimator=first_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
)

In [134]:
# Run the grid search on the training split. The trailing semicolon suppresses
# the estimator repr in the cell output.
tree_grid.fit(X_train, y_train);

In [135]:
# Best cross-validated score found by the search and the depth that achieved it.
tree_grid.best_score_, tree_grid.best_params_

(0.9535197838868584, {'max_depth': 3})

In [136]:
# Predict labels for the held-out test set with the fitted grid search.
tree_test_pred = tree_grid.predict(X_test)

In [137]:
# Accuracy on the held-out test set, for comparison with the cross-validated score.
accuracy_score(y_test, tree_test_pred)

0.9535286120380669