In [25]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [26]:
data = pd.read_csv('../../data/diam.csv')

In [27]:
data.head()


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,Ideal,E,SI2,61.5,55.0,3.95,3.98,2.43,dewevo
1,0.21,Premium,E,SI1,59.8,61.0,3.89,3.84,2.31,dewevo
2,0.23,Good,E,VS1,56.9,65.0,4.05,4.07,2.31,dewevo
3,0.29,Premium,I,VS2,62.4,58.0,4.2,4.23,2.63,dewevo
4,0.31,Good,J,SI2,63.3,58.0,4.34,4.35,2.75,dewevo


In [28]:
data['target'] = data['target'].map({'dewevo': 0, 'dorogo': 1})

In [29]:
data['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [30]:
data.drop(['clarity'], axis=1, inplace=True)

In [31]:
data['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [32]:
# data['cut'] = data['cut'].map({'Fair': 0, 'Good': 1, 'Very Good': 2, 'Ideal': 3, 'Premium': 4})

In [33]:
data.head()

Unnamed: 0,carat,cut,color,depth,table,x,y,z,target
0,0.23,Ideal,E,61.5,55.0,3.95,3.98,2.43,0
1,0.21,Premium,E,59.8,61.0,3.89,3.84,2.31,0
2,0.23,Good,E,56.9,65.0,4.05,4.07,2.31,0
3,0.29,Premium,I,62.4,58.0,4.2,4.23,2.63,0
4,0.31,Good,J,63.3,58.0,4.34,4.35,2.75,0


In [34]:
data = pd.concat([data, pd.get_dummies(data['cut'], prefix="cut"),pd.get_dummies(data['color'], prefix="color")], axis=1)

In [35]:
data.head()
data.drop(['cut'], axis=1, inplace=True)
data.drop(['color'], axis=1, inplace=True)

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 19 columns):
carat            53940 non-null float64
depth            53940 non-null float64
table            53940 non-null float64
x                53940 non-null float64
y                53940 non-null float64
z                53940 non-null float64
target           53940 non-null int64
cut_Fair         53940 non-null uint8
cut_Good         53940 non-null uint8
cut_Ideal        53940 non-null uint8
cut_Premium      53940 non-null uint8
cut_Very Good    53940 non-null uint8
color_D          53940 non-null uint8
color_E          53940 non-null uint8
color_F          53940 non-null uint8
color_G          53940 non-null uint8
color_H          53940 non-null uint8
color_I          53940 non-null uint8
color_J          53940 non-null uint8
dtypes: float64(6), int64(1), uint8(12)
memory usage: 3.5 MB


In [37]:
data.head()

Unnamed: 0,carat,depth,table,x,y,z,target,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J
0,0.23,61.5,55.0,3.95,3.98,2.43,0,0,0,1,0,0,0,1,0,0,0,0,0
1,0.21,59.8,61.0,3.89,3.84,2.31,0,0,0,0,1,0,0,1,0,0,0,0,0
2,0.23,56.9,65.0,4.05,4.07,2.31,0,0,1,0,0,0,0,1,0,0,0,0,0
3,0.29,62.4,58.0,4.2,4.23,2.63,0,0,0,0,1,0,0,0,0,0,0,1,0
4,0.31,63.3,58.0,4.34,4.35,2.75,0,0,1,0,0,0,0,0,0,0,0,0,1


In [38]:
y = data['target']

In [39]:
X = data.drop('target', axis=1)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=17)

In [41]:
first_tree = DecisionTreeClassifier(random_state=17)

In [42]:
np.mean(cross_val_score(first_tree, X_train, y_train, cv=5))

0.9370729599281626

In [44]:
tree_params = {'max_depth': np.arange(1, 11), 'max_features':[.5, .7, 1]}

In [45]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [46]:
tree_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), 'max_features': [0.5, 0.7, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
tree_grid.best_score_, tree_grid.best_params_

(0.9575983897452196, {'max_depth': 7, 'max_features': 0.7})

In [48]:
tree_test_pred = tree_grid.predict(X_test)

In [49]:
accuracy_score(y_test, tree_test_pred)

0.9596465208256088