In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


In [7]:
# Set path to CSV and read in CSV
csv_path = Path('Resources/diamonds.csv')
df=pd.read_csv(csv_path)
df = df.drop(columns='Unnamed: 0')
cut_num = {
    'Ideal': 1,
    'Premium': 2,
    'Good': 3,
    'Very Good': 4,
    'Fair': 5,
}
df["cut_num"] = df["cut"].apply(lambda x: cut_num[x])
df['p_bin'] = pd.qcut(df['price'], q=10, precision=0)

le = preprocessing.LabelEncoder()
le.fit(df['p_bin'])
df['p_bin_num'] = le.transform(df['p_bin'])
df.drop(['cut','price','p_bin'], axis=1, inplace=True)
df_enc = pd.get_dummies(df, columns=['color','clarity'])

X = df_enc.drop(columns=['p_bin_num'])
y = df_enc.p_bin_num

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7)
scaler = StandardScaler()

X_scaler = scaler.fit(X)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


# Decision Tree

model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)


# Random Forrest Classifier

model2 = RandomForestClassifier(n_estimators=500, random_state=7)
model2 = model2.fit(X_train_scaled, y_train)
predictions2 = model2.predict(X_test_scaled)


# Gradient Boost

model3 = GradientBoostingClassifier(    n_estimators=500,
    learning_rate=0.5,
    max_features=5,
    max_depth=3,
    random_state=7)

model3 = model3.fit(X_train_scaled, y_train)
predictions3 = model3.predict(X_test_scaled)

In [8]:
# Calculate the ROC curve and AUC for the testing set for Tree
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, predictions)

auc_test = auc(fpr_test, tpr_test)
auc_test = round(auc_test, 4)


fpr_test_rf, tpr_test_rf, thresholds_test_rf = roc_curve(y_test, predictions2)

auc_test_rf = auc(fpr_test_rf, tpr_test_rf)
auc_test_rf = round(auc_test_rf, 4)



fpr_test_gb, tpr_test_gb, thresholds_test_gb = roc_curve(y_test, predictions3)

auc_test_gb = auc(fpr_test_gb, tpr_test_gb)
auc_test_gb = round(auc_test_gb, 4)



# Create a DataFrame with the fpr and tpr results
roc_df_tree = pd.DataFrame({"FPR Train": fpr_train, "TPR Train": tpr_train,})

roc_df_rf = pd.DataFrame({"FPR Test": fpr_test_rf, "TPR Test": tpr_test,})
roc_df_gb = pd.DataFrame({"FPR Test": fpr_test_gb, "TPR Test": tpr_test,})




# Plotting the ROC Curves
roc_df_tree.plot(
    x="FPR Train",
    y="TPR Train",
    xlim=([-0.05, 1.05]),
    title=f"Train ROC Curve (AUC={auc_test})",
)

roc_df_rf.plot(
    x="FPR Test",
    y="TPR Test",
    color="red",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve (AUC={auc_test_rf})",
)

roc_df_gb.plot(
    x="FPR Test",
    y="TPR Test",
    color="red",
    style="--",
    xlim=([-0.05, 1.05]),
    title=f"Test ROC Curve (AUC={auc_test_gb})",
)

ValueError: multiclass format is not supported