In [1]:
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
df = sns.load_dataset("diamonds")

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [7]:
df.shape

(53940, 10)

In [10]:
df_small = df.iloc[:2000,:]

In [11]:
df_small.shape

(2000, 10)

In [13]:
df_small.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.711885,61.783,57.5896,2626.338,5.67144,5.67005,3.50339
std,0.185737,1.644842,2.36414,786.994922,0.57645,0.56495,0.36325
min,0.2,53.0,51.0,326.0,3.79,3.75,2.27
25%,0.7,61.0,56.0,2804.0,5.66,5.68,3.48
50%,0.72,61.8,57.0,2878.0,5.78,5.79,3.56
75%,0.8,62.6,59.0,2982.0,5.95,5.95,3.6725
max,1.5,69.5,70.0,3099.0,7.26,7.09,4.7


In [15]:
import numpy as np
df_small.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,2000,2000,2000
unique,5,7,8
top,Ideal,E,SI1
freq,731,482,543


In [16]:
X = df_small.drop("cut", axis=1)
y = df_small["cut"]

In [20]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
   X, y, test_size=0.2, random_state=42
)

In [21]:
# Define categorical and numerical features
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

numerical_features = X.select_dtypes( include=["float64", "int64"]).columns.tolist()

In [22]:
categorical_features

[]

In [23]:
preprocessor = ColumnTransformer(transformers=[("cat", OneHotEncoder(), categorical_features),("num", StandardScaler(), numerical_features),])

In [24]:
pipeline = Pipeline(
   [
       ("preprocessor", preprocessor),
       ("classifier", GradientBoostingClassifier(random_state=42)),
   ]
)

In [25]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Generate classification report
report = classification_report(y_test, y_pred)

In [26]:
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print("\nClassification Report:")
print(report)

Mean Cross-Validation Accuracy: 0.7569

Classification Report:
              precision    recall  f1-score   support

        Fair       0.95      0.88      0.91        24
        Good       0.59      0.54      0.57        35
       Ideal       0.75      0.89      0.81       134
     Premium       0.74      0.72      0.73       101
   Very Good       0.67      0.57      0.62       106

    accuracy                           0.73       400
   macro avg       0.74      0.72      0.73       400
weighted avg       0.73      0.73      0.72       400

