In [None]:
import pandas as pd
import seaborn as sea
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [None]:
# Load the diamonds price dataset into a DataFrame.
# NOTE(review): the absolute path looks like a Colab upload location;
# adjust it when running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/DiamondsPrice.csv")
# Last expression of the cell: display (rows, columns).
df.shape

(53940, 10)

In [None]:
# Drop outliers in a single pass: a row is kept only when every condition
# below holds (all bounds are exclusive).
df = df[
    (df["depth"] > 50) & (df["depth"] < 75)
    & (df["table"] > 45) & (df["table"] < 80)
    & (df["x"] > 3)
    & (df["y"] < 20)
    & (df["z"] < 10)
]
# Display the shape after filtering.
df.shape

(53920, 10)

In [None]:
# Convert the categorical columns to integer codes with scikit-learn's
# LabelEncoder.
# The same encoder object is re-fitted for every column, so once the loop
# finishes `le` only remembers the classes of the last column ('clarity').
le = LabelEncoder()
for categorical_col in ['cut', 'color', 'clarity']:
    df[categorical_col] = le.fit_transform(df[categorical_col])

In [None]:
# Target: the 'price' column. Features: every remaining column.
y = df['price']
x = df.drop('price', axis=1)

In [None]:
#_____Split 1: train / test with 25% of the rows held out for testing_____
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Standardize the features.
# The scaler is fitted on the training split only and the SAME fitted
# mean/std are then applied to the test split. Re-fitting on x_test (as
# fit_transform would do) scales the test data with its own statistics,
# which leaks test information and no longer matches the feature space the
# model was trained on.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Candidate hyper-parameters explored by the grid search.
param_grid = {
    'max_depth': [None, 5, 10, 15],    # maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # min samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}

# Base estimator; the seed is fixed so results are repeatable.
dt = DecisionTreeRegressor(random_state=42)

# Exhaustive search over param_grid (no cv/scoring arguments are passed, so
# the library defaults apply), fitted on the training split.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid)
grid_search.fit(x_train, y_train)

# Best estimator found by the search, used to predict the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Regression metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report the results, one metric per line.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):  432700.12040352373
Root Mean Squared Error (RMSE):  657.7994530276867
R-squared (R2):  0.9724766773091167
Mean Absolute Error (MAE):  344.083729053589


In [None]:
#_____Split 2: train / test with 30% of the rows held out for testing_____
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Standardize the features.
# Fit the scaler on the training split only, then apply those same fitted
# statistics to the test split. Calling fit_transform on x_test would
# re-fit the scaler on test data, leaking test information and scaling the
# two splits inconsistently.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Candidate hyper-parameters explored by the grid search.
param_grid = {
    'max_depth': [None, 5, 10, 15],    # maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # min samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}

# Base estimator; the seed is fixed so results are repeatable.
dt = DecisionTreeRegressor(random_state=42)

# Exhaustive search over param_grid (library-default cv and scoring),
# fitted on the training split of this 70/30 partition.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid)
grid_search.fit(x_train, y_train)

# Best estimator found by the search, used to predict the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Regression metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report the results, one metric per line.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):  430773.81542468694
Root Mean Squared Error (RMSE):  656.3336159489982
R-squared (R2):  0.9724437853992809
Mean Absolute Error (MAE):  345.8219058042708


In [None]:
#_____Split 3: train / test with 20% of the rows held out for testing_____
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)

In [None]:
# Standardize the features.
# The scaler learns mean/std from the training split only; the test split is
# transformed with those same statistics. Using fit_transform on x_test
# would re-fit on test data, causing leakage and a train/test scaling
# mismatch.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Candidate hyper-parameters explored by the grid search.
param_grid = {
    'max_depth': [None, 5, 10, 15],    # maximum depth of the tree
    'min_samples_split': [2, 5, 10],   # min samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4],     # min samples required at a leaf node
}

# Base estimator; the seed is fixed so results are repeatable.
dt = DecisionTreeRegressor(random_state=42)

# Exhaustive search over param_grid (library-default cv and scoring),
# fitted on the training split of this 80/20 partition.
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid)
grid_search.fit(x_train, y_train)

# Best estimator found by the search, used to predict the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Regression metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Report the results, one metric per line.
for metric_label, metric_value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(metric_label, metric_value)

Mean Squared Error (MSE):  451573.89202596666
Root Mean Squared Error (RMSE):  671.9924791439013
R-squared (R2):  0.9718143966779575
Mean Absolute Error (MAE):  332.6424861478683


In [None]:
# Train the final DecisionTreeRegressor on the entire dataset.
# The hyper-parameters chosen by the most recent grid search are reused here
# instead of the library defaults; otherwise the tuning above is discarded
# and the exported model is not the configuration that was evaluated.
# The model is fitted on the unscaled `x`: standardization is a per-feature
# monotonic rescaling, which does not change which splits a decision tree
# can make, so the saved model can be fed raw (label-encoded) features.
dt = DecisionTreeRegressor(random_state=42, **grid_search.best_params_)
dt.fit(x, y)

In [None]:
# Serialize the fitted model to a .pkl file with pickle (default protocol).
# Only load this file from trusted sources: unpickling can execute code.
with open('Decision Tree Regressor.pkl', mode='wb') as model_file:
    pickle.dump(dt, model_file, protocol=pickle.DEFAULT_PROTOCOL)