In [None]:
import pandas as pd
import seaborn as sea
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pickle

In [None]:
# Read the diamonds price CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute path; adjust it when the file lives elsewhere.
csv_path = "/content/DiamondsPrice.csv"
df = pd.read_csv(csv_path)

In [None]:
# Drop outlier rows: keep a row only when every measurement below is in range.
in_range = (
    (df["depth"] > 50) & (df["depth"] < 75)      # depth strictly between 50 and 75
    & (df["table"] > 45) & (df["table"] < 80)    # table strictly between 45 and 80
    & (df["x"] > 3)                              # x above 3
    & (df["y"] < 20)                             # y below 20
    & (df["z"] < 10)                             # z below 10
)
df = df[in_range]

# Show (rows, columns) remaining after the filter.
df.shape

(53920, 10)

In [None]:
# Convert the categorical columns into integer codes with scikit-learn's LabelEncoder.
# A separate encoder is fitted for each column and stored in `encoders`, so every
# column's category -> code mapping (encoders[col].classes_) remains available for
# decoding predictions' inputs or for encoding new rows the same way. Refitting a
# single shared encoder would keep only the mapping of the last column fitted.
# NOTE(review): LabelEncoder numbers categories in sorted label order, which need
# not match any quality ranking of the categories.
encoders = {}
for col in ['cut', 'color', 'clarity']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Separate the prediction target from the predictors.
y = df['price']                  # target: the price column
x = df.drop('price', axis=1)     # features: every column except price

In [None]:
#_____Split 1 train and test_____
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

# Standardized features
# The scaler learns its mean and standard deviation from the training rows only,
# and that same transformation is then applied to the test rows. Calling
# fit_transform on the test set would re-fit the scaler on test data, so the two
# sets would be scaled with different statistics.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Build a LinearRegression model and fit it on the training split
lr = LinearRegression().fit(x_train, y_train)

# Predict prices for the test split
y_pred = lr.predict(x_test)

# Error metrics comparing the predictions with the true test targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print one metric per line
for label, value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(label, value)

Mean Squared Error (MSE):  1759295.4757905495
Root Mean Squared Error (RMSE):  1326.3843620122145
R-squared (R2):  0.8880941908598553
Mean Absolute Error (MAE):  849.9727299410443


In [None]:
#_____Split 2 train and test_____
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Standardized features
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to the test rows. Using fit_transform on the test set would
# re-fit the scaler on test data and scale the two sets inconsistently.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Build a LinearRegression model and train it on the training split
lr = LinearRegression().fit(x_train, y_train)

# Predict on the test set
y_pred = lr.predict(x_test)

# Error metrics comparing the predictions with the true test targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print one metric per line
for label, value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(label, value)

Mean Squared Error (MSE):  1757322.069024805
Root Mean Squared Error (RMSE):  1325.6402487193895
R-squared (R2):  0.8875856834313703
Mean Absolute Error (MAE):  854.3123027914417


In [None]:
#_____Split 3 train and test_____
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardized features
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to the test rows. Using fit_transform on the test set would
# re-fit the scaler on test data and scale the two sets inconsistently.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Build a LinearRegression model and train it on the training split
lr = LinearRegression().fit(x_train, y_train)

# Predict on the test set
y_pred = lr.predict(x_test)

# Error metrics comparing the predictions with the true test targets
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print one metric per line
for label, value in (
    ("Mean Squared Error (MSE): ", mse),
    ("Root Mean Squared Error (RMSE): ", rmse),
    ("R-squared (R2): ", r2),
    ("Mean Absolute Error (MAE): ", mae),
):
    print(label, value)

Mean Squared Error (MSE):  1818028.5105662188
Root Mean Squared Error (RMSE):  1348.3428757427462
R-squared (R2):  0.8865252590288444
Mean Absolute Error (MAE):  856.0018763047972


In [None]:
# Fit a fresh LinearRegression on all rows (x, y) rather than on a train split.
# Note: x is the feature frame as assigned from df; the standardized arrays
# were stored only in x_train / x_test, so this fit uses unscaled features.
lr = LinearRegression().fit(x, y)
lr

In [None]:
# Serialize the fitted model `lr` with pickle into 'Linear Regression.pkl'.
# The file is opened in binary write mode and closed automatically by `with`.
with open('Linear Regression.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)