In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [4]:
# Load the base report workbook into a DataFrame. The path is relative, so it
# resolves against the notebook's current working directory.
data = pd.read_excel(
    io=r"FANCY BASE REPORT/lattest_Base_Report WITHOUT NULL VALUES.xlsx",
)

In [5]:
# Inspect the loaded frame; the notebook renders a truncated preview of its
# first/last rows and columns.
data

Unnamed: 0,Shape,From Size,Clarity,Color,Cut,Fluor,Current,9_2,5_2,29_1,...,19th,19th_pcs,20th,20th_pcs,Avg5,Avg10,Avg25,Avg35,Avg50,TOTAL_PCS
0,CUSHION,0.7,VVS1,D,VGOOD,NONE,66.0,66.0,67.0,67.0,...,0.0,0,0.0,0,71.6,69.9,64.08,62.33,62.33,27
1,CUSHION,0.7,VVS1,D,VGOOD,FAINT,73.5,73.5,73.5,73.5,...,0.0,0,0.0,0,74.4,74.0,72.44,70.74,66.40,48
2,CUSHION,0.7,VVS1,D,VGOOD,MEDIUM,77.5,77.5,78.5,78.5,...,32.0,1,0.0,2,75.4,75.2,74.16,73.26,71.40,67
3,CUSHION,0.7,VVS1,D,VGOOD,STRONG,81.5,81.5,82.5,82.5,...,47.0,1,32.0,1,75.4,75.2,74.40,73.69,72.02,72
4,CUSHION,0.7,VVS1,D,GOOD,NONE,69.0,69.0,69.0,69.0,...,0.0,0,0.0,0,71.6,69.9,64.08,62.33,62.33,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27692,RADIANT,2.7,SI2,H,VGOOD,STRONG,61.0,61.0,61.0,58.0,...,0.0,0,0.0,0,30.6,25.1,21.15,21.15,21.15,13
27693,RADIANT,2.7,SI2,I,VGOOD,NONE,46.0,46.0,46.0,46.0,...,0.0,0,0.0,0,34.0,28.5,19.45,19.45,19.45,20
27694,RADIANT,2.7,SI2,I,VGOOD,FAINT,50.0,50.0,50.0,50.0,...,0.0,0,0.0,0,34.0,28.5,19.29,19.29,19.29,21
27695,RADIANT,2.7,SI2,I,VGOOD,MEDIUM,57.0,57.0,57.0,55.0,...,0.0,0,0.0,0,39.0,31.5,20.73,20.73,20.73,22


In [6]:
# Summarise every column's name, non-null count and dtype to sanity-check the load.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27697 entries, 0 to 27696
Data columns (total 60 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Shape      27697 non-null  object 
 1   From Size  27697 non-null  float64
 2   Clarity    27697 non-null  object 
 3   Color      27697 non-null  object 
 4   Cut        27697 non-null  object 
 5   Fluor      27697 non-null  object 
 6   Current    27697 non-null  float64
 7   9_2        27697 non-null  float64
 8   5_2        27697 non-null  float64
 9   29_1       27697 non-null  float64
 10  22_1       27697 non-null  float64
 11  15_1       27697 non-null  float64
 12  8_1        27697 non-null  float64
 13  1_1        27697 non-null  float64
 14  1st        27697 non-null  float64
 15  1st_pcs    27697 non-null  int64  
 16  2nd        27697 non-null  float64
 17  2nd_pcs    27697 non-null  int64  
 18  3rd        27697 non-null  float64
 19  3rd_pcs    27697 non-null  int64  
 20  4th   

In [7]:
# Gather the names of all object-dtype columns; these are the ones passed to
# the label-encoding helper further down.
# NOTE(review): the displayed result lists 'Fluor ' with a trailing space, so
# the workbook header appears to carry stray whitespace -- confirm.
categorical_columns = list(data.select_dtypes(include="object").columns)
categorical_columns

['Shape', 'Clarity', 'Color', 'Cut', 'Fluor ']

In [8]:
def label_encode_categorical_columns(data_frame, categorical_columns):
    """Return a copy of ``data_frame`` with the given columns label-encoded.

    Each listed column is replaced by the integer codes produced by a
    ``LabelEncoder`` fitted on that column alone. The input frame is left
    untouched, so the raw categorical values remain available to the caller
    and the calling cell can be re-run without changing its result.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame that contains the columns to encode.
    categorical_columns : iterable of str
        Names of the columns to encode; all other columns are passed through
        unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``data_frame`` in which every
        listed column holds the encoder's integer codes.

    Raises
    ------
    KeyError
        If a name in ``categorical_columns`` is not a column of ``data_frame``.
    """
    # Work on a copy: assigning into the caller's frame would silently turn
    # the raw data into its encoded form (the returned frame and the argument
    # would be the very same object).
    encoded = data_frame.copy()
    for column in categorical_columns:
        # A fresh encoder per column makes it explicit that no fitted state
        # is shared between columns.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

In [9]:
# Label-encode the object-dtype columns collected above; `df` is the frame
# used for modelling from here on.
df = label_encode_categorical_columns(
    data_frame=data,
    categorical_columns=categorical_columns,
)

In [10]:
# Preview the encoded frame: the categorical columns now hold integer codes
# instead of strings.
df

Unnamed: 0,Shape,From Size,Clarity,Color,Cut,Fluor,Current,9_2,5_2,29_1,...,19th,19th_pcs,20th,20th_pcs,Avg5,Avg10,Avg25,Avg35,Avg50,TOTAL_PCS
0,0,0.7,4,0,1,2,66.0,66.0,67.0,67.0,...,0.0,0,0.0,0,71.6,69.9,64.08,62.33,62.33,27
1,0,0.7,4,0,1,0,73.5,73.5,73.5,73.5,...,0.0,0,0.0,0,74.4,74.0,72.44,70.74,66.40,48
2,0,0.7,4,0,1,1,77.5,77.5,78.5,78.5,...,32.0,1,0.0,2,75.4,75.2,74.16,73.26,71.40,67
3,0,0.7,4,0,1,3,81.5,81.5,82.5,82.5,...,47.0,1,32.0,1,75.4,75.2,74.40,73.69,72.02,72
4,0,0.7,4,0,0,2,69.0,69.0,69.0,69.0,...,0.0,0,0.0,0,71.6,69.9,64.08,62.33,62.33,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27692,6,2.7,1,4,1,3,61.0,61.0,61.0,58.0,...,0.0,0,0.0,0,30.6,25.1,21.15,21.15,21.15,13
27693,6,2.7,1,5,1,2,46.0,46.0,46.0,46.0,...,0.0,0,0.0,0,34.0,28.5,19.45,19.45,19.45,20
27694,6,2.7,1,5,1,0,50.0,50.0,50.0,50.0,...,0.0,0,0.0,0,34.0,28.5,19.29,19.29,19.29,21
27695,6,2.7,1,5,1,1,57.0,57.0,57.0,55.0,...,0.0,0,0.0,0,39.0,31.5,20.73,20.73,20.73,22


In [11]:
# Separate the regression target ('Current') from the feature matrix (every
# other column).
# NOTE(review): in the previewed rows the 9_2 / 5_2 / 29_1 columns are nearly
# identical to Current -- confirm they are genuinely known at prediction time
# and are not leaking the target into the features.
y = df['Current']
X = df.drop(columns=['Current'])

In [12]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=108,
)

In [13]:
# Report the dimensions of each split as a quick check that features and
# targets line up row-for-row.
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Shape of {label}:", part.shape)

Shape of X_train: (19387, 59)
Shape of y_train: (19387,)
Shape of X_test: (8310, 59)
Shape of y_test: (8310,)


In [20]:
# Two-step pipeline: standardise the features, then fit a linear regression.
# The author left two alternative final steps as commented-out options:
#   ('random_forest', RandomForestRegressor())
#   ('decision_tree', DecisionTreeRegressor())
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('linear_regression', LinearRegression()),
    ]
)

# Kept as the cell's last expression so the fitted pipeline is displayed.
pipeline.fit(X_train, y_train)

In [21]:
# Predict the target for the held-out test features with the fitted pipeline.
predictions = pipeline.predict(X_test)

In [29]:
# Score the held-out predictions against the true targets using mean squared
# error (lower is better; expressed in squared units of 'Current').
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.9492574071382441


In [23]:
# Peek at the training target: the 'Current' values for the training rows,
# indexed by their original row labels.
y_train

8481     40.0
3837     55.0
12393    40.0
20065    47.5
18103    61.0
         ... 
26740    71.0
8786     43.0
13678    63.0
10699    44.0
8337     39.0
Name: Current, Length: 19387, dtype: float64

In [24]:
# Show the predicted values for the test rows (rendered as a NumPy array).
predictions

array([42.62520261, 38.08740601, 74.48257528, ..., 58.33290594,
       55.95525226, 62.05385203])