# Model Training Notebook

### Load the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Read the prepared dataset from its relative path and show it as the cell output.
data = pd.read_csv(filepath_or_buffer='../dataset/Final_data.csv')
data

Unnamed: 0,Price,ppi,cpu core,cpu freq,internal mem,RearCam,Front_Cam,battery,thickness,Performance_score,Camera_score,weight_Heavy,weight_Light,weight_Medium
0,1950,187,4.0,1.300,8.0,8.0,2.0,2000,6.4,5.2,6.20,0.0,1.0,0.0
1,2276,294,8.0,1.500,16.0,13.0,5.0,2300,7.8,12.0,10.60,0.0,1.0,0.0
2,2975,534,8.0,1.600,32.0,16.0,8.0,3000,7.0,12.8,13.60,0.0,0.0,1.0
3,1921,184,4.0,1.300,8.0,13.0,8.0,2580,8.0,5.2,11.50,1.0,0.0,0.0
4,1916,312,4.0,1.200,8.0,13.0,5.0,2000,7.6,4.8,10.60,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,2714,401,8.0,1.350,16.0,13.0,5.0,2300,5.1,10.8,10.60,0.0,0.0,1.0
79,3658,428,8.0,2.450,64.0,12.0,8.0,3350,7.5,19.6,10.80,1.0,0.0,0.0
80,3211,534,4.0,1.975,64.0,20.0,8.0,3400,7.9,7.9,16.40,1.0,0.0,0.0
81,2001,258,4.0,1.200,16.0,8.0,1.0,3400,10.2,4.8,5.90,0.0,0.0,1.0


## First we need to split the data

In [2]:
# Target: the phone price we want to predict.
y = data['Price']

# Features: every column except the target. 'Unnamed: 0' (visible in the preview
# of `data`) looks like a row index written out by an earlier export; a row
# number says nothing about a phone, so it is excluded from the features too.
# errors='ignore' makes this a no-op when the CSV does not contain that column
# ('Price' is already guaranteed to exist by the lookup on the line above).
x = data.drop(columns=['Price', 'Unnamed: 0'], errors='ignore')

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Now that we have split the data, let's scale it

In [4]:
# Learn the scaling statistics from the training rows only, then apply that same
# fitted transform to both splits.
scaler = StandardScaler().fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Our data is now clean, encoded, scaled, and ready for model training!

### We will use linear regression for now; later we can try a random forest

In [5]:
# Linear-regression baseline, fitted on the scaled training features.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

### Let's evaluate on the training set

In [6]:
# Predict on the same rows the model was fitted on and measure the squared error.
y_train_pred = regressor.predict(X=x_train)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
mse

46423.52451704118

In [7]:
# R^2 of the training predictions, also shown scaled to a percentage.
# Note: R^2 is a goodness-of-fit measure, not a classification accuracy; the
# printed label is kept unchanged.
r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
accuracy_percentage = 100 * r2
print(
    f"R^2 Score: {r2}",
    f"Accuracy in percentage: {accuracy_percentage:.2f}%",
    sep="\n",
)

R^2 Score: 0.9258153074212236
Accuracy in percentage: 92.58%


## 92.58% R2, WOW!

#### Let's try a cross-validation score

In [8]:
# 5-fold cross-validation on the training split, one R^2 score per fold.
scores = cross_val_score(
    estimator=regressor,
    X=x_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
scores

array([0.9254312 , 0.86475606, 0.94875163, 0.81348304, 0.68379282])

# Now let's test on the test set

In [9]:
# NOTE(review): cross_val_score refits clones of `regressor` on folds of whatever
# data it receives, so this call trains and scores on slices of the test split
# only; it does not evaluate the model fitted earlier. The held-out check of the
# fitted model is the predict + r2_score pair in the cells that follow.
scores = cross_val_score(
    estimator=regressor,
    X=x_test,
    y=y_test,
    scoring='r2',
    cv=5,
)
scores

array([-16.20402607,  -1.74856744,  -1.59667145, -20.46248538,
        -3.82768592])

In [10]:
# Predictions of the fitted model for the held-out test features.
y_predicted = regressor.predict(X=x_test)

In [11]:
# R^2 of the test predictions, also shown scaled to a percentage.
# Note: R^2 is a goodness-of-fit measure, not a classification accuracy; the
# printed label is kept unchanged.
r2 = r2_score(y_true=y_test, y_pred=y_predicted)
accuracy_percentage = 100 * r2
print(
    f"R^2 Score: {r2}",
    f"Accuracy in percentage: {accuracy_percentage:.2f}%",
    sep="\n",
)

R^2 Score: 0.8770313288935857
Accuracy in percentage: 87.70%


# That’s actually pretty solid! 🔥

## 92.58% on training

## 87.70% on testing

### That means:

    The model is learning well from the training data ✅

    And generalizing decently to unseen data ✅

    There's no major overfitting (only about a 5-point drop between train and test R²)

# Let's save the model

In [13]:
import joblib

# Persist the fitted regressor under a relative file name.
joblib.dump(value=regressor, filename='phone_price_model.pkl')

['phone_price_model.pkl']

In [14]:
# Persist the fitted scaler as well, so deployment can scale its inputs the same way.
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']