In [40]:
## Import all necessary libraries here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sweetviz
import plotly.express as px
import warnings
from sklearn import preprocessing
warnings.filterwarnings("ignore")

In [41]:
#upload the dataset into pandas dataframe
dataset = pd.read_csv("data/mobile_prices_2023.csv",
                    names=["phone_name","phone_rating","number_ratings","ram","rom","back_camera_specs","front_camera_specs","battery","processor","price_inr","date_scraped"],
                    skiprows=1,
                    thousands=',',
                    parse_dates=["date_scraped"])
#printing the initial dataset
dataset.head(5)

Unnamed: 0,phone_name,phone_rating,number_ratings,ram,rom,back_camera_specs,front_camera_specs,battery,processor,price_inr,date_scraped
0,"POCO C50 (Royal Blue, 32 GB)",4.2,33561,2 GB RAM,32 GB ROM,8MP Dual Camera,5MP Front Camera,5000 mAh,"Mediatek Helio A22 Processor, Upto 2.0 GHz Pro...","₹5,649",2023-06-17
1,"POCO M4 5G (Cool Blue, 64 GB)",4.2,77128,4 GB RAM,64 GB ROM,50MP + 2MP,8MP Front Camera,5000 mAh,Mediatek Dimensity 700 Processor,"₹11,999",2023-06-17
2,"POCO C51 (Royal Blue, 64 GB)",4.3,15175,4 GB RAM,64 GB ROM,8MP Dual Rear Camera,5MP Front Camera,5000 mAh,Helio G36 Processor,"₹6,999",2023-06-17
3,"POCO C55 (Cool Blue, 64 GB)",4.2,22621,4 GB RAM,64 GB ROM,50MP Dual Rear Camera,5MP Front Camera,5000 mAh,Mediatek Helio G85 Processor,"₹7,749",2023-06-17
4,"POCO C51 (Power Black, 64 GB)",4.3,15175,4 GB RAM,64 GB ROM,8MP Dual Rear Camera,5MP Front Camera,5000 mAh,Helio G36 Processor,"₹6,999",2023-06-17


The structure of the data and formats seems fine. So we can look at some statistics of the dataset.

In [42]:
print("This dataset has ",dataset.shape[0],"rows and ",dataset.shape[1],"columns.")
print("The columns in the dataset has the following datatypes: ")
print(dataset.dtypes)

This dataset has  1836 rows and  11 columns.
The columns in the dataset has the following datatypes: 
phone_name                    object
phone_rating                 float64
number_ratings                 int64
ram                           object
rom                           object
back_camera_specs             object
front_camera_specs            object
battery                       object
processor                     object
price_inr                     object
date_scraped          datetime64[ns]
dtype: object


In [43]:
#We need to replace the rupee(₹) sign in price_inr column.

dataset["price_inr"] = dataset["price_inr"].str.replace("₹","")
dataset["price_inr"] = dataset["price_inr"].str.replace(",","")
dataset["price_inr"] = dataset["price_inr"].astype("double")

In [44]:
#We also need to update the Ram column to a numerical column to be used in regression. Let's replace the 'GB RAM' string in the dataset
print("before replacement")
dataset["ram_cleaned"] = dataset["ram"]
print(dataset["ram_cleaned"].unique())
str_replace_list = ["GB","MB","KB","TB","ROM","RAM","Expandable","Upto","Display","cm","Display","inch","(1.5 )"]
for ls in str_replace_list:
    dataset["ram_cleaned"] = dataset["ram_cleaned"].str.replace(ls,"")
dataset["ram_cleaned"] = dataset["ram_cleaned"].str.replace("NA","0")
print("after replacement")
dataset["ram_cleaned"] = dataset["ram_cleaned"].str.strip()
dataset["ram_cleaned"] = dataset["ram_cleaned"].replace('',np.nan).fillna(0)
print(dataset["ram_cleaned"].unique())
dataset["ram_cleaned"] = dataset["ram_cleaned"].astype("double")


before replacement
['2 GB RAM' '4 GB RAM' '6 GB RAM' '3 GB RAM' '8 GB RAM' '12 GB RAM'
 '128 GB ROM' '64 GB ROM' '256 GB ROM' '512 GB ROM' '16 GB ROM'
 '32 GB ROM' '8 GB ROM' '1 TB ROM' 'NA ROM' 'Expandable Upto 16 GB'
 '8 MB RAM' '0 MB ROM' '4 MB RAM' '0.53 RAM' '153 MB RAM' 'NA MB ROM'
 '1 GB RAM' '16 MB RAM' '3.81 cm (1.5 inch) Display' 'cm Display'
 '32 MB RAM' '128 MB RAM' '16 MB ROM' '48 MB RAM' '0.046875 GB RAM'
 '32 MB ROM' '64 MB RAM' '8 MB ROM' '768 MB RAM' '1 MB RAM' '16 GB RAM'
 '512 MB RAM' '1.5 GB RAM' '1 MB ROM' '256 MB RAM' '60 MB ROM'
 'Expandable Upto 2 GB' '2 MB ROM' '80 MB ROM' '50 MB ROM']
after replacement
['2' '4' '6' '3' '8' '12' '128' '64' '256' '512' '16' '32' '1' '0' '0.53'
 '153' '3.81' 0 '48' '0.046875' '768' '1.5' '60' '80' '50']


In [45]:
#We also need to update the Ram column to a numerical column to be used in regression. Let's replace the 'GB RAM' string in the dataset
print("before replacement")
dataset["rom_cleaned"] = dataset["rom"]
print(dataset["rom_cleaned"].unique())
for ls in str_replace_list:
    dataset["rom_cleaned"] = dataset["rom_cleaned"].str.replace(ls,"")
dataset["rom_cleaned"] = dataset["rom_cleaned"].str.replace("NA","0")
print("after replacement")
dataset["rom_cleaned"] = dataset["rom_cleaned"].str.strip()
dataset["rom_cleaned"] = dataset["rom_cleaned"].replace('',np.nan).fillna(0)
print(dataset["rom_cleaned"].unique())
dataset["rom_cleaned"] = dataset["rom_cleaned"].astype("double")


before replacement
['32 GB ROM' '64 GB ROM' '128 GB ROM' '256 GB ROM' nan '0 GB ROM'
 'Expandable Upto 16 GB' '4 MB ROM' 'NA KB ROM' '153 MB ROM' '512 GB ROM'
 '8 GB ROM' '2 MB ROM' '2.27 MB ROM' '16 GB ROM' '32 MB ROM' '16 MB ROM'
 '48 MB ROM' 'Expandable Upto 32 GB' '128 MB ROM' '0.125 GB ROM'
 '10 MB ROM' 'NA ROM' '4 GB ROM' '20 MB ROM' '100 KB ROM' '100 MB ROM'
 'Expandable Upto 8 GB']
after replacement
['32' '64' '128' '256' 0 '0' '16' '4' '153' '512' '8' '2' '2.27' '48'
 '0.125' '10' '20' '100']


In [46]:
print(dataset["battery"].unique())
print("Replacing the mAh from column 'battery'")
battery_replace_str = ['A15',' A13', 'A14', 'A16', 'A12','A13', 'A9', 'Apple','mAh','MediaTek ','Brand ','Unisoc ']
for bat_ls in battery_replace_str:
    dataset["battery"] = dataset["battery"].str.replace(bat_ls,"")
dataset["battery"] = dataset["battery"].str.strip()
dataset["battery"] = dataset["battery"].replace('',np.nan).fillna(0)
dataset["battery"] = dataset["battery"].astype("int")
print(dataset["battery"].unique())

['5000 mAh' '4500 mAh' '4000 mAh' '5160 mAh' '6000 mAh' '5065 mAh'
 '4200 mAh' '4300 mAh' '4230 mAh' 'A15 mAh' 'A13 mAh' 'A14 mAh' 'A16 mAh'
 'A12 mAh' 'A9 mAh' 'Apple mAh' '1 mAh' '800 mAh' '0 mAh' '3300 mAh'
 '3900 mAh' '7000 mAh' '3700 mAh' '4700 mAh' '3000 mAh' '1500 mAh'
 '1000 mAh' nan '2000 mAh' '2600 mAh' '2050 mAh' '4800 mAh' '4015 mAh'
 '4025 mAh' '3400 mAh' '3765 mAh' '4350 mAh' '4310 mAh' '3200 mAh'
 '4020 mAh' '4410 mAh' '4270 mAh' '4926 mAh' '2915 mAh' '4600 mAh'
 '4030 mAh' '4050 mAh' '4810 mAh' '4830 mAh' '4870 mAh' '4450 mAh'
 '3315 mAh' '3225 mAh' '3260 mAh' '4250 mAh' '4520 mAh' '3080 mAh'
 '3120 mAh' '5018 mAh' '4100 mAh' '5020 mAh' '4850 mAh' '3030 mAh'
 '3010 mAh' '4820 mAh' '4780 mAh' '5300 mAh' '1020 mAh' 'MediaTek mAh'
 'Brand mAh' '1200 mAh' '1450 mAh' '1150 mAh' '5050 mAh' '3060 mAh'
 'Unisoc mAh' '4950 mAh' '12 mAh' '2150 mAh' '950 mAh' '1430 mAh'
 '1110 mAh' '2 mAh' '4400 mAh' '4610 mAh' '4115 mAh' '3800 mAh' '4085 mAh'
 '2100 mAh' '3340 mAh' '3450 mAh' '23

In [47]:
dataset["phone_name"] = dataset["phone_name"].str.lower()
dataset[['phone_model', 'phone_specs']] = dataset['phone_name'].str.split('(', n=1, expand=True)
dataset[["phone_brand","misc"]] = dataset["phone_model"].str.split(" ",n=1,expand=True)

The above data cleaning should be enough to get started with the basic EDA for our Phone Price Prediciton model.    

In [48]:
final_dataset = dataset[["phone_brand","battery","ram_cleaned","rom_cleaned","phone_rating","number_ratings","price_inr","date_scraped"]]

### EDA

In [49]:
print(final_dataset["date_scraped"].unique()[0])
print("All the products were scraped on the same date of 17th June 2023.")

2023-06-17 00:00:00
All the products were scraped on the same date of 17th June 2023.


In [50]:
px.box(final_dataset,x="price_inr")

The data is heavily right-skewed with a minimum of ~1k to maximum of ~169k with a median value of ~16k. This distribution will be relative to the brand so the below plot shows the price distribution considering all brands.

In [51]:
px.box(final_dataset,x="price_inr",color="phone_brand")

The above chart clearly depicts the vast distribution skewness in the data across brands. It will be meaningful to convert the 'brand' column into a integer value using value encoding.

In [52]:
label_encoder = preprocessing.LabelEncoder()
final_dataset["encoded_phone_brand"] =  label_encoder.fit_transform(final_dataset["phone_brand"])

In [53]:
px.box(final_dataset,x="phone_rating",color="phone_brand")

In [54]:
px.box(final_dataset,x="number_ratings",color="phone_brand")

In [55]:
px.scatter(final_dataset,x="rom_cleaned",y="price_inr")

In [56]:
px.scatter(final_dataset,x="ram_cleaned",y="price_inr")

In [57]:
px.scatter(final_dataset,x="number_ratings",y="price_inr")

In [58]:
px.scatter_matrix(final_dataset,dimensions=["battery","ram_cleaned","rom_cleaned","phone_rating","number_ratings","price_inr","encoded_phone_brand"])

The EDA shows us a pattern connecting the dependent variables like ram, rom with the final price, which is also intuitive. 

### Data Transformation

In [59]:
columns_list = ["battery","ram_cleaned","rom_cleaned","phone_rating","number_ratings","encoded_phone_brand","price_inr"]
final_dataset = final_dataset[columns_list]

In [60]:
final_dataset.corr()

Unnamed: 0,battery,ram_cleaned,rom_cleaned,phone_rating,number_ratings,encoded_phone_brand,price_inr
battery,1.0,-0.522885,0.481313,-0.014308,-0.018485,0.504275,-0.365732
ram_cleaned,-0.522885,1.0,-0.268442,0.091704,-0.000346,-0.452596,0.570556
rom_cleaned,0.481313,-0.268442,1.0,-0.01911,-0.216688,0.37477,0.138898
phone_rating,-0.014308,0.091704,-0.01911,1.0,0.144971,-0.00133,0.164694
number_ratings,-0.018485,-0.000346,-0.216688,0.144971,1.0,0.037489,-0.107356
encoded_phone_brand,0.504275,-0.452596,0.37477,-0.00133,0.037489,1.0,-0.333853
price_inr,-0.365732,0.570556,0.138898,0.164694,-0.107356,-0.333853,1.0


The variables are not highly correleated.

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_dataset[["battery","ram_cleaned","rom_cleaned","phone_rating","number_ratings","encoded_phone_brand"]], final_dataset["price_inr"],train_size=0.8,random_state=73)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1468, 6)
(1468,)
(368, 6)
(368,)


In [62]:

from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler()
scaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train),columns=['battery','ram_cleaned','rom_cleaned','phone_rating','number_ratings','encoded_phone_brand'])
scaled_X_test = pd.DataFrame(X_scaler.fit_transform(X_test),columns=['battery','ram_cleaned','rom_cleaned','phone_rating','number_ratings','encoded_phone_brand'])
Y_scaler = MinMaxScaler()
scaled_Y_train = pd.DataFrame(Y_scaler.fit_transform(y_train.to_numpy().reshape(-1,1)),columns=["price_inr"])
scaled_Y_test = pd.DataFrame(Y_scaler.fit_transform(y_test.to_numpy().reshape(-1,1)),columns=["price_inr"])


### Model Fitting: Linear Regression

In [63]:
import statsmodels.api as sm

In [64]:
X_train_sm = sm.add_constant(scaled_X_train)

In [65]:
LinReg = sm.OLS(scaled_Y_train,X_train_sm).fit()

In [66]:
LinReg.params

const                  0.029129
battery               -0.117247
ram_cleaned            0.559832
rom_cleaned            0.423551
phone_rating           0.122526
number_ratings        -0.055857
encoded_phone_brand   -0.063980
dtype: float64

In [67]:
LinReg.summary()

0,1,2,3
Dep. Variable:,price_inr,R-squared:,0.529
Model:,OLS,Adj. R-squared:,0.527
Method:,Least Squares,F-statistic:,273.4
Date:,"Sat, 05 Aug 2023",Prob (F-statistic):,1.19e-234
Time:,22:39:45,Log-Likelihood:,1319.8
No. Observations:,1468,AIC:,-2626.0
Df Residuals:,1461,BIC:,-2589.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0291,0.021,1.369,0.171,-0.013,0.071
battery,-0.1172,0.014,-8.533,0.000,-0.144,-0.090
ram_cleaned,0.5598,0.022,25.178,0.000,0.516,0.603
rom_cleaned,0.4236,0.022,19.584,0.000,0.381,0.466
phone_rating,0.1225,0.023,5.297,0.000,0.077,0.168
number_ratings,-0.0559,0.036,-1.562,0.118,-0.126,0.014
encoded_phone_brand,-0.0640,0.011,-5.885,0.000,-0.085,-0.043

0,1,2,3
Omnibus:,985.535,Durbin-Watson:,2.088
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29003.992
Skew:,2.684,Prob(JB):,0.0
Kurtosis:,24.103,Cond. No.,23.0


In [68]:
# Add a constant to X_test
X_test_sm = sm.add_constant(scaled_X_test)
# Predict the y values corresponding to X_test_sm
y_pred = LinReg.predict(X_test_sm)
y_pred_actual = Y_scaler.inverse_transform(y_pred.to_numpy().reshape(-1,1)).flatten().astype("int")

In [70]:
from sklearn.metrics import r2_score,mean_squared_error
print(r2_score(scaled_Y_test,y_pred))
print(mean_squared_error(scaled_Y_test,y_pred))
      

0.43286567173621215
0.012352823702062076
