### Load the Dataset

In [1]:
import pandas as pd
import numpy as np
import streamlit as st
import re

In [None]:
df = pd.read_csv('Mobiles_Dataset.csv')
# df = pd.read_csv('Cleaned_Mobiles_Dataset.csv')

### Explore the dataset

In [11]:
df.head()

Unnamed: 0,Product Name,Actual price,Discount price,Stars,Rating,Reviews,RAM (GB),Storage (GB),Display Size (inch),Camera,Description,Link
0,"Apple iPhone 15 (Green, 128 GB)","₹79,600","₹65,999",4.6,"44,793 Ratings","2,402 Reviews",NIL,128,6.1,48MP + 12MP,128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,https://www.flipkart.com/apple-iphone-15-green...
1,"Apple iPhone 15 (Blue, 128 GB)","₹79,600","₹65,999",4.6,"44,793 Ratings","2,402 Reviews",NIL,128,6.1,48MP + 12MP,128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,https://www.flipkart.com/apple-iphone-15-blue-...
2,"Apple iPhone 15 (Black, 128 GB)","₹79,600","₹65,999",4.6,"44,793 Ratings","2,402 Reviews",NIL,128,6.1,48MP + 12MP,128 GB ROM15.49 cm (6.1 inch) Super Retina XDR...,https://www.flipkart.com/apple-iphone-15-black...
3,"OnePlus N20 SE (JADE WAVE, 128 GB)","₹19,999","₹11,489",4.0,"1,005 Ratings",41 Reviews,4,128,6.56,50MP,4 GB RAM | 128 GB ROM16.66 cm (6.56 inch) Disp...,https://www.flipkart.com/oneplus-n20-se-jade-w...
4,"OnePlus N20 SE (BLUE OASIS, 64 GB)","₹16,999","₹12,999",4.0,"1,005 Ratings",41 Reviews,4,64,6.56,50MP,4 GB RAM | 64 GB ROM16.66 cm (6.56 inch) Displ...,https://www.flipkart.com/oneplus-n20-se-blue-o...


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984 entries, 0 to 983
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Product Name         984 non-null    object 
 1   Actual price         984 non-null    object 
 2   Discount price       984 non-null    object 
 3   Stars                984 non-null    float64
 4   Rating               984 non-null    object 
 5   Reviews              984 non-null    object 
 6   RAM (GB)             984 non-null    object 
 7   Storage (GB)         984 non-null    object 
 8   Display Size (inch)  984 non-null    float64
 9   Camera               908 non-null    object 
 10  Description          984 non-null    object 
 11  Link                 984 non-null    object 
dtypes: float64(2), object(10)
memory usage: 92.4+ KB


### Preprocess the Dataset

In [14]:
#change the name of the branch "I KALL" to "IKALL" for better brand extraction
df['Product Name'] = df['Product Name'].str.replace('I kall', 'IKall', case=False)

#extract the brandnames and then save it another column
df['Brand'] = df['Product Name'].str.split().str[0]
df['Brand'] = df['Brand'].str.lower()

#drop the decription and links column
df = df.drop('Description', axis=1)
df = df.drop('Link', axis=1)

In [15]:
# Clean 'Actual price' column
df['Actual price'] = (
    df['Actual price']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Actual price'] = pd.to_numeric(df['Actual price'], errors='coerce')

# Clean 'Discount price' column
df['Discount price'] = (
    df['Discount price']
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['Discount price'] = pd.to_numeric(df['Discount price'], errors='coerce')

In [17]:
#drop rows with missing prices
df.dropna(subset=['Actual price', 'Discount price'], inplace=True)

# Convert to integer
df['Actual price'] = df['Actual price'].astype(int)
df['Discount price'] = df['Discount price'].astype(int)


In [18]:
#fix ram and storage columns
df['RAM (GB)'] = df['RAM (GB)'].replace('NIL', np.nan).astype(float)
df['Storage (GB)'] = df['Storage (GB)'].replace('NIL', np.nan).astype(float)


In [19]:
#cleaning the reviws and rating columns
df['Rating'] = df['Rating'].str.replace(' Ratings', '').str.replace(',', '').astype(int)
df['Reviews'] = df['Reviews'].str.replace(' Reviews', '').str.replace(',', '').astype(int)

In [20]:
#workaround to extract primary camera lens from the camera column
def extract_primary_camera(mp_str):
    if pd.isna(mp_str):
        return None
    match = re.search(r'(\d+)', mp_str) #search for numbers
    return int(match.group(1)) if match else None #return the first number if found else none

df['Primary Camera (MP)'] = df['Camera'].apply(extract_primary_camera)

#drop the camera column
df = df.drop('Camera', axis=1)

In [21]:
df.head()
# df.tail()

Unnamed: 0,Product Name,Actual price,Discount price,Stars,Rating,Reviews,RAM (GB),Storage (GB),Display Size (inch),Brand,Primary Camera (MP)
0,"Apple iPhone 15 (Green, 128 GB)",79600,65999,4.6,44793,2402,,128.0,6.1,apple,48.0
1,"Apple iPhone 15 (Blue, 128 GB)",79600,65999,4.6,44793,2402,,128.0,6.1,apple,48.0
2,"Apple iPhone 15 (Black, 128 GB)",79600,65999,4.6,44793,2402,,128.0,6.1,apple,48.0
3,"OnePlus N20 SE (JADE WAVE, 128 GB)",19999,11489,4.0,1005,41,4.0,128.0,6.56,oneplus,50.0
4,"OnePlus N20 SE (BLUE OASIS, 64 GB)",16999,12999,4.0,1005,41,4.0,64.0,6.56,oneplus,50.0


In [22]:
df.describe()


Unnamed: 0,Actual price,Discount price,Stars,Rating,Reviews,RAM (GB),Storage (GB),Display Size (inch),Primary Camera (MP)
count,930.0,930.0,930.0,930.0,930.0,774.0,826.0,930.0,857.0
mean,25263.134409,20023.831183,4.249892,22843.284946,1450.666667,128.608527,179.228814,6.019953,40.693116
std,22257.240721,18090.048296,0.195385,51729.880364,2889.385691,2380.869118,103.619407,1.60595,28.497423
min,1199.0,809.0,3.4,4.0,0.0,2.0,0.0,0.66,0.0
25%,11999.0,8706.75,4.2,697.0,42.25,6.0,128.0,6.5,12.0
50%,19999.0,15858.5,4.2,5560.0,346.0,8.0,128.0,6.67,50.0
75%,30999.0,25826.5,4.4,17356.0,1414.0,8.0,256.0,6.72,50.0
max,149999.0,129999.0,4.7,429459.0,23258.0,46875.0,512.0,7.82,200.0


In [23]:
df.shape

(930, 11)

In [24]:
df.isnull().sum()

Product Name             0
Actual price             0
Discount price           0
Stars                    0
Rating                   0
Reviews                  0
RAM (GB)               156
Storage (GB)           104
Display Size (inch)      0
Brand                    0
Primary Camera (MP)     73
dtype: int64

In [25]:
#fill the null values by the median value of each branch in each category

# Fill RAM based on brand-wise median
df['RAM (GB)'] = df.groupby('Brand')['RAM (GB)'].transform(lambda x: x.fillna(x.median()))

# Fill Storage based on brand-wise median
df['Storage (GB)'] = df.groupby('Brand')['Storage (GB)'].transform(lambda x: x.fillna(x.median()))

# Fill Primary Camera (MP) based on brand-wise median
df['Primary Camera (MP)'] = df.groupby('Brand')['Primary Camera (MP)'].transform(lambda x: x.fillna(x.median()))


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [26]:
df.isnull().sum()

Product Name            0
Actual price            0
Discount price          0
Stars                   0
Rating                  0
Reviews                 0
RAM (GB)               78
Storage (GB)           17
Display Size (inch)     0
Brand                   0
Primary Camera (MP)     0
dtype: int64

In [27]:
df[df['RAM (GB)'].isnull()]['Brand'].value_counts()

Brand
apple        39
kechaoda     20
karbonn       9
blackzone     3
ikall         3
vox           2
jio           2
Name: count, dtype: int64

In [30]:
df[df['Storage (GB)'].isnull()]['Brand'].value_counts()

Brand
karbonn      9
ikall        3
blackzone    3
vox          2
Name: count, dtype: int64

In [33]:
#drop the brands that have Storage as null because the are less significant
#use meadin to fill null values for the brands that have RAM as NULL beacuse they are significant for our data

# Step 1: Identify brands with NULL in Storage (GB)
brands_with_null_storage = df[df['Storage (GB)'].isnull()]['Brand'].unique()

# Step 2: Drop these brands
df = df[~df['Brand'].isin(brands_with_null_storage)]

# Step 3: Fill RAM (GB) nulls with global median
df['RAM (GB)'] = df['RAM (GB)'].fillna(df['RAM (GB)'].median())


In [34]:
df.isnull().sum()

Product Name           0
Actual price           0
Discount price         0
Stars                  0
Rating                 0
Reviews                0
RAM (GB)               0
Storage (GB)           0
Display Size (inch)    0
Brand                  0
Primary Camera (MP)    0
dtype: int64

In [35]:
df.head()

Unnamed: 0,Product Name,Actual price,Discount price,Stars,Rating,Reviews,RAM (GB),Storage (GB),Display Size (inch),Brand,Primary Camera (MP)
0,"Apple iPhone 15 (Green, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0
1,"Apple iPhone 15 (Blue, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0
2,"Apple iPhone 15 (Black, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0
3,"OnePlus N20 SE (JADE WAVE, 128 GB)",19999,11489,4.0,1005,41,4.0,128.0,6.56,oneplus,50.0
4,"OnePlus N20 SE (BLUE OASIS, 64 GB)",16999,12999,4.0,1005,41,4.0,64.0,6.56,oneplus,50.0


In [38]:
df.shape

(913, 11)

### Add a segment column to the dataset

In [3]:
def classify_segment(price):
    if price < 10000:
        return 'Budget'
    elif price <= 30000:
        return 'Mid-Range'
    else:
        return 'Flagship'

df['Segment'] = df['Actual price'].apply(classify_segment)


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Product Name,Actual price,Discount price,Stars,Rating,Reviews,RAM (GB),Storage (GB),Display Size (inch),Brand,Primary Camera (MP),Segment
0,0,"Apple iPhone 15 (Green, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0,Flagship
1,1,"Apple iPhone 15 (Blue, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0,Flagship
2,2,"Apple iPhone 15 (Black, 128 GB)",79600,65999,4.6,44793,2402,8.0,128.0,6.1,apple,48.0,Flagship
3,3,"OnePlus N20 SE (JADE WAVE, 128 GB)",19999,11489,4.0,1005,41,4.0,128.0,6.56,oneplus,50.0,Mid-Range
4,4,"OnePlus N20 SE (BLUE OASIS, 64 GB)",16999,12999,4.0,1005,41,4.0,64.0,6.56,oneplus,50.0,Mid-Range


### Save the Dataset

In [5]:
df.to_csv('Cleaned_Mobiles_Dataset.csv')