In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Step 2: Data Preparation and Cleaning
# Read the CSV file into `df`, the frame every later cell works from, and
# report its (rows, columns) shape as a quick sanity check on the load.
csv_path = 'phone_usage_india.csv'
df = pd.read_csv(csv_path)
print(df.shape)

(17686, 16)


In [3]:
# Explore the dataset: column names, non-null counts and dtypes
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after the report.
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17686 entries, 0 to 17685
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   User ID                       17686 non-null  object 
 1   Age                           17686 non-null  int64  
 2   Gender                        17686 non-null  object 
 3   Location                      17686 non-null  object 
 4   Phone Brand                   17686 non-null  object 
 5   OS                            17686 non-null  object 
 6   Screen Time (hrs/day)         17686 non-null  float64
 7   Data Usage (GB/month)         17686 non-null  float64
 8   Calls Duration (mins/day)     17686 non-null  float64
 9   Number of Apps Installed      17686 non-null  int64  
 10  Social Media Time (hrs/day)   17686 non-null  float64
 11  E-commerce Spend (INR/month)  17686 non-null  int64  
 12  Streaming Time (hrs/day)      17686 non-null  

In [7]:
# Report the frame's dimensions, then the number of missing values per column.
missing_per_column = df.isna().sum()
print("\nNumber of rows and columns:", df.shape)
print(missing_per_column)


Number of rows and columns: (17686, 16)
User ID                         0
Age                             0
Gender                          0
Location                        0
Phone Brand                     0
OS                              0
Screen Time (hrs/day)           0
Data Usage (GB/month)           0
Calls Duration (mins/day)       0
Number of Apps Installed        0
Social Media Time (hrs/day)     0
E-commerce Spend (INR/month)    0
Streaming Time (hrs/day)        0
Gaming Time (hrs/day)           0
Monthly Recharge Cost (INR)     0
Primary Use                     0
dtype: int64


In [8]:
# Standardize the int64/float64 columns of `df` and build `df_final`, which
# holds the untouched non-numeric columns followed by the scaled numeric ones.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
df_numeric = df[numeric_columns]

# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the numeric data. The result is a plain NumPy array, so
# the column names and row labels are lost at this point.
scaled_values = scaler.fit_transform(df_numeric)

# Convert the result back to a DataFrame. Re-attach df's own index: the
# concat below aligns rows by index label, so a fresh 0..n-1 index would
# misalign rows (and introduce NaNs) whenever df's index is not the default.
df_scaled = pd.DataFrame(scaled_values, columns=numeric_columns, index=df.index)

# Merge back with the original non-numeric columns (non-numeric first,
# scaled numeric columns after).
df_final = pd.concat([df.drop(columns=numeric_columns), df_scaled], axis=1)

In [9]:
# Preview the first ten rows of the combined frame as plain text.
first_ten_rows = df_final.head(10)
print(first_ten_rows)

  User ID  Gender   Location Phone Brand       OS    Primary Use       Age  \
0  U00001    Male     Mumbai        Vivo  Android      Education  1.155788   
1  U00002   Other      Delhi      Realme      iOS         Gaming  1.680609   
2  U00003  Female  Ahmedabad       Nokia  Android  Entertainment -0.043804   
3  U00004    Male       Pune     Samsung  Android  Entertainment -0.418676   
4  U00005    Male     Mumbai      Xiaomi      iOS   Social Media -1.618267   
5  U00006    Male     Jaipur        Oppo      iOS  Entertainment -1.243395   
6  U00007  Female    Lucknow       Apple      iOS   Social Media  1.455686   
7  U00008   Other    Kolkata      Realme      iOS  Entertainment  1.380711   
8  U00009  Female    Kolkata        Oppo  Android  Entertainment  0.630967   
9  U00010   Other    Kolkata       Apple      iOS      Education  0.481018   

   Screen Time (hrs/day)  Data Usage (GB/month)  Calls Duration (mins/day)  \
0              -0.897178              -0.107016                

In [10]:

# Write the combined frame to CSV without the row index, then display
# summary statistics of the original `df` as the cell's output.
output_csv = 'new_phoneusage_data.csv'
df_final.to_csv(output_csv, index=False)
summary_stats = df.describe()
summary_stats

Unnamed: 0,Age,Screen Time (hrs/day),Data Usage (GB/month),Calls Duration (mins/day),Number of Apps Installed,Social Media Time (hrs/day),E-commerce Spend (INR/month),Streaming Time (hrs/day),Gaming Time (hrs/day),Monthly Recharge Cost (INR)
count,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0
mean,37.584247,6.546376,25.411257,151.405846,104.584869,3.252369,5075.707848,4.250616,2.490874,1042.785367
std,13.338252,3.172677,14.122167,84.923353,55.217097,1.590223,2871.604841,2.155683,1.446003,552.502067
min,15.0,1.0,1.0,5.0,10.0,0.5,100.0,0.5,0.0,100.0
25%,26.0,3.8,13.2,77.325,57.0,1.9,2587.5,2.4,1.2,561.0
50%,38.0,6.6,25.3,150.6,104.0,3.2,5052.0,4.2,2.5,1040.0
75%,49.0,9.3,37.6,223.9,152.0,4.6,7606.0,6.1,3.7,1521.75
max,60.0,12.0,50.0,300.0,200.0,6.0,10000.0,8.0,5.0,2000.0


In [11]:
# Show the last five rows of the original frame.
df.tail(5)

Unnamed: 0,User ID,Age,Gender,Location,Phone Brand,OS,Screen Time (hrs/day),Data Usage (GB/month),Calls Duration (mins/day),Number of Apps Installed,Social Media Time (hrs/day),E-commerce Spend (INR/month),Streaming Time (hrs/day),Gaming Time (hrs/day),Monthly Recharge Cost (INR),Primary Use
17681,U17682,27,Other,Mumbai,Apple,iOS,7.2,36.6,90.4,81,5.1,4007,3.8,3.3,1380,Entertainment
17682,U17683,40,Female,Chennai,Oppo,iOS,9.5,12.9,243.0,166,4.3,8550,3.4,0.7,222,Education
17683,U17684,34,Female,Ahmedabad,Realme,Android,1.1,48.9,74.7,70,5.3,5516,0.7,4.0,1612,Entertainment
17684,U17685,22,Male,Hyderabad,Vivo,Android,8.8,25.6,105.6,96,2.4,3614,6.6,0.8,1528,Work
17685,U17686,43,Other,Kolkata,Oppo,iOS,5.6,22.5,215.2,78,5.1,5332,0.8,3.6,1098,Gaming
