<a href="https://colab.research.google.com/github/amriikk/smartphone-teens/blob/main/HW_3_Data_Prep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import io
import os
import pandas as pd
import numpy as np
from google.colab import files

# Location in the Colab runtime where the dataset CSV is expected.
file_path = "/content/teen_phone_addiction_dataset.csv"

# Only fall back to an interactive upload when the CSV is not already on disk.
if not os.path.exists(file_path):
    print("File not found. Please upload the dataset.")
    uploaded_file = files.upload()
    # The first uploaded entry is treated as the dataset; its value is the raw
    # file content, wrapped in BytesIO so read_csv can consume it.
    file_name = list(uploaded_file)[0]
    df = pd.read_csv(io.BytesIO(uploaded_file[file_name]))
else:
    print("File found. Loading dataset...")
    df = pd.read_csv(file_path)

# Rich (HTML) rendering of the freshly loaded frame.
display(df)

# Missingness: report whether at least one cell anywhere in the frame is null.
if df.isna().any().any():
    missing_msg = "\nMissing values present"
else:
    missing_msg = "\nNo missing values found"
print(missing_msg)

# Cleaning the Dataset
# Columns removed before feature engineering; everything not listed here is kept.
col_list_to_drop = [
    'Name', 'Age', 'Gender', 'Location', 'School_Grade',
    'Academic_Performance', 'Social_Interactions', 'Exercise_Hours',
    'Anxiety_Level', 'Depression_Level', 'Self_Esteem', 'Parental_Control',
    'Screen_Time_Before_Bed', 'Time_on_Education', 'Phone_Usage_Purpose',
    'Family_Communication', 'Weekend_Usage_Hours',
]

# One vectorised drop instead of a per-column in-place loop.
# errors="ignore" skips labels that are not present, so the cell can be re-run
# on an already-cleaned frame without raising.
df = df.drop(columns=col_list_to_drop, errors="ignore")

# Cardinality: number of distinct non-null values per column, largest first.
cardinality = df.nunique(dropna=True).sort_values(ascending=False)
print(f"\nCardinality:\n{cardinality}")

# Checking outliers: min/max of every numeric column, laid out as one row per
# column so implausible ranges are easy to spot.
numeric_summary = df.select_dtypes(include="number").describe()
min_max_by_column = numeric_summary.loc[["min", "max"]].T
outlier = min_max_by_column.reset_index().rename(columns={"index": "column"})
print(f"\nChecking outlying values:\n{outlier}")

# Features
target = 'Addiction_Level'
new_cols = ['Usage_to_Sleep_Ratio', 'Checks_per_App', 'Entertainment_Ratio']

# Scores strictly above this value are labelled as high addiction.
HIGH_ADDICTION_THRESHOLD = 7


def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, yielding NaN where the denominator is 0.

    Plain pandas division does not raise on a zero denominator; it returns
    +/-inf (or NaN for 0/0). Masking the zeros first turns every undefined
    ratio into an explicit NaN. Rows with a non-zero denominator are unaffected.
    """
    return numerator / denominator.where(denominator != 0)


# Daily_Usage_Hours has a minimum of 0.0 in this dataset (see the min/max
# table printed above), so Entertainment_Ratio is undefined for those rows.
# They become NaN here and must be dropped or imputed before modelling.
df['Usage_to_Sleep_Ratio'] = _safe_ratio(df['Daily_Usage_Hours'], df['Sleep_Hours'])
df['Checks_per_App'] = _safe_ratio(df['Phone_Checks_Per_Day'], df['Apps_Used_Daily'])
df['Entertainment_Ratio'] = _safe_ratio(
    df['Time_on_Social_Media'] + df['Time_on_Gaming'],
    df['Daily_Usage_Hours'],
)

# Binary label derived from the target: 1 when the score exceeds the threshold.
df["High_Addiction"] = np.where(df[target] > HIGH_ADDICTION_THRESHOLD, 1, 0)

print("\nUpdated Dataframe:")
display(df)

File found. Loading dataset...


Unnamed: 0,ID,Name,Age,Gender,Location,School_Grade,Daily_Usage_Hours,Sleep_Hours,Academic_Performance,Social_Interactions,...,Screen_Time_Before_Bed,Phone_Checks_Per_Day,Apps_Used_Daily,Time_on_Social_Media,Time_on_Gaming,Time_on_Education,Phone_Usage_Purpose,Family_Communication,Weekend_Usage_Hours,Addiction_Level
0,1,Shannon Francis,13,Female,Hansonfort,9th,4.0,6.1,78,5,...,1.4,86,19,3.6,1.7,1.2,Browsing,4,8.7,10.0
1,2,Scott Rodriguez,17,Female,Theodorefort,7th,5.5,6.5,70,5,...,0.9,96,9,1.1,4.0,1.8,Browsing,2,5.3,10.0
2,3,Adrian Knox,13,Other,Lindseystad,11th,5.8,5.5,93,8,...,0.5,137,8,0.3,1.5,0.4,Education,6,5.7,9.2
3,4,Brittany Hamilton,18,Female,West Anthony,12th,3.1,3.9,78,8,...,1.4,128,7,3.1,1.6,0.8,Social Media,8,3.0,9.8
4,5,Steven Smith,14,Other,Port Lindsaystad,9th,2.5,6.7,56,4,...,1.0,96,20,2.6,0.9,1.1,Gaming,10,3.7,8.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,Jesus Yates,16,Female,New Jennifer,12th,3.9,6.4,53,4,...,0.3,80,15,2.7,1.8,1.0,Other,8,9.4,9.8
2996,2997,Bethany Murray,13,Female,Richardport,8th,3.6,7.3,93,5,...,0.9,45,8,3.1,0.0,0.3,Gaming,9,5.2,5.5
2997,2998,Norman Hughes,14,Other,Rebeccaton,7th,3.2,6.5,98,1,...,0.2,51,13,2.4,0.2,2.4,Social Media,9,5.9,6.2
2998,2999,Barbara Hinton,17,Female,Ramirezmouth,9th,6.7,7.5,67,3,...,1.6,125,17,1.7,2.6,1.5,Browsing,4,6.1,10.0



No missing values found

Cardinality:
ID                      3000
Phone_Checks_Per_Day     131
Daily_Usage_Hours        107
Addiction_Level           80
Sleep_Hours               71
Time_on_Social_Media      51
Time_on_Gaming            41
Apps_Used_Daily           16
dtype: int64

Checking outlying values:
                 column   min     max
0                    ID   1.0  3000.0
1     Daily_Usage_Hours   0.0    11.5
2           Sleep_Hours   3.0    10.0
3  Phone_Checks_Per_Day  20.0   150.0
4       Apps_Used_Daily   5.0    20.0
5  Time_on_Social_Media   0.0     5.0
6        Time_on_Gaming   0.0     4.0
7       Addiction_Level   1.0    10.0

Updated Dataframe:


Unnamed: 0,ID,Daily_Usage_Hours,Sleep_Hours,Phone_Checks_Per_Day,Apps_Used_Daily,Time_on_Social_Media,Time_on_Gaming,Addiction_Level,Usage_to_Sleep_Ratio,Checks_per_App,Entertainment_Ratio,High_Addiction
0,1,4.0,6.1,86,19,3.6,1.7,10.0,0.655738,4.526316,1.325000,1
1,2,5.5,6.5,96,9,1.1,4.0,10.0,0.846154,10.666667,0.927273,1
2,3,5.8,5.5,137,8,0.3,1.5,9.2,1.054545,17.125000,0.310345,1
3,4,3.1,3.9,128,7,3.1,1.6,9.8,0.794872,18.285714,1.516129,1
4,5,2.5,6.7,96,20,2.6,0.9,8.6,0.373134,4.800000,1.400000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2996,3.9,6.4,80,15,2.7,1.8,9.8,0.609375,5.333333,1.153846,1
2996,2997,3.6,7.3,45,8,3.1,0.0,5.5,0.493151,5.625000,0.861111,0
2997,2998,3.2,6.5,51,13,2.4,0.2,6.2,0.492308,3.923077,0.812500,0
2998,2999,6.7,7.5,125,17,1.7,2.6,10.0,0.893333,7.352941,0.641791,1
