# Data Wrangling for addiction data set

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer


# Relative path of the raw CSV that is loaded with pd.read_csv() below.
filename = "mobile_addiction.csv"

In [2]:
# Column names of the raw CSV, in file order, grouped by role.
headers = (
    ["Unnamed: 0"]  # ID column (dropped before analysis)
    + [  # usage / demographic columns
        "daily_screen_time",
        "app_sessions",
        "social_media_usage",
        "gaming_time",
        "notifications",
        "night_usage",
        "age",
        "work_study_hours",
        "stress_level",
        "apps_installed",
    ]
    + ["addicted"]  # label column ('addicted' / 'not addicted')
)


In [3]:
# Load the data with pandas' read_csv(). No `names=` override is passed, so
# the column names come from the file's own header row.
# `original` keeps the frame exactly as loaded; `df` is the working copy.
original = pd.read_csv(filename)
df = original.copy(deep=True)

In [4]:
# Drop the ID column, as it is not needed for analysis.
# The working frame is rebuilt from `original` instead of being modified in
# place, so re-running this cell does not raise a KeyError for a column that
# has already been removed.
df = original.drop(columns=["Unnamed: 0"])

# To see what the data set looks like, show the first five rows with head().
df.head()

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
0,2,29,0,0,49,0,44,5,3,35,not addicted
1,6,29,1,2,65,1,29,5,9,21,addicted
2,9,28,2,0,57,3,28,7,5,39,addicted
3,6,39,2,0,69,1,28,6,8,24,addicted
4,5,37,3,1,64,2,27,4,5,26,addicted


In [5]:
print(df.shape)
print(df.size)
print(df.dtypes)

# Split the columns into numeric features (`num_cols`) and everything else
# (`cat_cols`). The label is excluded from the numeric list by name rather
# than by dtype alone: a later cell overwrites it with float codes, and a
# purely dtype-based split would then move it into `num_cols` (so it would be
# standardized) and leave `cat_cols` empty if this cell were re-run.
label_col = "addicted"
num_cols = [
    col
    for col in df.select_dtypes(include=[np.number]).columns
    if col != label_col
]
cat_cols = [col for col in df.columns if col not in num_cols]


(13589, 11)
149479
daily_screen_time      int64
app_sessions           int64
social_media_usage     int64
gaming_time            int64
notifications          int64
night_usage            int64
age                    int64
work_study_hours       int64
stress_level           int64
apps_installed         int64
addicted              object
dtype: object


In [6]:
# Encode the label with an explicit category order:
# 'not addicted' -> 0.0, 'addicted' -> 1.0.
ordinal_encoder = OrdinalEncoder(categories=[['not addicted', 'addicted']])

# Fit on the untouched text labels in `original`, not on `df`: once
# df['addicted'] has been overwritten with float codes, fitting on it again
# would fail because 0.0 / 1.0 are not among the declared categories.
# `df` has the same rows as `original`, so positional assignment lines up.
Category_encoded = ordinal_encoder.fit_transform(original[['addicted']])

# fit_transform returns one column per encoded feature; flatten the single
# column to 1-D before storing it in the frame.
df['addicted'] = np.ravel(Category_encoded)
df.head()

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
0,2,29,0,0,49,0,44,5,3,35,0.0
1,6,29,1,2,65,1,29,5,9,21,1.0
2,9,28,2,0,57,3,28,7,5,39,1.0
3,6,39,2,0,69,1,28,6,8,24,1.0
4,5,37,3,1,64,2,27,4,5,26,1.0


## **Preprocess Data**

The data is already mostly preprocessed, so we only perform imputation and standardization (not normalization).
Note that the transformed data is stored in variables other than `df` (`data_num_scaled`, `data_cat_imputed`). This is because we will change the original data in the pipelining portion.

In [7]:
# Count the missing entries in every column before any imputation is applied.
print("Missing values (should be 0):\n", df.isna().sum())

Missing values (should be 0):
 daily_screen_time     0
app_sessions          0
social_media_usage    0
gaming_time           0
notifications         0
night_usage           0
age                   0
work_study_hours      0
stress_level          0
apps_installed        0
addicted              0
dtype: int64


In [8]:

# Separate the working frame into its numeric and categorical parts.
data_num = df.loc[:, num_cols]
data_cat = df.loc[:, cat_cols]

# One imputer per part: median for the numeric columns, most frequent value
# for the categorical columns.
imputer_num = SimpleImputer(strategy="median")
imputer_cat = SimpleImputer(strategy="most_frequent")

# Learn the fill values from each part, then apply them to that same part.
data_num_imputed = imputer_num.fit(data_num).transform(data_num)
data_cat_imputed = imputer_cat.fit(data_cat).transform(data_cat)


In [9]:
# Standardize the imputed numeric columns (zero mean, unit variance).
# StandardScaler is already imported in the notebook's import cell, so it is
# not imported a second time here.
scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data_num_imputed)


In [10]:
# Wrap the scaled / imputed arrays back into labelled DataFrames.
df_num_scaled = pd.DataFrame(data=data_num_scaled, columns=num_cols)
df_cat_imputed = pd.DataFrame(data=data_cat_imputed, columns=cat_cols)

# Put the numeric and categorical parts side by side, then restore the same
# column order as `df`.
df_final = pd.concat(
    objs=[df_num_scaled, df_cat_imputed],
    axis="columns",
).loc[:, df.columns]

In [11]:

# Summary statistics of the processed frame. The standardized numeric columns
# are expected to show a mean of ~0 and a standard deviation of ~1.
df_final.describe()

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
count,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0
mean,-9.411855e-17,-1.691519e-16,5.647113e-17,7.790924000000001e-17,1.192168e-16,-5.647113e-17,4.3921990000000006e-17,9.856303e-17,1.4509940000000002e-17,-2.284989e-16,0.50379
std,1.000037,1.000037,1.000037,1.000037,1.000037,1.000037,1.000037,1.000037,1.000037,1.000037,0.500004
min,-1.986138,-2.975516,-1.283179,-1.041367,-2.750269,-1.046105,-1.784553,-2.886124,-1.863898,-2.974269,0.0
25%,-0.9319584,-0.680177,-0.4531563,-1.041367,-0.7866498,-1.046105,-0.7961937,-0.472198,-0.5529212,-0.7688509,0.0
50%,0.1222209,-0.005077283,-0.4531563,-0.03445536,-0.079747,0.01033977,-0.005505838,0.01058724,-0.1159289,-0.09026065,1.0
75%,0.6493105,0.6700225,0.3768668,0.9724562,0.7842453,1.066785,0.785182,0.4933725,0.7580558,0.5883296,1.0
max,4.338938,4.045521,4.526982,5.000102,3.376222,5.292564,2.168886,3.872869,2.506025,4.659871,1.0


In [12]:
# Check that no missing values remain, then write the processed frame to CSV
# without the index column.
print("Missing values per column:\n", df_final.isna().sum())
df_final.to_csv(
    path_or_buf="addiction after scaling and imputation.csv",
    index=False,
)

Missing values per column:
 daily_screen_time     0
app_sessions          0
social_media_usage    0
gaming_time           0
notifications         0
night_usage           0
age                   0
work_study_hours      0
stress_level          0
apps_installed        0
addicted              0
dtype: int64


<hr>
<p>Copyright &copy; 2018 IBM Developer Skills Network. This notebook and its source code are released under the terms of the <a href="https://cognitiveclass.ai/mit-license/">MIT License</a>.</p>