# Data Wrangling for addiction data set

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer


filename = "mobile_addiction.csv"

In [10]:
# Python list headers containing name of headers
headers = [
    "Unnamed: 0",
    "daily_screen_time",
    "app_sessions",
    "social_media_usage",
    "gaming_time",
    "notifications",
    "night_usage",
    "age",
    "work_study_hours",
    "stress_level",
    "apps_installed",
    "addicted"
]


In [11]:
# Pandas method read_csv() to load the data
# Set the parameter "names"

original = pd.read_csv(filename)
df = original.copy()

In [13]:
# To see what the data set looks like, we'll use the head() method.


  # the first five rows of the dataframe.
df.drop(columns=["Unnamed: 0"], inplace=True)
df.head()
# Drop the ID column as it is not needed for analysis

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
0,2,29,0,0,49,0,44,5,3,35,not addicted
1,6,29,1,2,65,1,29,5,9,21,addicted
2,9,28,2,0,57,3,28,7,5,39,addicted
3,6,39,2,0,69,1,28,6,8,24,addicted
4,5,37,3,1,64,2,27,4,5,26,addicted


In [14]:
print(df.shape)
print(df.size)
print(df.dtypes)


(13589, 11)
149479
daily_screen_time      int64
app_sessions           int64
social_media_usage     int64
gaming_time            int64
notifications          int64
night_usage            int64
age                    int64
work_study_hours       int64
stress_level           int64
apps_installed         int64
addicted              object
dtype: object


In [15]:
ordinal_encoder = OrdinalEncoder(categories=[['not addicted', 'addicted']])
Category_encoded = ordinal_encoder.fit_transform(df[['addicted']])
df['addicted'] = Category_encoded
df.head()

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
0,2,29,0,0,49,0,44,5,3,35,0.0
1,6,29,1,2,65,1,29,5,9,21,1.0
2,9,28,2,0,57,3,28,7,5,39,1.0
3,6,39,2,0,69,1,28,6,8,24,1.0
4,5,37,3,1,64,2,27,4,5,26,1.0


## **Preprocess Data**

Data is already mostly preprocessed. Just Preform some imputation and Standardization (Not Normalization)
Notice that the transformed data is actually stored in variables other than data (data_num_scaled, data_cat_scaled). This is because we will change the original data in the pipelining portion

In [16]:
print("Missing values (should be 0):\n", df.isnull().sum())

Missing values (should be 0):
 daily_screen_time     0
app_sessions          0
social_media_usage    0
gaming_time           0
notifications         0
night_usage           0
age                   0
work_study_hours      0
stress_level          0
apps_installed        0
addicted              0
dtype: int64


In [23]:

num_cols = df.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='median')
data_imputed = imputer.fit_transform(df[num_cols])
data_num_imputed = pd.DataFrame(data_imputed, columns=num_cols)


In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data_num_imputed)


In [20]:
df_final = pd.DataFrame(data_imputed, columns=num_cols)



In [21]:

df_final.describe()

Unnamed: 0,daily_screen_time,app_sessions,social_media_usage,gaming_time,notifications,night_usage,age,work_study_hours,stress_level,apps_installed,addicted
count,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0,13589.0
mean,3.768121,30.037604,1.545956,1.034219,60.015306,0.990213,33.055707,5.97807,4.265288,27.532048,0.50379
std,1.89728,7.406585,1.20483,0.993172,12.732064,0.946606,10.118145,2.07139,2.288454,5.89479,0.500004
min,0.0,8.0,0.0,0.0,25.0,0.0,15.0,0.0,0.0,10.0,0.0
25%,2.0,25.0,1.0,0.0,50.0,0.0,25.0,5.0,3.0,23.0,0.0
50%,4.0,30.0,1.0,1.0,59.0,1.0,33.0,6.0,4.0,27.0,1.0
75%,5.0,35.0,2.0,2.0,70.0,2.0,41.0,7.0,6.0,31.0,1.0
max,12.0,60.0,7.0,6.0,103.0,6.0,55.0,14.0,10.0,55.0,1.0


In [27]:
print("Missing values per column:\n", df_final.isnull().sum())
df_final.to_csv("addiction after scaling and imputation.csv", index=False)

Missing values per column:
 daily_screen_time     0
app_sessions          0
social_media_usage    0
gaming_time           0
notifications         0
night_usage           0
age                   0
work_study_hours      0
stress_level          0
apps_installed        0
addicted              0
dtype: int64


<hr>
<p>Copyright &copy; 2018 IBM Developer Skills Network. This notebook and its source code are released under the terms of the <a href="https://cognitiveclass.ai/mit-license/">MIT License</a>.</p>