In [1]:
# necessary libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

In [2]:
# Load the dataset from the CSV file located in the working directory.
data = pd.read_csv('teen_phone_addiction_dataset.csv')

# Show the first 10 rows as plain text to get a feel for the columns.
first_rows = data.head(n=10)
print(first_rows)

   ID               Name  Age  Gender            Location School_Grade  \
0   1    Shannon Francis   13  Female          Hansonfort          9th   
1   2    Scott Rodriguez   17  Female        Theodorefort          7th   
2   3        Adrian Knox   13   Other         Lindseystad         11th   
3   4  Brittany Hamilton   18  Female        West Anthony         12th   
4   5       Steven Smith   14   Other    Port Lindsaystad          9th   
5   6         Mary Adams   13  Female  East Angelachester         10th   
6   7       Hailey Moses   16    Male       North Jeffrey         11th   
7   8  Veronica Marshall   13   Other        Jenniferport         10th   
8   9       Edward Avila   13    Male             Leebury          8th   
9  10       James Carter   18   Other         Prestonview         11th   

   Daily_Usage_Hours  Sleep_Hours  Academic_Performance  Social_Interactions  \
0                4.0          6.1                    78                    5   
1                5.5     

In [3]:
# Count missing values per column (isna is the pandas alias of isnull).
missing_per_column = data.isna().sum()
print(missing_per_column)

ID                        0
Name                      0
Age                       0
Gender                    0
Location                  0
School_Grade              0
Daily_Usage_Hours         0
Sleep_Hours               0
Academic_Performance      0
Social_Interactions       0
Exercise_Hours            0
Anxiety_Level             0
Depression_Level          0
Self_Esteem               0
Parental_Control          0
Screen_Time_Before_Bed    0
Phone_Checks_Per_Day      0
Apps_Used_Daily           0
Time_on_Social_Media      0
Time_on_Gaming            0
Time_on_Education         0
Phone_Usage_Purpose       0
Family_Communication      0
Weekend_Usage_Hours       0
Addiction_Level           0
dtype: int64


In [4]:
# Dataset overview: dimensions, column labels, dtypes / non-null counts, and summary statistics.
print('number of row and column: ')
print(data.shape)         # (rows, columns) tuple
print('name of columns: ')
print(data.columns)       # Index holding the column labels
print('data info: ')
data.info()               # info() writes its own report and returns None, so wrapping it in print() would emit a stray "None"
print('statistical analysis: ')
print(data.describe())    # statistical summary of the numerical variables

number of row and column: 
(3000, 25)
name of columns: 
Index(['ID', 'Name', 'Age', 'Gender', 'Location', 'School_Grade',
       'Daily_Usage_Hours', 'Sleep_Hours', 'Academic_Performance',
       'Social_Interactions', 'Exercise_Hours', 'Anxiety_Level',
       'Depression_Level', 'Self_Esteem', 'Parental_Control',
       'Screen_Time_Before_Bed', 'Phone_Checks_Per_Day', 'Apps_Used_Daily',
       'Time_on_Social_Media', 'Time_on_Gaming', 'Time_on_Education',
       'Phone_Usage_Purpose', 'Family_Communication', 'Weekend_Usage_Hours',
       'Addiction_Level'],
      dtype='object')
data info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      3000 non-null   int64  
 1   Name                    3000 non-null   object 
 2   Age                     3000 non-null   int64  
 3   Gender                 

In [5]:
# import libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Drop the ID, Name and Location columns (columns, not rows) before modelling;
# drop() returns a new frame, so `data` itself is left untouched.
columns_to_drop = ['ID', 'Name', 'Location']
data_cleaned = data.drop(columns=columns_to_drop)

In [7]:
# Separate the target from the features.
target_column = 'Addiction_Level'
y = data_cleaned[target_column]                  # dependent variable (addiction level)
x = data_cleaned.drop(columns=[target_column])   # independent variables: every remaining column

In [8]:
# Split the feature names by dtype.
# Categorical features: object / category dtypes.
categoric_frame = x.select_dtypes(include=['object', 'category'])
categoric_cols = categoric_frame.columns.tolist()

# Numeric features: int64 / float64 dtypes, skipping any name already listed as categorical.
numeric_frame = x.select_dtypes(include=['int64', 'float64'])
numeric_cols = [
    col
    for col in numeric_frame.columns.tolist()
    if col not in categoric_cols
]

In [9]:
# Convert categorical columns with one-hot encoding; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers = [
        (
            'cat',
            # drop='first' removes one indicator per feature to avoid redundant columns.
            # handle_unknown='ignore' encodes a category that was not seen during fit as
            # all zeros (with a warning) instead of raising when the fitted model predicts
            # on new data.
            OneHotEncoder(drop = 'first', handle_unknown = 'ignore'),
            categoric_cols,
        )
    ],
    remainder = 'passthrough', # keep numeric columns as they are
    # Always return a dense array: the encoder emits sparse output by default, and the
    # StandardScaler step that follows in the pipeline cannot mean-center sparse input.
    sparse_threshold = 0,
)

In [10]:
# Build the machine learning pipeline.
# The steps run in order: preprocess the data, scale it, then fit a regression model.
pipeline_steps = [
    ('preprocessor', preprocessor),   # one-hot encode categoricals, pass the rest through
    ('scaler', StandardScaler()),     # standardize every resulting feature
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),  # seeded random forest
]
model = Pipeline(steps=pipeline_steps)

In [11]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
split_options = {'test_size': 0.3, 'random_state': 42}
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [12]:
# Train every pipeline step on the training split only.
model.fit(X=x_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [13]:
# Predict the target for the held-out test features.
y_pred = model.predict(X=x_test)

In [14]:
# Evaluate the predictions on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)              # root of the MSE, in the same units as the target
r2 = r2_score(y_test, y_pred)    # coefficient of determination

rmse_rounded = round(rmse, 2)
r2_rounded = round(r2, 3)
print("Root Mean Squared Error (RMSE):", rmse_rounded)
print("R² Score:", r2_rounded)

Root Mean Squared Error (RMSE): 0.61
R² Score: 0.849


In [16]:
import pickle

# Serialize the fitted pipeline and write the bytes to model.pkl in the working directory.
# NOTE: only unpickle this file from a trusted source; loading a pickle can execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(model))