In [386]:
import pandas as pd
from sklearn.model_selection import train_test_split

Read data

In [387]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/user_behavior_dataset.csv')

In [388]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3


Are the target classes balanced?

In [389]:
# Number of rows per target class, most frequent first.
data.loc[:, 'User Behavior Class'].value_counts()

2    146
3    143
4    139
5    136
1    136
Name: User Behavior Class, dtype: int64

Extract target column

In [390]:
# Separate the label column from the feature frame.
# The guard makes the cell safe to re-execute: once the column has been removed,
# the previously extracted Y is kept instead of raising a KeyError.
# `data` is rebound (no inplace=True) so the frame read from disk is not mutated.
if 'User Behavior Class' in data.columns:
    Y = data['User Behavior Class']
    data = data.drop(columns='User Behavior Class')

Are there any missing values?

In [391]:
# Count missing values per column.
data.isnull().sum()

User ID                       0
Device Model                  0
Operating System              0
App Usage Time (min/day)      0
Screen On Time (hours/day)    0
Battery Drain (mAh/day)       0
Number of Apps Installed      0
Data Usage (MB/day)           0
Age                           0
Gender                        0
dtype: int64

Drop columns that are not useful

In [392]:
# Remove the row identifier, which this notebook does not use as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
data = data.drop(columns='User ID', errors='ignore')

In [393]:
# Preview the feature frame after dropping the identifier column.
data.head(n=5)

Unnamed: 0,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender
0,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male
1,OnePlus 9,Android,268,4.7,1331,42,944,47,Female
2,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male
3,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male
4,iPhone 12,iOS,187,4.3,1367,58,988,31,Female


Are the categorical feature columns balanced?

In [394]:
# Number of rows per device model.
data.loc[:, 'Device Model'].value_counts()

Xiaomi Mi 11          146
iPhone 12             146
Google Pixel 5        142
OnePlus 9             133
Samsung Galaxy S21    133
Name: Device Model, dtype: int64

In [395]:
# Number of rows per operating system.
data.loc[:, 'Operating System'].value_counts()

Android    554
iOS        146
Name: Operating System, dtype: int64

In [396]:
# Number of rows per gender.
data.loc[:, 'Gender'].value_counts()

Male      364
Female    336
Name: Gender, dtype: int64

Apply encoders

In [397]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [398]:
# Encode the two binary categorical features as integer codes.
# One encoder per column is kept so each mapping can be inverted later.
label_encoder_OS = LabelEncoder().fit(data['Operating System'])
data['Operating System'] = label_encoder_OS.transform(data['Operating System'])

label_encoder_gender = LabelEncoder().fit(data['Gender'])
data['Gender'] = label_encoder_gender.transform(data['Gender'])

In [399]:
# Inspect the remaining text-valued feature before one-hot encoding it.
data.loc[:, 'Device Model']

0          Google Pixel 5
1               OnePlus 9
2            Xiaomi Mi 11
3          Google Pixel 5
4               iPhone 12
              ...        
695             iPhone 12
696          Xiaomi Mi 11
697        Google Pixel 5
698    Samsung Galaxy S21
699             OnePlus 9
Name: Device Model, Length: 700, dtype: object

In [400]:
# One-hot encode the multi-valued 'Device Model' feature.
onehot_device_model = OneHotEncoder(sparse_output=False)

device_model = pd.DataFrame(
    onehot_device_model.fit_transform(data[['Device Model']]),
    columns=onehot_device_model.get_feature_names_out(),
    # Reuse the row labels of `data`: concat aligns on the index, so a fresh
    # RangeIndex here would misalign rows if `data` had ever been filtered.
    index=data.index,
)

# Indicator columns go first, followed by the remaining features; the original
# text column is dropped without mutating `data` in place.
data = pd.concat([device_model, data.drop(columns='Device Model')], axis=1)

In [401]:
# Preview the fully numeric feature frame.
data.head(n=5)

Unnamed: 0,Device Model_Google Pixel 5,Device Model_OnePlus 9,Device Model_Samsung Galaxy S21,Device Model_Xiaomi Mi 11,Device Model_iPhone 12,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender
0,1.0,0.0,0.0,0.0,0.0,0,393,6.4,1872,67,1122,40,1
1,0.0,1.0,0.0,0.0,0.0,0,268,4.7,1331,42,944,47,0
2,0.0,0.0,0.0,1.0,0.0,0,154,4.0,761,32,322,42,1
3,1.0,0.0,0.0,0.0,0.0,0,239,4.8,1676,56,871,20,1
4,0.0,0.0,0.0,0.0,1.0,1,187,4.3,1367,58,988,31,0


Normalization

In [402]:
# Fit the scaler on the training rows only, so that the min/max of the held-out
# rows do not leak into the features the model is trained on.
# The split arguments must stay identical to the train_test_split call later in
# the notebook: with the same row count, test_size and random_state, both calls
# select the same rows.
train_positions, _ = train_test_split(
    list(range(len(data))), test_size=0.07, random_state=42
)

minmax = MinMaxScaler()
minmax.fit(data.iloc[train_positions])

# Every row is transformed with the training statistics; held-out rows may land
# slightly outside [0, 1] when they exceed the training range.
data = pd.DataFrame(minmax.transform(data), columns=data.columns)

data.head()

Unnamed: 0,Device Model_Google Pixel 5,Device Model_OnePlus 9,Device Model_Samsung Galaxy S21,Device Model_Xiaomi Mi 11,Device Model_iPhone 12,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender
0,1.0,0.0,0.0,0.0,0.0,0.0,0.639085,0.490909,0.583426,0.640449,0.425887,0.536585,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.419014,0.336364,0.382386,0.359551,0.351566,0.707317,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.21831,0.272727,0.170569,0.247191,0.091858,0.585366,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.367958,0.345455,0.510591,0.516854,0.321086,0.04878,1.0
4,0.0,0.0,0.0,0.0,1.0,1.0,0.276408,0.3,0.395764,0.539326,0.369937,0.317073,0.0


Split and save the data

In [403]:
from sklearn.model_selection import train_test_split

# Hold out 7% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    Y,
    random_state=42,
    test_size=0.07,
)

In [404]:
# Sanity-check the sizes of the two feature partitions.
(X_train.shape, X_test.shape)

((650, 13), (50, 13))

In [405]:
# Persist each partition to its own CSV file, without the index column.
split_outputs = (
    (X_train, 'data/X_train.csv'),
    (X_test, 'data/X_test.csv'),
    (Y_train, 'data/Y_train.csv'),
    (Y_test, 'data/Y_test.csv'),
)
for split_frame, split_path in split_outputs:
    split_frame.to_csv(split_path, index=False)