In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the raw CSV file. Kept in one named constant so the path can be
# edited in a single place when the file lives elsewhere.
DATA_PATH = '/content/user_behavior_dataset.csv'

data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3


In [3]:
def summarize(df):
    """Build a per-column overview table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarize.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``data type``,
        ``#unique``, ``#missing``, ``#duplicate`` (number of fully duplicated
        rows in ``df``, repeated on every row), ``min``, ``max``, ``mean``,
        ``std dev``, ``top value`` and ``Freq``. Statistics that do not apply
        to a column are NaN.
    """
    summ = df.dtypes.to_frame(name='data type')
    summ['#unique'] = df.nunique().values
    summ['#missing'] = df.isnull().sum().values
    summ['#duplicate'] = df.duplicated().sum()

    # describe() label -> column label used in the summary table
    stat_labels = {
        'min': 'min',
        'max': 'max',
        'mean': 'mean',
        'std': 'std dev',
        'top': 'top value',
        'freq': 'Freq',
    }

    # describe(include='all') only emits the statistics that apply to at least
    # one column (e.g. no 'top'/'freq' for an all-numeric frame). Reindexing
    # guarantees every expected statistic exists, filled with NaN if absent,
    # instead of raising a KeyError on lookup.
    desc = df.describe(include='all').transpose().reindex(columns=list(stat_labels))

    for stat, label in stat_labels.items():
        summ[label] = desc[stat].values
    return summ

    # formatting parameters for the sake of better representation
# CSS rules handed to the pandas Styler: hover highlight for cells, muted
# index names, and coloured header cells.
cell_hover = dict(
    selector='td:hover',
    props=[('background-color', '#ffffb3')],
)
index_names = dict(
    selector='.index_name',
    props='font-style: italic; color: darkgrey; font-weight:normal;',
)
headers = dict(
    selector='th:not(.index_name)',
    props='background-color: #2e0478; color: white;',
)

# Render the basic properties of the raw dataset with the styling above
(
    summarize(data)
    .style
    .set_table_styles([cell_hover, index_names, headers])
    .background_gradient(cmap='Purples')
    .set_caption('Raw data statistics')
)

Unnamed: 0,data type,#unique,#missing,#duplicate,min,max,mean,std dev,top value,Freq
User ID,int64,700,0,0,1.0,700.0,350.5,202.21688,,
Device Model,object,5,0,0,,,,,Xiaomi Mi 11,146.0
Operating System,object,2,0,0,,,,,Android,554.0
App Usage Time (min/day),int64,387,0,0,30.0,598.0,271.128571,177.199484,,
Screen On Time (hours/day),float64,108,0,0,1.0,12.0,5.272714,3.068584,,
Battery Drain (mAh/day),int64,628,0,0,302.0,2993.0,1525.158571,819.136414,,
Number of Apps Installed,int64,86,0,0,10.0,99.0,50.681429,26.943324,,
Data Usage (MB/day),int64,585,0,0,102.0,2497.0,929.742857,640.451729,,
Age,int64,42,0,0,18.0,59.0,38.482857,12.012916,,
Gender,object,2,0,0,,,,,Male,364.0


**Key Insights from the Dataset** ✨

1) **App Usage & Screen Time** 📱:

The dataset shows considerable variation in App Usage Time (min/day) and Screen On Time (hours/day): users spend between 30 and 598 minutes on apps daily and keep the screen on for 5.27 hours per day on average. This indicates diverse usage patterns, which can be used for behavioral profiling.

2) **Battery Usage** 🔋:

Battery Drain (mAh/day) ranges from 302 mAh to 2993 mAh, with an average of 1525.16 mAh. This suggests that mobile battery consumption varies significantly, possibly correlating with user activity and screen time.

3) **Data Usage** 📶:

Data Usage (MB/day) ranges from 102 MB to 2497 MB, with an average of 929.74 MB. This highlights varying data consumption, which could be influenced by the number of apps installed and the frequency of app usage.


**Summary** 📍:

The dataset reflects diverse user behavior, with significant variation in app usage, screen time, and battery drain. Understanding these variations can help in segmenting users based on their activity levels, and in optimizing mobile services or targeting specific user groups.

**Explorative Data Analysis** 💡


**What are the expectations from EDA? 📊🔍**

**Univariate Analysis** 🔢

Basic statistics of the dataset, e.g., (univariate histograms → distributions of the dataset) 📈
Can we determine the battery life duration 🔋 of each user's device? ⏳
What is the daily cycle 🔄 for device recharging 🔌? Does a specific device tend to have faster battery drain ⚡ and recharging 🔌?

**Multivariate Analysis** 🔀

Sensitivity analysis 🔬 (How are the features correlated to one another? 🔄)

Individual insights 💡



**Univariate Analysis**

In [6]:
# Analysis frame without the identifier and the target column
data_4_analysis = data.drop(['User ID', 'User Behavior Class'], axis=1)

# Separate columns by their types
num_cols = data_4_analysis.select_dtypes(include=['number']).columns.tolist()
cat_cols = data_4_analysis.select_dtypes(include=['object']).columns.tolist()

# Grid geometry: numerical panels come first, categorical panels start on a
# fresh row below them.
n_grid_cols = 3
num_rows_num = (len(num_cols) + n_grid_cols - 1) // n_grid_cols  # rows needed for numerical columns
num_rows_cat = (len(cat_cols) + n_grid_cols - 1) // n_grid_cols  # rows needed for categorical columns

# make_subplots assigns titles to grid cells in row-major order. The numerical
# titles are therefore padded with empty strings up to a full last row;
# otherwise the categorical titles would slide into the unused cells of the
# numerical block and end up above the wrong panels.
num_titles = [str(col) for col in num_cols]
num_titles += [''] * (num_rows_num * n_grid_cols - len(num_cols))
cat_titles = [str(col) for col in cat_cols]

# Create subplots: numerical block on top, categorical block below
fig = make_subplots(
    rows=num_rows_num + num_rows_cat, cols=n_grid_cols,
    subplot_titles=num_titles + cat_titles,
    column_widths=[1 / n_grid_cols] * n_grid_cols  # Evenly distribute the width across the columns
)

# Add histograms for numerical columns
for i, col in enumerate(num_cols):
    row = (i // n_grid_cols) + 1  # Determine the row for the subplot
    col_in_subplot = (i % n_grid_cols) + 1  # Determine the column for the subplot
    fig.add_trace(
        go.Histogram(x=data_4_analysis[col], name=col, opacity=0.75),
        row=row, col=col_in_subplot
    )

# Add histograms for categorical columns (count of each category)
for i, col in enumerate(cat_cols):
    row = num_rows_num + (i // n_grid_cols) + 1  # Offset by the numerical rows
    col_in_subplot = (i % n_grid_cols) + 1  # Determine the column for the subplot
    # Skip columns without any data; check the same frame that is plotted
    if data_4_analysis[col].notna().sum() > 0:
        fig.add_trace(
            go.Histogram(x=data_4_analysis[col], name=col, histfunc='count', opacity=0.75),
            row=row, col=col_in_subplot
        )

# Update layout with increased height and better spacing
fig.update_layout(
    title="Histograms of Numerical and Categorical Data",
    barmode='overlay',
    showlegend=False,
    height=600 + (num_rows_num * 250) + (num_rows_cat * 250),  # Adjust height based on number of rows
    width=1000,  # Increased width to accommodate 3 columns
    bargap=0.1,  # Space between bars
    title_x=0.5,  # Center title
    plot_bgcolor='white',  # White background for the entire plot
    paper_bgcolor='white',  # White background for the paper as well
    hovermode="x unified",  # Unified hovermode for better interaction
    title_font=dict(size=20)
)

# Show the plot
fig.show()

The data appears to be generally well-distributed and balanced across most features. However, the Operating System feature stands out as an exception. Specifically, only the iPhone 12 supports iOS, while the remaining four types of devices use Android OS.

To make a fair comparison across device models, we should consider normalizing the data according to the number of device models. For instance, if we assume that Android is equally distributed across the four device types, the total count of 554 devices could be approximately divided by 4 (i.e., 554 ÷ 4 ≈ 139 devices per model). This would allow for a more balanced comparison, where the Operating System feature can be evaluated more equitably across different device models.

Thus, by normalizing the data in this way, we can expect the distribution of the Operating System to become more even and comparable across the different devices.

Let's check if there is an outlier or not in the dataset:

In [7]:
# Names of the numerical columns of the analysis frame
num_cols = list(data_4_analysis.select_dtypes(include=['number']).columns)

def detect_outliers(data):
    """Return the entries of ``data`` that fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 0.25 and 0.75 quantiles of ``data`` and ``IQR = Q3 - Q1``.
    Values strictly below the lower fence or strictly above the upper fence
    are returned, keeping their original index labels.
    """
    first_q, third_q = (data.quantile(level) for level in (0.25, 0.75))
    whisker = 1.5 * (third_q - first_q)
    too_low = data < first_q - whisker
    too_high = data > third_q + whisker
    return data[too_low | too_high]

# Walk over the selected columns and report the IQR outliers of each one
for column in num_cols:
    # Only numeric columns can be checked against the IQR fences
    if not pd.api.types.is_numeric_dtype(data_4_analysis[column]):
        continue

    outliers = detect_outliers(data_4_analysis[column])

    # Nothing outside the fences: report it as text and move on
    if outliers.empty:
        print(f"{column} has : no outliers")
        continue

    # At least one outlier: show the column as a box plot
    fig = px.box(data, y=column, title=f'Boxplot of {column} with Outliers')
    fig.show()

App Usage Time (min/day) has : no outliers
Screen On Time (hours/day) has : no outliers
Battery Drain (mAh/day) has : no outliers
Number of Apps Installed has : no outliers
Data Usage (MB/day) has : no outliers
Age has : no outliers


**Battery degradation research**

Most researchers suggest that Li-ion battery capacity drops drastically once battery health falls to 80 %.

OBJECTIVE: Therefore, it is useful to analyze which devices are prone to reach the degradation (80 %) threshold earlier.

Besides the objective, we can add more useful features that can be derived from our dataset:

- avg_power_consumption - Average power consumption
- Battery life - How long the battery can last until it is fully drained
- cycle_number_per_day - Number of charge cycles per day


Explanation of Calculations:

The **Average Power Consumption** is calculated as:

$$\text{avg\_power\_consumption} = \frac{\text{Battery Drain (mAh/day)}}{\text{Screen On Time (hours/day)}}$$

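This equation represents the average consumption of the device during its screen-on time. The Battery Drain (mAh/day) is divided by the Screen On Time (hours/day) to obtain how much charge is used on average per screen-on hour. Strictly speaking, the result is an average current draw (mAh per hour), which is used here as a proxy for power consumption.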

The **Battery Life** is given by:

$$\text{Battery life} = \frac{\text{capacity}}{\text{avg\_power\_consumption}}$$

The **Battery Life** is calculated by dividing the device's capacity (in mAh) by the average power consumption. This gives an estimate of how many hours the device can operate before the battery is fully drained, assuming constant power consumption.

And **the Cycle Number Per Day** is:

$$\text{cycle\_number\_per\_day} = \frac{\text{Battery Drain (mAh/day)}}{\text{capacity}}$$

The **Cycle Number Per Day** represents how many full charge cycles the device undergoes in one day. By dividing the daily **Battery Drain (mAh/day)** by the **capacity** of the battery, we get the number of times the device’s battery is drained completely within a day.



In [8]:
# Battery capacity (mAh) of each device model; values were taken from the
# reference: https://m.gsmarena.com/
capacity = {'Google Pixel 5': 4080, 'OnePlus 9': 4500,
            'Xiaomi Mi 11': 4600, 'iPhone 12': 2815, 'Samsung Galaxy S21': 4000}

# Series.map yields a real NaN for models missing from the lookup. The former
# fallback was the string 'NaN', which would turn the column into object dtype
# and make the divisions below raise instead of propagating a missing value.
data['capacity'] = data['Device Model'].map(capacity)

# Battery drain per hour of screen-on time
data['avg_power_consumption'] = data['Battery Drain (mAh/day)'] / data['Screen On Time (hours/day)']

# Hours a full battery lasts at that average drain rate
data['Battery life'] = data['capacity'] / data['avg_power_consumption']

# Share of a full battery that is drained per day
data['cycle_number_per_day'] = data['Battery Drain (mAh/day)'] / data['capacity']

# Rows whose estimated battery life is shorter than the daily screen-on time.
# Kept in a named variable: a bare expression in the middle of a cell is
# evaluated and then silently discarded.
short_battery_life = data[data['Battery life'] < data['Screen On Time (hours/day)']]

data

Unnamed: 0,User ID,Device Model,Operating System,App Usage Time (min/day),Screen On Time (hours/day),Battery Drain (mAh/day),Number of Apps Installed,Data Usage (MB/day),Age,Gender,User Behavior Class,capacity,avg_power_consumption,Battery life,cycle_number_per_day
0,1,Google Pixel 5,Android,393,6.4,1872,67,1122,40,Male,4,4080,292.500000,13.948718,0.458824
1,2,OnePlus 9,Android,268,4.7,1331,42,944,47,Female,3,4500,283.191489,15.890308,0.295778
2,3,Xiaomi Mi 11,Android,154,4.0,761,32,322,42,Male,2,4600,190.250000,24.178712,0.165435
3,4,Google Pixel 5,Android,239,4.8,1676,56,871,20,Male,3,4080,349.166667,11.684964,0.410784
4,5,iPhone 12,iOS,187,4.3,1367,58,988,31,Female,3,2815,317.906977,8.854792,0.485613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,696,iPhone 12,iOS,92,3.9,1082,26,381,22,Male,2,2815,277.435897,10.146488,0.384369
696,697,Xiaomi Mi 11,Android,316,6.8,1965,68,1201,59,Male,4,4600,288.970588,15.918575,0.427174
697,698,Google Pixel 5,Android,99,3.1,942,22,457,50,Female,2,4080,303.870968,13.426752,0.230882
698,699,Samsung Galaxy S21,Android,62,1.7,431,13,224,44,Male,1,4000,253.529412,15.777262,0.107750


Research Paper: Takeno, Kazuhiko, Masahiro Ichimura, Kazuo Takano, and Junichi Yamaki. "Influence of cycle capacity deterioration and storage capacity deterioration on Li-ion batteries used in mobile phones." Journal of power sources 142, no. 1-2 (2005): 298-305.

According to the paper, Li-ion battery health drops to 80 % after about 400 cycles.

If this is the case, let's analyze how quickly the different devices accumulate the cycles needed to reach 80 % battery health, and how user behaviour correlates with earlier battery degradation.

(Note: all users are assumed to have 100 % overall battery health throughout the calculations presented here.)

In [9]:
# Charge cycles accumulated over 400 days at each user's daily rate
data['cycle_number_in_400days'] = 400 * data['cycle_number_per_day']

# One marker per user: x = battery capacity, y = cycles in 400 days,
# colour = device model, marker size = daily screen-on time
fig = px.scatter(
    data,
    x='capacity',
    y='cycle_number_in_400days',
    color='Device Model',
    size='Screen On Time (hours/day)',
)

# Dashed red reference line at y = 400, extended by 500 beyond both ends of
# the capacity range so that it spans the plot
fig.add_shape(
    type="line",
    x0=data['capacity'].min() - 500,
    x1=data['capacity'].max() + 500,
    y0=400,
    y1=400,
    line={"color": "red", "width": 2, "dash": "dash"},
    name="Threshold at 400",
)

# White backgrounds, larger title, hover unified along y
fig.update_layout(
    title="Battery health degradation",
    title_font={"size": 20},
    hovermode="y unified",
    plot_bgcolor='white',
    paper_bgcolor='white',
)

fig.show()


**Key insights**:

It seems that the iPhone 12 has a faster rate of battery degradation than the other models.
Also, it is clear from the figure that the more time the screen is on, the faster the degradation occurs (indicated by the size of the markers in the plot).

**Multivariate Analysis**

Sensitivity analysis by pairplot

In [10]:
import plotly.express as px

# Create the scatter matrix plot. The colour scale and opacity are passed to
# Plotly Express directly instead of being patched onto the trace afterwards.
# NOTE(review): with a numeric `color` column Plotly Express is expected to
# bind the markers to the figure-level colour axis, so a colorscale set on the
# trace would not be the one rendered — confirm against the plotly docs.
fig = px.scatter_matrix(
    data,
    dimensions=['App Usage Time (min/day)', 'Screen On Time (hours/day)', 'Battery Drain (mAh/day)', 'Data Usage (MB/day)',
                'cycle_number_per_day', 'Age'],
    color='User Behavior Class',
    color_continuous_scale='Viridis',  # e.g. 'Viridis', 'Cividis', 'Jet', etc.
    opacity=0.7,  # Transparency of the points
)

# Update layout
fig.update_layout(
    title="Pairplot of Highlighted Features",
    # Explicit size. The previous expression reused the row counts of the
    # histogram grid from the univariate cell, which evaluate to this same
    # value for this dataset but have nothing to do with a scatter matrix.
    height=1250,
    width=1200,
    title_x=0.5,  # Center title
    plot_bgcolor='lightgray',  # Light gray background for the entire plot
    paper_bgcolor='lightgray',  # Light gray background for the paper as well
    hovermode="x unified",  # Unified hovermode for better interaction
    title_font=dict(size=20, color='darkblue'),  # Change title font size and color
    xaxis=dict(showgrid=True, gridcolor='gray', zeroline=False),  # Add gridlines to the x-axis
    yaxis=dict(showgrid=True, gridcolor='gray', zeroline=False),  # Add gridlines to the y-axis
    font=dict(family="Arial", size=10, color="black"),  # Set font family and color
)

# Enlarge the points of the scatter matrix
fig.update_traces(marker=dict(size=10))

# Show the plot
fig.show()

**Insights from the figure**

Every feature is correlated, to a certain degree, with at least one other feature in the dataset.
As the colored markers show, the User Behavior Class can easily be distinguished. Therefore, it should not be difficult to achieve a highly accurate ML model for predicting the behaviour of users.

Interestingly, the derived cycle_number_per_day feature shows three distinct linear trends with respect to the Battery Drain (mAh/day) feature, which is most probably connected to the battery capacity of each device model.

**Model Training**

User behaviour classification

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report

In [12]:
# Data Preparation: every column except the identifier and the target is a
# feature. This includes the battery features derived in the earlier cells,
# because they were added to `data` itself.
X = data.drop(['User ID', 'User Behavior Class'], axis=1)
y = data['User Behavior Class']

# Separating numerical and categorical values (both as plain lists)
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=np.number).columns.tolist()

# Scale the numerical features and one-hot encode the categorical ones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ])

# List of classifiers to try. Estimators with a random component are seeded so
# that a fresh "Restart & Run All" reproduces the same scores.
classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=200),
    'SVM': SVC(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier()
}

# Split data (70/30, stratified on the target class)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Store the scores for radar plot: metric name -> {model name -> macro average}
scores = {'precision': {}, 'recall': {}, 'f1-score': {}}

# Loop through each classifier
for name, clf in classifiers.items():
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf)
    ])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Get classification report
    report = classification_report(y_test, y_pred, output_dict=True)

    # Collect average metrics for radar plots (macro avg for comparison across models)
    for metric in scores:
        scores[metric][name] = report['macro avg'][metric]

# Convert scores to DataFrame for plotting.
# A dict of dicts becomes: rows = models (inner keys), columns = metrics (outer keys).
scores_df = pd.DataFrame(scores)

# Create a subplot with 1 row and one polar panel per metric
metric_names = list(scores.keys())
fig = make_subplots(
    rows=1, cols=len(metric_names),
    subplot_titles=metric_names,
    specs=[[{'type': 'polar'}] * len(metric_names)]
)

# One radar per metric: the angular axis lists the models, the radius is that
# model's score for the metric named in the panel title. The previous inner
# loop iterated over `scores_df.columns` (the metric names, not the models) and
# never used `metric`, so all three panels showed the same three traces.
for i, metric in enumerate(metric_names):
    fig.add_trace(
        go.Scatterpolar(
            r=scores_df[metric].values,
            theta=scores_df.index,
            fill='toself',
            name=metric
        ),
        row=1, col=i+1
    )

# Update layout for better readability
fig.update_layout(
    title="Comparison of Different Models on Classification Metrics",
    showlegend=True,
    legend_title="Metrics",  # each trace is now one metric; the models sit on the angular axis
    height=500,
    width=1000
)

fig.show()

Model training conclusion:

Five different classification models were compared using default hyperparameters: Random Forest, Logistic Regression, Support Vector Machines, DecisionTree, KNeighbors

All 5 models' performances were compared in terms of precision, recall, f1-score metrics

As the User Behaviour Class is easy to distinguish in this dataset (refer to the Multivariate Analysis section), all the models show the same maximum precision.