In [20]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Note:: The make sure you use the information from your specific PostgreSQL installation
host = r'127.0.0.1' # denotes that the db in a local installation
db = r'MSDS610' # db we just created
user = r'postgres' # using the postgres user for this demo
pw = r'pdahal001' # this is the password established during installation
port = r'5432' # default port estabalished during install

In [3]:
db_conn = create_engine("postgresql://{}:{}@{}:{}/{}".format(user, pw, host, port, db))

In [4]:
table_name = r'mobile_cleaned'
schema = r'cleaned' # schema were the data was loaded last week.

df = pd.read_sql_table(table_name, db_conn, schema)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620 entries, 0 to 619
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   User_ID                       620 non-null    float64
 1   Age                           620 non-null    float64
 2   Gender                        620 non-null    object 
 3   Total_App_Usage_Hours         620 non-null    float64
 4   Daily_Screen_Time_Hours       620 non-null    float64
 5   Number_of_Apps_Used           620 non-null    float64
 6   Social_Media_Usage_Hours      620 non-null    float64
 7   Productivity_App_Usage_Hours  620 non-null    float64
 8   Gaming_App_Usage_Hours        620 non-null    float64
 9   Location                      620 non-null    object 
dtypes: float64(8), object(2)
memory usage: 48.6+ KB


In [6]:
df.head()

Unnamed: 0,User_ID,Age,Gender,Total_App_Usage_Hours,Daily_Screen_Time_Hours,Number_of_Apps_Used,Social_Media_Usage_Hours,Productivity_App_Usage_Hours,Gaming_App_Usage_Hours,Location
0,1.0,56.0,male,2.61,7.15,24.0,4.43,0.55,2.4,los angeles
1,2.0,46.0,male,2.13,13.79,18.0,4.67,4.42,2.43,chicago
2,3.0,32.0,female,7.28,4.5,11.0,4.58,1.71,2.83,houston
3,8.0,40.0,male,9.53,8.85,11.0,2.66,0.28,2.93,chicago
4,10.0,28.0,male,1.8,4.11,16.0,2.14,1.2,0.96,new york


# Analytical Question:

- Can we predict whether a user has high or low mobile screen time based on their app usage behavior?

# Feature Engineering:

I'll create the following features:

1.  App Diversity Score

Defined as Number_of_Apps_Used / Total_App_Usage_Hours</br>
Higher diversity means a user spreads usage across many apps. Lower score indicates heavy usage on fewer apps.

In [7]:
#App Diversity Score
df["App_Diversity_Score"] = df["Number_of_Apps_Used"] / df["Total_App_Usage_Hours"]

2. Social Media Proportion

Defined Social_Media_Usage_Hours / Total_App_Usage_Hours</br>
Measures how much of the total app usage is spent on social media.

In [8]:
# Social Media Proportion
df["Social_Media_Proportion"] = df["Social_Media_Usage_Hours"] / df["Total_App_Usage_Hours"]

3. Productivity to Entertainment Ratio

Productivity_App_Usage_Hours / (Gaming_App_Usage_Hours + 1e-5) </br>
Compares time spent on productivity apps vs. gaming.

In [9]:
# Productivity to Entertainment Ratio
df["Productivity_to_Entertainment"] = df["Productivity_App_Usage_Hours"] / (df["Gaming_App_Usage_Hours"] + 1e-5)

4. Usage Intensity Per App

Total_App_Usage_Hours / Number_of_Apps_Used</br>
Shows the average time spent per app.

In [10]:
# Usage Intensity Per App
df["Usage_Intensity_Per_App"] = df["Total_App_Usage_Hours"] / df["Number_of_Apps_Used"]

5. Age Group Categorization

Converts age into: (18-25, 26-40, 41-60, 60+). </br>
Different age groups may have varying screen time behaviors.

In [11]:
# Age Group Categorization
age_bins = [18, 25, 40, 60, 100]  # Age groups
age_labels = ["18-25", "26-40", "41-60", "60+"]
df["Age_Group"] = pd.cut(df["Age"], bins=age_bins, labels=age_labels, right=False)

In [12]:
df.head()

Unnamed: 0,User_ID,Age,Gender,Total_App_Usage_Hours,Daily_Screen_Time_Hours,Number_of_Apps_Used,Social_Media_Usage_Hours,Productivity_App_Usage_Hours,Gaming_App_Usage_Hours,Location,App_Diversity_Score,Social_Media_Proportion,Productivity_to_Entertainment,Usage_Intensity_Per_App,Age_Group
0,1.0,56.0,male,2.61,7.15,24.0,4.43,0.55,2.4,los angeles,9.195402,1.697318,0.229166,0.10875,41-60
1,2.0,46.0,male,2.13,13.79,18.0,4.67,4.42,2.43,chicago,8.450704,2.192488,1.818923,0.118333,41-60
2,3.0,32.0,female,7.28,4.5,11.0,4.58,1.71,2.83,houston,1.510989,0.629121,0.604238,0.661818,26-40
3,8.0,40.0,male,9.53,8.85,11.0,2.66,0.28,2.93,chicago,1.15425,0.279119,0.095563,0.866364,41-60
4,10.0,28.0,male,1.8,4.11,16.0,2.14,1.2,0.96,new york,8.888889,1.188889,1.249987,0.1125,26-40


In [14]:
# target variable: High Screen Time (1 if above median, else 0)
median_screen_time = df["Daily_Screen_Time_Hours"].median()
df["High_Screen_Time"] = (df["Daily_Screen_Time_Hours"] > median_screen_time).astype(int)

In [15]:
# selecting features for modelling
features = ["App_Diversity_Score", "Social_Media_Proportion", "Productivity_to_Entertainment",
            "Usage_Intensity_Per_App", "Age_Group"]

In [16]:
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder
df["Age_Group"] = LabelEncoder().fit_transform(df["Age_Group"])

In [17]:
# Split data
X = df[features]
y = df["High_Screen_Time"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Train RandomForest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [19]:
# Predictions
y_pred = rf_model.predict(X_test)

In [23]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

(0.5,
 '              precision    recall  f1-score   support\n\n           0       0.50      0.50      0.50        62\n           1       0.50      0.50      0.50        62\n\n    accuracy                           0.50       124\n   macro avg       0.50      0.50      0.50       124\nweighted avg       0.50      0.50      0.50       124\n')

# Feature Evaluation:

- Examining Feature Importance from RandomForest

I'll examine the feature importance scores to see which features contributed the most.

- Mutual Information Score

I'll compute the mutual information between each feature and the target variable to see how much information they provide about the classification.

In [24]:
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif

# Feature Importance from RandomForest
feature_importances = rf_model.feature_importances_

In [26]:
# Create a DataFrame for better visualization
import pandas as pd
feature_eval_df = pd.DataFrame({
    "Feature": features,
    "RandomForest_Importance": feature_importances
}).sort_values(by="RandomForest_Importance", ascending=False)

print(feature_eval_df)

                         Feature  RandomForest_Importance
2  Productivity_to_Entertainment                 0.255387
1        Social_Media_Proportion                 0.248119
0            App_Diversity_Score                 0.227410
3        Usage_Intensity_Per_App                 0.217934
4                      Age_Group                 0.051151


In [28]:
# Mutual Information Scores
mi_scores = mutual_info_classif(X, y, discrete_features='auto')

# Add Mutual Information scores to the DataFrame
feature_eval_df["Mutual_Information"] = mi_scores

print(feature_eval_df)


                         Feature  RandomForest_Importance  Mutual_Information
2  Productivity_to_Entertainment                 0.255387            0.000000
1        Social_Media_Proportion                 0.248119            0.016929
0            App_Diversity_Score                 0.227410            0.008143
3        Usage_Intensity_Per_App                 0.217934            0.000000
4                      Age_Group                 0.051151            0.002081


# Takeaways:

- Social Media Proportion is the best predictor (high importance + nonzero MI).
- App Diversity Score has some predictive power but is weaker.
- Age Group & Usage Intensity Per App have very little impact.
- Productivity_to_Entertainment is highly used by the model but does not add unique information, shwoing redundancy.

# Overall Summary

- Analytical Question:
"Can we predict whether a user has high or low mobile screen time based on their app usage behavior?"
I defined "high" screen time as users with daily screen time above the median and "low" as those below it.

**Features Created**

- App Diversity Score :Measures how spread out app usage is across different apps.
- Social Media Proportion :The fraction of total app usage spent on social media.
- Productivity to Entertainment Ratio :Compares productivity app usage against gaming.
- Usage Intensity Per App :Average time spent per app.
- Age Group :Users categorized into age brackets (18-25, 26-40, 41-60, 60+).

**Model Performance**
I trained a RandomForest Classifier using the new features, but the model achieved only 50% accuracy, which is no better than random guessing.


# Explaination:

Explanation of Feature Evaluation Results
1. RandomForest Feature Importance
The RandomForest_Importance column shows how much each feature contributed to the model's decision-making. Higher values indicate stronger influence on predicting high vs. low screen time.

Productivity_to_Entertainment (0.255) ----> Most important </br>
Social_Media_Proportion (0.248) ----> Second most important </br>
App_Diversity_Score (0.227) ----> Third </br>
Usage_Intensity_Per_App (0.218) ----> Fourth </br>
Age_Group (0.051) ----> Least important </br>

This shows that how users balance productivity vs. entertainment apps and their social media usage proportion are the strongest predictors of screen time.

2. Mutual Information (MI) Score
Mutual Information measures how much unique information a feature provides about the target (screen time classification). A score of 0.0 means no significant relationship, while higher values suggest a stronger connection.

Social_Media_Proportion (0.0169) ----> Provides the most information about screen time </br>
App_Diversity_Score (0.0081) ----? Provides some information </br>
Age_Group (0.0021) ----> Very weak relationship </br>
Productivity_to_Entertainment (0.0000) ----> No significant relationship </br>
Usage_Intensity_Per_App (0.0000) ----> No significant relationship </br>
</br>

**Although Productivity_to_Entertainment was the most important feature for RandomForest, its MI score is 0.0, meaning it doesn’t provide unique information but might be correlated with other features the model used.**