In [1]:
# ============================================
# KNN Classification
# Dataset: Smartphone Usage Productivity
# ============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
data = pd.read_csv("Smartphone_Usage_Productivity_Dataset_50000.csv")

print("Dataset Shape:", data.shape)
data.head()

Dataset Shape: (50000, 13)


Unnamed: 0,User_ID,Age,Gender,Occupation,Device_Type,Daily_Phone_Hours,Social_Media_Hours,Work_Productivity_Score,Sleep_Hours,Stress_Level,App_Usage_Count,Caffeine_Intake_Cups,Weekend_Screen_Time_Hours
0,U1,58,Male,Professional,Android,1.3,6.7,6,8.8,4,42,1,8.7
1,U2,25,Male,Professional,Android,1.2,1.5,5,6.4,1,51,3,5.1
2,U3,19,Male,Student,iOS,5.3,5.7,5,9.0,4,14,5,6.3
3,U4,35,Female,Business Owner,iOS,5.8,2.5,2,5.7,3,36,6,12.8
4,U5,33,Male,Freelancer,Android,7.9,1.3,4,5.7,3,37,5,9.9


In [3]:
data.isnull().sum()

Unnamed: 0,0
User_ID,0
Age,0
Gender,0
Occupation,0
Device_Type,0
Daily_Phone_Hours,0
Social_Media_Hours,0
Work_Productivity_Score,0
Sleep_Hours,0
Stress_Level,0


In [6]:
data = pd.read_csv("Smartphone_Usage_Productivity_Dataset_50000.csv")

print(data.head())

  User_ID  Age  Gender      Occupation Device_Type  Daily_Phone_Hours  \
0      U1   58    Male    Professional     Android                1.3   
1      U2   25    Male    Professional     Android                1.2   
2      U3   19    Male         Student         iOS                5.3   
3      U4   35  Female  Business Owner         iOS                5.8   
4      U5   33    Male      Freelancer     Android                7.9   

   Social_Media_Hours  Work_Productivity_Score  Sleep_Hours  Stress_Level  \
0                 6.7                        6          8.8             4   
1                 1.5                        5          6.4             1   
2                 5.7                        5          9.0             4   
3                 2.5                        2          5.7             3   
4                 1.3                        4          5.7             3   

   App_Usage_Count  Caffeine_Intake_Cups  Weekend_Screen_Time_Hours  
0               42          

In [8]:
# Create 3 classes based on score

data["Productivity_Level"] = pd.cut(
    data["Work_Productivity_Score"],
    bins=3,
    labels=[0, 1, 2]   # 0 = Low, 1 = Medium, 2 = High
)

data.head()

Unnamed: 0,User_ID,Age,Gender,Occupation,Device_Type,Daily_Phone_Hours,Social_Media_Hours,Work_Productivity_Score,Sleep_Hours,Stress_Level,App_Usage_Count,Caffeine_Intake_Cups,Weekend_Screen_Time_Hours,Productivity_Level
0,U1,58,Male,Professional,Android,1.3,6.7,6,8.8,4,42,1,8.7,1
1,U2,25,Male,Professional,Android,1.2,1.5,5,6.4,1,51,3,5.1,1
2,U3,19,Male,Student,iOS,5.3,5.7,5,9.0,4,14,5,6.3,1
3,U4,35,Female,Business Owner,iOS,5.8,2.5,2,5.7,3,36,6,12.8,0
4,U5,33,Male,Freelancer,Android,7.9,1.3,4,5.7,3,37,5,9.9,0


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = le.fit_transform(data[col])

In [12]:
X = data.drop("Productivity_Level", axis=1)
y = data["Productivity_Level"]

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.3597

Confusion Matrix:
 [[2329  998  659]
 [1755  781  480]
 [1748  763  487]]

Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.58      0.47      3986
           1       0.31      0.26      0.28      3016
           2       0.30      0.16      0.21      2998

    accuracy                           0.36     10000
   macro avg       0.34      0.34      0.32     10000
weighted avg       0.34      0.36      0.34     10000

