In [1]:
import os
import cv2
import numpy as np
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ultralytics import YOLO

# --- 1. Data Loading and Feature Extraction ---

dataset_dir = 'Dataset'  # Replace with your dataset path
img_size = (128, 128) 

# Load YOLOv8 model 
model_yolo = YOLO('yolov8m.pt') 

def load_data(dir_path):
    """Loads images and labels from directory structure, cropping people with YOLO."""
    images = []
    labels = []
    class_names = sorted(os.listdir(dir_path))
    for i, class_name in enumerate(class_names):
        class_dir = os.path.join(dir_path, class_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)

            # Load image
            img = cv2.imread(img_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 

            # Object Detection with YOLOv8
            results = model_yolo(img) 

            # Get bounding box coordinates of the person
            x1, y1, x2, y2 = 0, 0, 0, 0 # Initialize coordinates 
            for r in results:
                boxes = r.boxes.xyxy.tolist()  # Bounding boxes
                classes = r.boxes.cls.tolist()  # Class indices
                for box, cls in zip(boxes, classes):
                    if cls == 0:  # Assuming 'person' is class 0 
                        x1, y1, x2, y2 = map(int, box) 
                        break  # Assuming only one person per image 

            # Square Cropping
            if x1 == 0 and y1 == 0 and x2 == 0 and y2 == 0:
                print(f"Warning: Person not detected in {img_path}. Skipping image.")
                continue  # Skip to the next image

            center_x = (x1 + x2) // 2
            center_y = (y1 + y2) // 2
            crop_size = max(x2 - x1, y2 - y1)  # Size of the larger side

            crop_x1 = max(0, center_x - crop_size // 2)
            crop_y1 = max(0, center_y - crop_size // 2)
            crop_x2 = min(img.shape[1], crop_x1 + crop_size)
            crop_y2 = min(img.shape[0], crop_y1 + crop_size)

            cropped_img = img[crop_y1:crop_y2, crop_x1:crop_x2]

            # Resize cropped image to desired size
            cropped_img = cv2.resize(cropped_img, img_size)

            # --- Preprocessing for VGG16 ---
            cropped_img = image.img_to_array(cropped_img) # Convert to array
            cropped_img = preprocess_input(cropped_img) 
            images.append(cropped_img)
            labels.append(i) 
        print(f"{class_name} folder is finished \n---------------------------------------------------------------------------------------------------------------------------------------\n---------------------------------------------------------------------------------------------------------------------------------------")
    
    return np.array(images), np.array(labels), class_names

In [2]:
# Load data 
X, y, class_names = load_data(dataset_dir)
print(f"Number of images {len(X)}")
print(f"Number of labels {len(y)}")



0: 384x640 1 person, 1 vase, 796.1ms
Speed: 9.1ms preprocess, 796.1ms inference, 1519.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 867.0ms
Speed: 8.0ms preprocess, 867.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 903.9ms
Speed: 2.0ms preprocess, 903.9ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 889.4ms
Speed: 2.0ms preprocess, 889.4ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 817.5ms
Speed: 2.1ms preprocess, 817.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 873.1ms
Speed: 2.0ms preprocess, 873.1ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 886.8ms
Speed: 2.1ms preprocess, 886.8ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 vase, 856.3ms
Speed: 

In [3]:
# Load VGG16 model (without top)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_size[0], img_size[1], 3)) 

# Extract features
features = base_model.predict(X) 
features = features.reshape(features.shape[0], -1) 

[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 581ms/step


In [4]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_balanced, y_balanced = smote_enn.fit_resample(features, y)

In [5]:
# --- 2. Data Splitting and Scaling ---

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, shuffle=True, random_state=42)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [6]:
print(f"{X_train.shape}, {X_test.shape}")

(16004, 8192), (4002, 8192)


In [7]:
# --- 3. Model Training (SVM) ---

model = SVC(kernel='rbf', C=1.0, random_state=42) 
model.fit(X_train, y_train)

# --- 4. Model Evaluation ---

y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       545
         fatigue       1.00      1.00      1.00       580
         focused       1.00      1.00      1.00       587
      raise_hand       1.00      1.00      1.00       577
        sleeping       1.00      1.00      1.00       574
using_smartphone       1.00      1.00      1.00       604
 writing_reading       1.00      1.00      1.00       535

        accuracy                           1.00      4002
       macro avg       1.00      1.00      1.00      4002
    weighted avg       1.00      1.00      1.00      4002

[[545   0   0   0   0   0   0]
 [  0 580   0   0   0   0   0]
 [  0   0 587   0   0   0   0]
 [  0   0   0 577   0   0   0]
 [  0   0   0   0 574   0   0]
 [  0   0   0   0   0 604   0]
 [  0   0   0   0   0   0 535]]


In [8]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)  # Choose a value for k
knn_model.fit(X_train, y_train) 

# ... (Evaluate the model)
y_pred_knn = knn_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_knn)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       545
         fatigue       1.00      1.00      1.00       580
         focused       1.00      1.00      1.00       587
      raise_hand       1.00      1.00      1.00       577
        sleeping       1.00      1.00      1.00       574
using_smartphone       1.00      1.00      1.00       604
 writing_reading       1.00      1.00      1.00       535

        accuracy                           1.00      4002
       macro avg       1.00      1.00      1.00      4002
    weighted avg       1.00      1.00      1.00      4002

[[545   0   0   0   0   0   0]
 [  0 580   0   0   0   0   0]
 [  0   0 587   0   0   0   0]
 [  0   0   0 577   0   0   0]
 [  0   0   0   0 574   0   0]
 [  0   0   0   0   0 604   0]
 [  0   0   0   0   0   0 535]]


In [9]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_dt = dt_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_dt)) 

Accuracy: 0.9955022488755623
                  precision    recall  f1-score   support

      distracted       0.99      1.00      0.99       545
         fatigue       1.00      1.00      1.00       580
         focused       1.00      0.99      0.99       587
      raise_hand       0.99      1.00      1.00       577
        sleeping       1.00      0.99      1.00       574
using_smartphone       1.00      1.00      1.00       604
 writing_reading       0.99      1.00      1.00       535

        accuracy                           1.00      4002
       macro avg       1.00      1.00      1.00      4002
    weighted avg       1.00      1.00      1.00      4002

[[543   0   0   0   0   2   0]
 [  0 579   0   0   0   0   1]
 [  0   0 579   3   2   1   2]
 [  0   0   1 575   0   0   1]
 [  2   0   1   0 571   0   0]
 [  2   0   0   0   0 602   0]
 [  0   0   0   0   0   0 535]]


In [10]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # Adjust n_estimators as needed
rf_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_rf = rf_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_rf)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       545
         fatigue       1.00      1.00      1.00       580
         focused       1.00      1.00      1.00       587
      raise_hand       1.00      1.00      1.00       577
        sleeping       1.00      1.00      1.00       574
using_smartphone       1.00      1.00      1.00       604
 writing_reading       1.00      1.00      1.00       535

        accuracy                           1.00      4002
       macro avg       1.00      1.00      1.00      4002
    weighted avg       1.00      1.00      1.00      4002

[[545   0   0   0   0   0   0]
 [  0 580   0   0   0   0   0]
 [  0   0 587   0   0   0   0]
 [  0   0   0 577   0   0   0]
 [  0   0   0   0 574   0   0]
 [  0   0   0   0   0 604   0]
 [  0   0   0   0   0   0 535]]


In [11]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=42) 
lr_model.fit(X_train, y_train)
# ... (Evaluate the model)
y_pred_lr = lr_model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=class_names)) 
print(confusion_matrix(y_test, y_pred_lr)) 

Accuracy: 1.0
                  precision    recall  f1-score   support

      distracted       1.00      1.00      1.00       545
         fatigue       1.00      1.00      1.00       580
         focused       1.00      1.00      1.00       587
      raise_hand       1.00      1.00      1.00       577
        sleeping       1.00      1.00      1.00       574
using_smartphone       1.00      1.00      1.00       604
 writing_reading       1.00      1.00      1.00       535

        accuracy                           1.00      4002
       macro avg       1.00      1.00      1.00      4002
    weighted avg       1.00      1.00      1.00      4002

[[545   0   0   0   0   0   0]
 [  0 580   0   0   0   0   0]
 [  0   0 587   0   0   0   0]
 [  0   0   0 577   0   0   0]
 [  0   0   0   0 574   0   0]
 [  0   0   0   0   0 604   0]
 [  0   0   0   0   0   0 535]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
# --- 5. Prediction on a New Image ---

def predict_image(image_path, model, class_names):
    """Predicts the class of a single image."""
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 

    # Object Detection with YOLOv8
    results = model_yolo(img) 

     # Get bounding box coordinates of the person
    x1, y1, x2, y2 = 0, 0, 0, 0 # Initialize coordinates 
    for r in results:
        boxes = r.boxes.xyxy.tolist()  # Bounding boxes
        classes = r.boxes.cls.tolist()  # Class indices
        for box, cls in zip(boxes, classes):
            if cls == 0:  # Assuming 'person' is class 0 
                x1, y1, x2, y2 = map(int, box) 
                break  # Assuming only one person per image 

    # Square Cropping 
    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2
    crop_size = max(x2 - x1, y2 - y1)  # Size of the larger side

    crop_x1 = max(0, center_x - crop_size // 2)
    crop_y1 = max(0, center_y - crop_size // 2)
    crop_x2 = min(img.shape[1], crop_x1 + crop_size)
    crop_y2 = min(img.shape[0], crop_y1 + crop_size)

    cropped_img = img[crop_y1:crop_y2, crop_x1:crop_x2]

    # Resize cropped image to desired size
    cropped_img = cv2.resize(cropped_img, img_size)
    # Preprocess for VGG16
    cropped_img = image.img_to_array(cropped_img) 
    cropped_img = preprocess_input(cropped_img)
    cropped_img = np.expand_dims(cropped_img, axis=0) 

    features = base_model.predict(cropped_img)
    features = features.reshape(features.shape[0], -1) 
    # features = scaler.transform(features)

    prediction = model.predict(features)[0]  
    predicted_class = class_names[prediction] 

    return predicted_class

new_image_path = 'uploads/away.jpg' # Replace with your image path
predicted_class = predict_image(new_image_path, lr_model, class_names)
print("Predicted Class:", predicted_class)




0: 384x640 1 person, 825.2ms
Speed: 3.0ms preprocess, 825.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Predicted Class: distracted


In [33]:
import joblib
joblib.dump(model, "svm_3.pkl") # kinda good
joblib.dump(knn_model, "knn_3.pkl") #dumb
joblib.dump(dt_model, "dt_3.pkl") # Very stupid
joblib.dump(lr_model, "lr_3.pkl") # good to be fair
joblib.dump(rf_model, "rf_3.pkl")

['rf_3.pkl']