In [1]:
import joblib

# Load the three serialized model objects from ../models (presumably fitted in
# an earlier notebook — TODO confirm which one produced these artifacts).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code, so only load artifacts that come from a trusted source.
logistic_regression_model = joblib.load("../models/logistic_regression.pkl")
random_forest_model = joblib.load("../models/random_forest.pkl")
xgb_model = joblib.load("../models/xgboost.pkl")

# Only reached when all three loads above returned without raising.
print("Models loaded successfully!")

Models loaded successfully!


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the feature table from disk
features = pd.read_csv("../data/features.csv")

# Predictors: every column except the four listed here (which include the
# target column, "event")
X = features.drop(columns=["user_id", "content_id", "timestamp", "event"])

# Integer-encode the target labels taken from the "event" column
encoder = LabelEncoder()
y = encoder.fit_transform(features["event"])

# Stratified 80/20 hold-out split with a fixed seed; per the original note
# these settings are meant to mirror Notebook 4
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train-test split successful!")

Train-test split successful!


In [4]:
# Count missing values column by column: training split first, then the test split
print(X_train.isna().sum())
print(X_test.isna().sum())

hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    1
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
dtype: int64
hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    1
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
dtype: int64


In [5]:
from sklearn.impute import SimpleImputer

# Mean imputation: the per-column means are learned from the training split
# only and then applied to both splits (swap strategy to "median" if needed).
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)

# Re-wrap the transformed values so the column names are kept. No index is
# passed, so (assuming transform returns a plain array) both frames end up
# with a fresh default index.
X_train = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def evaluate_model(model, X_test, y_test):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        model: Object exposing ``predict(X_test)``.
        X_test: Feature data passed straight to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        None. All results are written to stdout, followed by a divider line.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:\n", matrix)

    # Divider so consecutive reports are easy to tell apart in the output
    divider = "=" * 50
    print("\n" + divider + "\n")

In [8]:
# Report test-split metrics for each loaded model, in the same order as before:
# logistic regression, random forest, then XGBoost.
for heading, fitted_model in (
    ("Logistic Regression Performance:", logistic_regression_model),
    ("Random Forest Performance:", random_forest_model),
    ("XGBoost Performance:", xgb_model),
):
    print(heading)
    evaluate_model(fitted_model, X_test, y_test)

Logistic Regression Performance:
Accuracy: 0.8888888888888888
Classification Report:
               precision    recall  f1-score   support

           0       0.89      1.00      0.94        94
           1       1.00      0.14      0.25        14

    accuracy                           0.89       108
   macro avg       0.94      0.57      0.59       108
weighted avg       0.90      0.89      0.85       108

Confusion Matrix:
 [[94  0]
 [12  2]]


Random Forest Performance:
Accuracy: 0.8796296296296297
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93        94
           1       0.57      0.29      0.38        14

    accuracy                           0.88       108
   macro avg       0.74      0.63      0.66       108
weighted avg       0.86      0.88      0.86       108

Confusion Matrix:
 [[91  3]
 [10  4]]


XGBoost Performance:
Accuracy: 0.8518518518518519
Classification Report:
               precision   

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Hyperparameter grid: inverse regularisation strength (C) crossed with solver
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

# 5-fold cross-validated grid search over the grid above.
# - max_iter is raised above the default of 100: the features are not scaled in
#   this notebook, so lbfgs candidates can stop at the iteration cap before
#   converging, which would make the solver comparison unfair.
# - random_state is fixed (liblinear shuffles the data) so reruns select the
#   same model; 42 matches the seed used for the train/test split.
# NOTE(review): the test-split reports in this notebook show a 94/14 class
# split, so accuracy is dominated by the majority class. Consider
# scoring='f1' or 'balanced_accuracy'; left unchanged here so the
# "Best Accuracy" label printed below stays truthful.
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Persist the refit best estimator. joblib is already imported in the
# notebook's first cell, so the redundant mid-cell import was dropped.
joblib.dump(grid_search.best_estimator_, "../models/logistic_regression_tuned.pkl")

Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Accuracy: 0.8835020519835842


['../models/logistic_regression_tuned.pkl']