In [1]:
import pandas as pd

# Load the feature dataset
features = pd.read_csv("../data/features.csv")

# Check for missing values
print(features.isnull().sum())

# Display dataset information
print(features.info())
print(features.head())

user_id                          0
event                            0
content_id                       0
timestamp                        0
hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    2
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537 entries, 0 to 536
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        537 non-null    int64  
 1   event                          537 non-null    object 
 2   content_id                     537 non-null    object 
 3   timestamp                      537 non-null    object 
 4   hour_of_day                

In [2]:
features.dropna(inplace=True)
print(features.isnull().sum())  # Should all be 0 now

user_id                          0
event                            0
content_id                       0
timestamp                        0
hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    0
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
dtype: int64


In [3]:
# Drop non-contributory columns
X = features.drop(columns=["user_id", "content_id", "timestamp", "event"])  # Keep relevant numerical features
y = features["event"]  # Target variable

# Check data shapes
print(X.shape, y.shape)

(535, 10) (535,)


In [4]:
from sklearn.model_selection import train_test_split

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verify distribution
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

event
like     0.871495
share    0.128505
Name: proportion, dtype: float64
event
like     0.869159
share    0.130841
Name: proportion, dtype: float64


In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Initialize and fit LabelEncoder
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)  # Ensure the encoding is stored in a new variable

# Confirm encoding
print("Class Mapping:", dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
print("Encoded y values:", np.unique(y_encoded))  # Check if we have only [0,1]

# Replace y with the encoded version
y = y_encoded

Class Mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1)}
Encoded y values: [0 1]


In [8]:
y = y.astype(int)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Verify that y_train and y_test contain only numerical values
print("y_train unique values:", np.unique(y_train))
print("y_test unique values:", np.unique(y_test))

y_train unique values: [0 1]
y_test unique values: [0 1]


In [10]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained successfully!")

Logistic Regression trained successfully!
Random Forest trained successfully!
XGBoost trained successfully!


In [12]:
import joblib

# Loop through the models dictionary and save each model
for name, model in models.items():
    joblib.dump(model, f"../models/{name.lower().replace(' ', '_')}.pkl")

print("Models saved successfully!")

Models saved successfully!
