In [1]:
import pandas as pd

# Load the feature dataset
features = pd.read_csv("../data/features.csv")

# Display basic info
print(features.info())
print(features.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537 entries, 0 to 536
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   user_id                        537 non-null    int64  
 1   event                          537 non-null    object 
 2   content_id                     537 non-null    object 
 3   timestamp                      537 non-null    object 
 4   hour_of_day                    537 non-null    int64  
 5   day_of_week                    537 non-null    int64  
 6   is_weekend                     537 non-null    int64  
 7   user_event_count               537 non-null    int64  
 8   unique_content_count           537 non-null    int64  
 9   avg_time_between_interactions  535 non-null    float64
 10  content_interaction_count      537 non-null    int64  
 11  content_share_ratio            537 non-null    float64
 12  session_id                     537 non-null    int

In [2]:
from sklearn.preprocessing import LabelEncoder

# Encode event (target variable)
le = LabelEncoder()
features["event"] = le.fit_transform(features["event"])  # 0 = 'like', 1 = 'share'

In [3]:
# Drop only non-contributory columns but keep relevant features
X = features.drop(columns=["user_id", "content_id", "timestamp"])  # Keep engineered features
y = features["event"]  # Target variable remains the same

In [4]:
from sklearn.model_selection import train_test_split

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
}

In [6]:
features = features.dropna()

In [7]:
# Dictionary to store results
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}\n")
    
    # Store results
    results[name] = {
        "accuracy": accuracy,
        "classification_report": classification_report(y_test, y_pred)
    }


Training Logistic Regression...


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
print(features.isnull().sum())

user_id                          0
event                            0
content_id                       0
timestamp                        0
hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    0
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
dtype: int64


In [9]:
features = pd.get_dummies(features, columns=['event'], drop_first=True)

In [10]:
features = features.drop(['user_id', 'timestamp'], axis=1)

In [11]:
# Dictionary to store results
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # Train the model
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}\n")
    
    # Store results
    results[name] = {
        "accuracy": accuracy,
        "classification_report": classification_report(y_test, y_pred)
    }


Training Logistic Regression...


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [16]:
features.fillna(0, inplace=True)  # Replace NaN with 0

In [17]:
print(features.isnull().sum())  # Ensure no missing values exist

content_id                       0
hour_of_day                      0
day_of_week                      0
is_weekend                       0
user_event_count                 0
unique_content_count             0
avg_time_between_interactions    0
content_interaction_count        0
content_share_ratio              0
session_id                       0
session_length                   0
event_1                          0
dtype: int64


In [18]:
model.fit(X_train, y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values