# Testing all models with cleaned data (imbalanced)

## Imports required:

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score, confusion_matrix,f1_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR,SVC
from imblearn.over_sampling import SMOTE 
from sklearn.neighbors import KNeighborsClassifier ,KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, StackingClassifier
from xgboost import XGBClassifier


## Regression models:

### Data split

In [39]:
df= pd.read_csv("cleaned_data.csv")

X = df.drop(columns=['popularity'])  # Replace 'target_column' with your actual target variable
y = df['popularity']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.squeeze()
y_test = y_test.squeeze()


### Linear regression:

In [40]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100

print(f"Model Accuracy (with ±{tolerance}% tolerance): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 401.3028
R-squared Score: 0.0179
Model Accuracy (with ±10% tolerance): 38.83%


### Polynomial Regression:

In [41]:
# Set polynomial degree
degree = 3  # You can experiment with higher degrees

# Transform features into polynomial features
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Make predictions
y_pred = model.predict(X_test_poly)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test_poly, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation
tolerance = 10  # Absolute error margin
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100

print(f"Model Accuracy (with ±{tolerance}% tolerance): {accuracy_with_tolerance:.2f}%")

Mean Squared Error: 393.4078
R-squared Score: 0.0372
Model Accuracy (with ±10% tolerance): 39.66%


### KNN Regressor:

In [42]:
# Initialize the model (choose k=5 as a starting point)
model = KNeighborsRegressor(n_neighbors=5)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 466.0415
R-squared Score: -0.1406
Model Accuracy (with ±10 margin): 36.85%


### Random Forest Regression:

In [43]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 410.3948
R-squared Score: -0.0044
Model Accuracy (with ±10 margin): 38.56%


### SVR

In [44]:
# Standardize features and target
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Reshape y to 2D before scaling, then back to 1D
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

# Initialize the model with RBF kernel (default)
model = SVR(kernel='rbf')

# Train the model
model.fit(X_train_scaled, y_train_scaled)
y_pred_scaled = model.predict(X_test_scaled)

# Convert predictions back to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 405.9112
R-squared Score: 0.0066
Model Accuracy (with ±10 margin): 43.08%


## Classification models:

### Logistical Regression (without SMOTE)

In [7]:
df = pd.read_csv("cleaned_data.csv")
# Loop over each threshold
# Define thresholds to test
thresholds = [50, 60, 70]
for threshold in thresholds:
    print("="*40)
    print(f"Testing Threshold: {threshold}")
    
    # Create a fresh copy of the data each time
    data = df.copy()

    # Convert popularity to binary based on current threshold
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train Logistic Regression model
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)
    print("F1 Score:", f1)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

Testing Threshold: 50
Model Accuracy: 57.65%
F1 Score: 0.6594827586206896
Confusion Matrix:
[[ 497  916]
 [ 348 1224]]
Testing Threshold: 60
Model Accuracy: 68.91%
F1 Score: 0.0
Confusion Matrix:
[[2057    0]
 [ 928    0]]
Testing Threshold: 70
Model Accuracy: 87.24%
F1 Score: 0.0
Confusion Matrix:
[[2604    0]
 [ 381    0]]


### Logistical regression with SMOTE

In [8]:
df = pd.read_csv("cleaned_data.csv")
threshold = 70
print("=" * 50)
print(f"Testing Threshold: {threshold}")

# Convert popularity to binary
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to training data
smote = SMOTE(random_state=42, sampling_strategy=0.5)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Model Accuracy: {accuracy:.2f}%")
f1 = f1_score(y_test, y_pred)

print("F1 Score:", f1)
# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Testing Threshold: 70
Model Accuracy: 84.99%
F1 Score: 0.13513513513513514
Confusion Matrix:
[[2502  102]
 [ 346   35]]


### KNN 

In [9]:
df = pd.read_csv("cleaned_data.csv")
thresholds = [50, 60, 70]

# Number of neighbors for KNN
k = 5

for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Copy the original data
    data = df.copy()

    # Binarize target based on threshold
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train KNN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)

    # Predict
    y_pred = knn.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"KNN Model with k={k}")
    print(f"Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

Testing Threshold: 50
KNN Model with k=5
Model Accuracy: 55.34%
F1 Score: 0.5904761904761905
Confusion Matrix:
[[691 722]
 [611 961]]
Testing Threshold: 60
KNN Model with k=5
Model Accuracy: 63.12%
F1 Score: 0.30971786833855797
Confusion Matrix:
[[1637  420]
 [ 681  247]]
Testing Threshold: 70
KNN Model with k=5
Model Accuracy: 84.96%
F1 Score: 0.096579476861167
Confusion Matrix:
[[2512   92]
 [ 357   24]]


### RFC without smote

In [10]:
df = pd.read_csv("cleaned_data.csv")

# Thresholds to test
thresholds = [50,60, 70]

# Loop over thresholds
for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Convert target variable: Popularity > threshold → 1, else 0
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Split data (80% train, 20% test) with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling (optional for Random Forest, but kept for consistency)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train Random Forest model
    rfc = RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        class_weight="balanced",
        max_depth=20,
        min_samples_split=5
    )
    rfc.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = rfc.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

Testing Threshold: 50
Model Accuracy: 58.22%
F1 Score: 0.627872276932259
Confusion Matrix:
 [[ 686  727]
 [ 520 1052]]
Testing Threshold: 60
Model Accuracy: 67.40%
F1 Score: 0.3412322274881517
Confusion Matrix:
 [[1760  297]
 [ 676  252]]
Testing Threshold: 70
Model Accuracy: 83.99%
F1 Score: 0.16140350877192983
Confusion Matrix:
 [[2461  143]
 [ 335   46]]


### RFC with SMOTE

In [11]:
df = pd.read_csv("cleaned_data.csv")

# Thresholds to test
thresholds = [60, 70]

# Loop over each threshold
for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Convert target variable: Popularity > threshold → 1, else 0
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Split data (80% train, 20% test) with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Apply SMOTE to training data
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train Random Forest model
    rfc = RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        class_weight="balanced",
        max_depth=20,
        min_samples_split=5
    )
    rfc.fit(X_train_scaled, y_train_resampled)

    # Make predictions
    y_pred = rfc.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

Testing Threshold: 60
Model Accuracy: 66.40%
F1 Score: 0.3623649078194533
Confusion Matrix:
 [[1697  360]
 [ 643  285]]
Testing Threshold: 70
Model Accuracy: 74.30%
F1 Score: 0.25316455696202533
Confusion Matrix:
 [[2088  516]
 [ 251  130]]


### SVM without SMOTE

In [12]:
df = pd.read_csv("cleaned_data.csv")

# Define thresholds to loop over
thresholds = [50, 60, 70]

# Loop over thresholds
for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Convert popularity to binary target
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train SVM
    svm = SVC(kernel='rbf', C=1, probability=True, random_state=42)
    svm.fit(X_train_scaled, y_train)

    # Predictions
    y_pred = svm.predict(X_test_scaled)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"SVM Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

Testing Threshold: 50
SVM Model Accuracy: 59.73%
F1 Score: 0.6828496042216359
Confusion Matrix:
 [[ 489  924]
 [ 278 1294]]
Testing Threshold: 60
SVM Model Accuracy: 68.84%
F1 Score: 0.0
Confusion Matrix:
 [[2055    2]
 [ 928    0]]
Testing Threshold: 70
SVM Model Accuracy: 87.24%
F1 Score: 0.0
Confusion Matrix:
 [[2604    0]
 [ 381    0]]


### SVM with smote

In [13]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Thresholds to evaluate
thresholds = [60, 70]

# Loop over each threshold
for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Convert target to binary
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Apply SMOTE to training data
    smote = SMOTE(sampling_strategy=0.5, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test)

    # Initialize and train SVM model
    svm = SVC(kernel='rbf', C=1, probability=True, random_state=42)
    svm.fit(X_train_scaled, y_train_resampled)

    # Make predictions
    y_pred = svm.predict(X_test_scaled)

    # Evaluate model
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"SVM Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

Testing Threshold: 60
SVM Model Accuracy: 68.81%
F1 Score: 0.0
Confusion Matrix:
 [[2054    3]
 [ 928    0]]
Testing Threshold: 70
SVM Model Accuracy: 81.64%
F1 Score: 0.20348837209302326
Confusion Matrix:
 [[2367  237]
 [ 311   70]]


### XGB 

In [14]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Thresholds to test
thresholds = [50, 60, 70]

# Loop through thresholds
for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Convert popularity to binary based on threshold
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    # Define features and target
    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # XGBoost Classifier
    xgb = XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=8,
        random_state=42,
        scale_pos_weight=5  # You can also set this dynamically if needed
    )

    # Train model
    xgb.fit(X_train_scaled, y_train)

    # Predict
    y_pred = xgb.predict(X_test_scaled)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"XGBoost Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Testing Threshold: 50
XGBoost Model Accuracy: 55.88%
F1 Score: 0.6930785364716849
Confusion Matrix:
 [[ 181 1232]
 [  85 1487]]
Testing Threshold: 60
XGBoost Model Accuracy: 50.35%
F1 Score: 0.4889655172413793
Confusion Matrix:
 [[ 794 1263]
 [ 219  709]]
Testing Threshold: 70
XGBoost Model Accuracy: 77.72%
F1 Score: 0.2569832402234637
Confusion Matrix:
 [[2205  399]
 [ 266  115]]


### Stacking models:

In [15]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Thresholds to evaluate
thresholds = [50, 60, 70]

for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Prepare data
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define base models
    base_models = [
        ('svm', SVC(kernel='rbf', C=1, probability=True, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('rfc', RandomForestClassifier(n_estimators=100, random_state=42))
    ]

    # Define meta-model
    meta_model = LogisticRegression()

    # Stacking classifier
    stacked_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5
    )

    # Train
    stacked_model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = stacked_model.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Stacked Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

Testing Threshold: 50
Stacked Model Accuracy: 59.06%
F1 Score: 0.6555806087936866
Confusion Matrix:
[[ 600  813]
 [ 409 1163]]
Testing Threshold: 60
Stacked Model Accuracy: 68.51%
F1 Score: 0.07480314960629922
Confusion Matrix:
[[2007   50]
 [ 890   38]]
Testing Threshold: 70


KeyboardInterrupt: 

In [None]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Thresholds to evaluate
thresholds = [50, 60, 70]

for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Prepare data
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define base models
    base_models = [
        ('svm', SVC(kernel='rbf', C=1, probability=True, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ]

    # Define meta-model
    meta_model = LogisticRegression()

    # Stacking classifier
    stacked_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5
    )

    # Train
    stacked_model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = stacked_model.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Stacked Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

Testing Threshold: 50
Stacked Model Accuracy: 59.33%
Confusion Matrix:
[[ 568  845]
 [ 369 1203]]
Testing Threshold: 60
Stacked Model Accuracy: 68.84%
Confusion Matrix:
[[2053    4]
 [ 926    2]]
Testing Threshold: 70
Stacked Model Accuracy: 87.24%
Confusion Matrix:
[[2604    0]
 [ 381    0]]


In [None]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Thresholds to evaluate
thresholds = [50, 60, 70]

for threshold in thresholds:
    print("=" * 50)
    print(f"Testing Threshold: {threshold}")

    # Prepare data
    data = df.copy()
    data['popularity'] = (data['popularity'] > threshold).astype(int)

    X = data.drop(columns=['popularity'])
    y = data['popularity']

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define base models
    base_models = [
        ('knn', KNeighborsClassifier(n_neighbors=5)),
        ('rfc', RandomForestClassifier(n_estimators=100, random_state=42))
    ]

    # Define meta-model
    meta_model = LogisticRegression()

    # Stacking classifier
    stacked_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5
    )

    # Train
    stacked_model.fit(X_train_scaled, y_train)

    # Predict
    y_pred = stacked_model.predict(X_test_scaled)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred) * 100
    print(f"Stacked Model Accuracy: {accuracy:.2f}%")
    f1 = f1_score(y_test, y_pred)

    print("F1 Score:", f1)
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

Testing Threshold: 50
Stacked Model Accuracy: 57.99%
Confusion Matrix:
[[ 605  808]
 [ 446 1126]]
Testing Threshold: 60
Stacked Model Accuracy: 68.54%
Confusion Matrix:
[[2006   51]
 [ 888   40]]
Testing Threshold: 70
Stacked Model Accuracy: 87.17%
Confusion Matrix:
[[2595    9]
 [ 374    7]]
