# Testing all models with balanced (at 70)

## Imports required:

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error,r2_score,accuracy_score, confusion_matrix
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.svm import SVR,SVC
from imblearn.over_sampling import SMOTE 
from sklearn.neighbors import KNeighborsClassifier ,KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, StackingClassifier
from xgboost import XGBClassifier


## Regression models:

### Data split

In [31]:
df= pd.read_csv("balanced.csv")

X = df.drop(columns=['popularity'])  # Replace 'target_column' with your actual target variable
y = df['popularity']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.squeeze()
y_test = y_test.squeeze()


### Linear regression:

In [32]:
# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100

print(f"Model Accuracy (with ±{tolerance}% tolerance): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 424.5210
R-squared Score: 0.0795
Model Accuracy (with ±10% tolerance): 29.27%


### Polynomial Regression:

In [33]:
# Set polynomial degree
degree = 3  # You can experiment with higher degrees

# Transform features into polynomial features
poly = PolynomialFeatures(degree=degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_poly, y_train)

# Make predictions
y_pred = model.predict(X_test_poly)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test_poly, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation
tolerance = 10  # Absolute error margin
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100

print(f"Model Accuracy (with ±{tolerance}% tolerance): {accuracy_with_tolerance:.2f}%")

Mean Squared Error: 408.0189
R-squared Score: 0.1153
Model Accuracy (with ±10% tolerance): 33.42%


### KNN Regressor:

In [34]:
# Initialize the model (choose k=5 as a starting point)
model = KNeighborsRegressor(n_neighbors=5)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 513.2436
R-squared Score: -0.1128
Model Accuracy (with ±10 margin): 37.05%


### Random Forest Regression:

In [35]:
# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 431.5407
R-squared Score: 0.0643
Model Accuracy (with ±10 margin): 35.36%


### SVR

In [37]:
# Standardize features and target
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Reshape y to 2D before scaling, then back to 1D
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

# Initialize the model with RBF kernel (default)
model = SVR(kernel='rbf')

# Train the model
model.fit(X_train_scaled, y_train_scaled)
y_pred_scaled = model.predict(X_test_scaled)

# Convert predictions back to original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

# Calculate Mean Squared Error & R-squared Score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Tolerance-Based Accuracy Calculation (±10 margin)
tolerance = 10  # Absolute error margin

# Count correct predictions within tolerance
correct_predictions = np.sum(np.abs(y_test - y_pred) <= tolerance)

# Calculate tolerance-based accuracy
accuracy_with_tolerance = (correct_predictions / len(y_test)) * 100
print(f"Model Accuracy (with ±{tolerance} margin): {accuracy_with_tolerance:.2f}%")


Mean Squared Error: 430.1851
R-squared Score: 0.0673
Model Accuracy (with ±10 margin): 48.32%


## Classification models:

### Logistical Regression

In [27]:
# Load dataset
df = pd.read_csv("balanced.csv")

threshold = 70  # Set threshold to 70

# Create a fresh copy of the data
data = df.copy()

# Convert popularity to binary based on the threshold
data['popularity'] = (data['popularity'] > threshold).astype(int)

# Define features and target
X = data.drop(columns=['popularity'])
y = data['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Threshold: {threshold}")
print(f"Model Accuracy: {accuracy:.2f}%")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Threshold: 70
Model Accuracy: 61.27%
Confusion Matrix:
[[211 180]
 [119 262]]


### KNN 

In [22]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
print("=" * 50)
print(f"Testing Threshold: {threshold}")

# Binarize target based on threshold
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# KNN model
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train)

# Predict
y_pred = knn.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"KNN Model with k={k}")
print(f"Model Accuracy: {accuracy:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Testing Threshold: 70
KNN Model with k=5
Model Accuracy: 58.16%
Confusion Matrix:
[[207 184]
 [139 242]]


### RFC

In [23]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
print("=" * 50)
print(f"Testing Threshold: {threshold}")

# Convert target variable: Popularity > threshold → 1, else 0
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Split data (80% train, 20% test) with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (optional for Random Forest, but kept for consistency)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train Random Forest model
rfc = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    class_weight="balanced",
    max_depth=20,
    min_samples_split=5
)
rfc.fit(X_train_scaled, y_train)

# Make predictions
y_pred = rfc.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Model Accuracy: {accuracy:.2f}%")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Testing Threshold: 70
Model Accuracy: 62.31%
Confusion Matrix:
 [[234 157]
 [134 247]]


### SVM

In [25]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
print("=" * 50)
print(f"Testing Threshold: {threshold}")

# Convert popularity to binary target
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train SVM
svm = SVC(kernel='rbf', C=1, probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)

# Predictions
y_pred = svm.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"SVM Model Accuracy: {accuracy:.2f}%")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Testing Threshold: 70
SVM Model Accuracy: 62.18%
Confusion Matrix:
 [[198 193]
 [ 99 282]]


### XGB 

In [38]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
print("=" * 50)
print(f"Testing Threshold: {threshold}")

# Convert popularity to binary based on threshold
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize XGBoost Classifier
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=8,
    random_state=42,
    scale_pos_weight=5
)

# Train model
xgb.fit(X_train_scaled, y_train)

# Predict
y_pred = xgb.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"XGBoost Model Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

Testing Threshold: 70
XGBoost Model Accuracy: 62.31%
Confusion Matrix:
[[151 240]
 [ 51 330]]


### Stacking models

In [41]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
base_models = [
    ('svm', SVC(kernel='rbf', C=1, probability=True, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('rfc', RandomForestClassifier(n_estimators=100, random_state=42))
]

# Define meta-model (stacker)
meta_model = LogisticRegression()

# Create StackingClassifier
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

# Train stacked model
stacked_model.fit(X_train_scaled, y_train)

# Predict
y_pred = stacked_model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Stacked Model Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Stacked Model Accuracy: 63.34%
Confusion Matrix:
[[231 160]
 [123 258]]


In [51]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
base_models = [
    ('svm', SVC(kernel='rbf', C=1, probability=True, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]

# Define meta-model (stacker)
meta_model = LogisticRegression()

# Create StackingClassifier
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

# Train stacked model
stacked_model.fit(X_train_scaled, y_train)

# Predict
y_pred = stacked_model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Stacked Model Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Stacked Model Accuracy: 63.47%
Confusion Matrix:
[[228 163]
 [119 262]]


In [43]:
# Load dataset
df = pd.read_csv("cleaned_data.csv")

# Set threshold
threshold = 70
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
base_models = [
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('rfc', RandomForestClassifier(n_estimators=100, random_state=42))
]

# Define meta-model (stacker)
meta_model = LogisticRegression()

# Create StackingClassifier
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

# Train stacked model
stacked_model.fit(X_train_scaled, y_train)

# Predict
y_pred = stacked_model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Stacked Model Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Stacked Model Accuracy: 87.17%
Confusion Matrix:
[[2595    9]
 [ 374    7]]


In [None]:
# Load dataset
df = pd.read_csv("balanced.csv")

# Set threshold
threshold = 70
df['popularity'] = (df['popularity'] > threshold).astype(int)

# Define features and target
X = df.drop(columns=['popularity'])
y = df['popularity']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
base_models = [
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('logreg', LogisticRegression())
]

# Define meta-model (SVM)
meta_model = SVC(kernel='rbf', C=1, probability=True, random_state=42)

# Create StackingClassifier
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5
)

# Train stacked model
stacked_model.fit(X_train_scaled, y_train)

# Predict
y_pred = stacked_model.predict(X_test_scaled)

# Evaluate
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Stacked Model Accuracy (SVM as meta-model): {accuracy:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Stacked Model Accuracy (SVM as meta-model): 58.16%
Confusion Matrix:
[[207 184]
 [139 242]]
