In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
import json

#########################################
# BLOCK 1: Load and Prepare Data
#########################################
# Read the dataset from disk.
output_csv_path = "output.csv"
df = pd.read_csv(output_csv_path)

# Keep only the feature and target columns, discard rows where either one
# is missing, and store the feature column as float.
df = (
    df[["comments_count", "like_count"]]
    .dropna()
    .astype({"comments_count": float})
)

# Single-column feature frame (comments_count) and target series (like_count).
X, y = df[["comments_count"]], df["like_count"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#########################################
# BLOCK 2: Train and Evaluate Multiple Models
#########################################

# Candidate regressors, keyed by the display name used in the printed report
# and in the comparison table.
# NOTE(review): scikit-learn documents SVR fit time as more than quadratic in
# the number of samples. The output recorded under this cell stops after the
# Random Forest results with a KeyboardInterrupt -- presumably during the SVR
# fit; confirm before running on a large dataset.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor (SVR)": SVR(kernel='rbf', C=1.0, epsilon=0.1),
    "XGBoost Regressor": xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        # `random_state` is the documented scikit-learn-API name for the RNG
        # seed (the original passed the legacy `seed` keyword); it also
        # matches the RandomForestRegressor entry above.
        random_state=42,
    ),
}

# One dict per model: {"Model", "MSE", "R^2 Score"}.
metrics = []

for name, model in models.items():
    # Fit on the training split.
    model.fit(X_train, y_train)

    # Score on the held-out split.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metrics.append({
        "Model": name,
        "MSE": mse,
        "R^2 Score": r2
    })

    print(f"Model: {name}")
    print(f"  MSE: {mse}")
    print(f"  R^2 Score: {r2}")
    print()

#########################################
# BLOCK 3: Compare Models
#########################################
# Tabulate the collected per-model metrics, highest R^2 first.
metrics_df = pd.DataFrame(metrics).sort_values(by="R^2 Score", ascending=False)

print("Model Comparison:", metrics_df, sep="\n")

#########################################
# BLOCK 4: Best Model for Predictions
#########################################
# Take the model in the first row of the comparison table (the code relies on
# metrics_df already being sorted by R^2, best first).
best_model_name = metrics_df.iloc[0]["Model"]
best_model = models[best_model_name]

# Input: one JSON object per line; output: a single JSON object mapping
# post id -> predicted like count.
test_reg_in_path = "test-regression-round1.jsonl"
test_reg_out_path = "test-regression-round1out.jsonl"

# Collect every sample first so the model is called once on the whole batch
# instead of once per line.
post_ids = []
comment_counts = []

with open(test_reg_in_path, "r", encoding="utf-8") as fh:
    for line in fh:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        sample = json.loads(line)
        post_ids.append(sample.get("id", "NO_ID"))
        # A missing key and an explicit JSON null are both treated as 0, so
        # the model never receives None/NaN as a feature value.
        comments_count = sample.get("comments_count")
        comment_counts.append(0 if comments_count is None else comments_count)

output_dict = {}
if post_ids:  # estimators reject zero-row input, so skip predict when empty
    # Same column name and float dtype as the training feature frame.
    X_test_vec = pd.DataFrame({"comments_count": comment_counts}, dtype=float)
    predictions = best_model.predict(X_test_vec)
    # Like counts are written as integers; when an id repeats, the last
    # occurrence wins.
    output_dict = {
        post_id: int(round(pred_val))
        for post_id, pred_val in zip(post_ids, predictions)
    }

# Write output
with open(test_reg_out_path, "w", encoding="utf-8") as f:
    json.dump(output_dict, f, ensure_ascii=False, indent=2)

print(f"Best Model: {best_model_name}")
print(f"Predictions written to: {test_reg_out_path}")

Model: Linear Regression
  MSE: 2717831151.3651333
  R^2 Score: -0.2516456858247138

Model: Random Forest
  MSE: 1911804141.4293933
  R^2 Score: 0.11955479480025966



KeyboardInterrupt: 