## Traditional Methods of Feature engineering and ML

In [2]:
import pandas as pd

df = pd.read_csv("../data/reddit_ai_cleaned.csv")

In [3]:
# add bianry label to predict which posts will get high engagement
df["high_score"] = (df["score"] >= df["score"].median()).astype(int)
df.head()

Unnamed: 0,title_clean,selftext_clean,subreddit,score,upvote_ratio,num_comments,post_length,sentiment_polarity,sentiment_subjectivity,created_datetime,date,year,text,high_score
0,How to retrieve instructions given to annotato...,"Hello, I am a communications student, and as p...",MachineLearning,13,0.93,7,171,0.095833,0.404333,2025-10-10 08:49:17,2025-10-10,2025,How to retrieve instructions given to annotato...,1
1,Built an ML-based Variant Impact Predictor non...,"Hey folks, I ve been working on a small ML pro...",MachineLearning,0,0.4,10,244,0.131198,0.393388,2025-10-09 18:40:15,2025-10-09,2025,Built an ML-based Variant Impact Predictor non...,0
2,Tensorflow and Musicnn,"Hi all, I m struggling with Tensorflow and an ...",MachineLearning,1,0.57,11,386,0.135743,0.404892,2025-10-06 07:19:05,2025-10-06,2025,"Tensorflow and Musicnn Hi all, I m struggling ...",0
3,Experiences with active learning for real appl...,I'm tinkering with an application of human pos...,MachineLearning,4,0.83,6,310,-0.025505,0.501641,2025-10-04 21:53:21,2025-10-04,2025,Experiences with active learning for real appl...,0
4,Thesis direction mechanistic interpretability ...,"Hi all, I'm an undergrad Computer Science stud...",MachineLearning,12,0.8,13,165,0.219318,0.450271,2025-10-02 20:50:44,2025-10-02,2025,Thesis direction mechanistic interpretability ...,1


In [4]:
from sklearn.preprocessing import StandardScaler

("numeric", StandardScaler(), ["post_length", "sentiment_polarity", "sentiment_subjectivity"])

('numeric',
 StandardScaler(),
 ['post_length', 'sentiment_polarity', 'sentiment_subjectivity'])

### Logistic Regression

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, roc_auc_score

X = df[[
    "title_clean",
    "selftext_clean",
    "post_length",
    "sentiment_polarity",
    "sentiment_subjectivity"
]]
y = df["high_score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.90,
    ngram_range=(1,2),
    stop_words="english",
    sublinear_tf=True
)

preprocess = ColumnTransformer(
    transformers=[
        ("tfidf_title", tfidf, "title_clean"),
        ("tfidf_text", tfidf, "selftext_clean"),
        ("numeric", StandardScaler(), ["post_length",
                                       "sentiment_polarity",
                                       "sentiment_subjectivity"])
    ]
)

clf = LogisticRegression(max_iter=1000, class_weight="balanced")
model = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(max_iter=500))
])

model.fit(X_train, y_train)
preds = model.predict(X_test)
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

           0       0.74      0.50      0.60        52
           1       0.55      0.78      0.65        41

    accuracy                           0.62        93
   macro avg       0.65      0.64      0.62        93
weighted avg       0.66      0.62      0.62        93



### Linear SVC

In [7]:
model2 = Pipeline([
    ("preprocess", preprocess),
    ("clf", LinearSVC(class_weight="balanced", max_iter=5000, C=0.5))
])

model2.fit(X_train, y_train)
preds = model2.predict(X_test)

print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.72      0.63      0.67        52
           1       0.60      0.68      0.64        41

    accuracy                           0.66        93
   macro avg       0.66      0.66      0.65        93
weighted avg       0.66      0.66      0.66        93



### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV


In [10]:
# Option 2: With hyperparameter tuning
model_rf_tuned = Pipeline([
    ("preprocess", preprocess),
    ("clf", RandomForestClassifier(class_weight="balanced", random_state=42))
])

# Define parameter grid
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 'log2', 0.5]  # Important for high-dimensional TF-IDF
}

# Use RandomizedSearchCV for faster tuning (or GridSearchCV for exhaustive)
from sklearn.model_selection import RandomizedSearchCV

rf_search = RandomizedSearchCV(
    model_rf_tuned,
    param_grid,
    n_iter=20,  # Number of parameter settings to sample
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

rf_search.fit(X_train, y_train)

print("Best parameters:", rf_search.best_params_)
print("Best CV score:", rf_search.best_score_)

# Evaluate on test set
best_rf = rf_search.best_estimator_
preds_rf_best = best_rf.predict(X_test)

print("\nTuned Random Forest Classification Report:")
print(classification_report(y_test, preds_rf_best))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters: {'clf__n_estimators': 200, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 2, 'clf__max_features': 0.5, 'clf__max_depth': 20}
Best CV score: 0.559738198339088

Tuned Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.56      0.64        52
           1       0.58      0.78      0.67        41

    accuracy                           0.66        93
   macro avg       0.67      0.67      0.66        93
weighted avg       0.68      0.66      0.65        93

