In [2]:
import pandas as pd

In [None]:
df = pd.read_csv('reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [10]:
import optuna
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 2: TF-IDF vectorization setup
ngram_range = (1, 3)  # Trigram
max_features = 1000  # Set max_features to 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

# **Step 3: Train-test split**
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# **Step 4: Fit TF-IDF only on training data**
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# **Step 5: Compute Class Weights Instead of SMOTE**
class_weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(np.unique(y_train), class_weights)}

# Step 6: Optuna objective function with Stratified K-Fold CV
def objective_logreg(trial):
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

    model = LogisticRegression(
        C=C, penalty=penalty, solver=solver, class_weight=class_weight_dict, random_state=42
    )

    # Perform Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_vec, y_train, cv=skf, scoring='accuracy')

    return scores.mean()  # Return mean accuracy across folds

# Step 7: Run Optuna with early stopping
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_logreg, n_trials=50, timeout=600)  # 50 trials or 10 min max

    # Get the best parameters
    best_params = study.best_params
    best_model = LogisticRegression(
        C=best_params['C'], penalty=best_params['penalty'], 
        solver='liblinear' if best_params['penalty'] == 'l1' else 'lbfgs',
        class_weight=class_weight_dict, random_state=42
    )

    # Train final model on full training data
    best_model.fit(X_train_vec, y_train)

    # Evaluate on test data
    y_pred = best_model.predict(X_test_vec)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

# Run the experiment
run_optuna_experiment()


[I 2025-02-10 23:43:38,777] A new study created in memory with name: no-name-a9431626-1d40-4bd4-87f0-498fbe5b8174
[I 2025-02-10 23:43:38,940] Trial 0 finished with value: 0.4301544554731719 and parameters: {'C': 0.006401481341040295, 'penalty': 'l1'}. Best is trial 0 with value: 0.4301544554731719.
[I 2025-02-10 23:43:39,175] Trial 1 finished with value: 0.6947045947153376 and parameters: {'C': 0.06098699434978811, 'penalty': 'l1'}. Best is trial 1 with value: 0.6947045947153376.
[I 2025-02-10 23:43:39,355] Trial 2 finished with value: 0.6762929116857908 and parameters: {'C': 0.0031574399936313753, 'penalty': 'l2'}. Best is trial 1 with value: 0.6947045947153376.
[I 2025-02-10 23:43:39,485] Trial 3 finished with value: 0.6673257336555043 and parameters: {'C': 0.0010035266167671242, 'penalty': 'l2'}. Best is trial 1 with value: 0.6947045947153376.
[I 2025-02-10 23:43:39,625] Trial 4 finished with value: 0.4301544554731719 and parameters: {'C': 0.005810024987815241, 'penalty': 'l1'}. Bes

Best Parameters: {'C': 1.0415352084601333, 'penalty': 'l1'}
Test Accuracy: 0.7984


In [13]:
import optuna
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Ensure df is defined (Replace with actual DataFrame loading)
# df = pd.read_csv("your_dataset.csv")  # Uncomment this line if needed

# Step 1: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 2: TF-IDF vectorization setup
ngram_range = (1, 3)  # Trigram
max_features = 10000  # Set max_features to 10000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

# **Step 3: Train-test split**
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# **Step 4: Fit TF-IDF only on training data**
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# **Step 5: Compute Class Weights Correctly**
all_classes = np.unique(np.concatenate([y_train, y_test]))  # Ensure all labels are considered
class_weights = compute_class_weight(class_weight="balanced", classes=all_classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(all_classes, class_weights)}

# Step 6: Optuna objective function with Stratified K-Fold CV
def objective_logreg(trial):
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

    model = LogisticRegression(
        C=C, penalty=penalty, solver=solver, class_weight=class_weight_dict, random_state=42
    )

    # Perform Stratified K-Fold Cross-Validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train_vec, y_train, cv=skf, scoring='accuracy', n_jobs=-1)

    return scores.mean()  # Return mean accuracy across folds

# Step 7: Run Optuna Optimization
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_logreg, n_trials=50)  # 50 trials

    # Get the best parameters
    best_params = study.best_params
    best_model = LogisticRegression(
        C=best_params['C'], penalty=best_params['penalty'], 
        solver='liblinear' if best_params['penalty'] == 'l1' else 'lbfgs',
        class_weight=class_weight_dict, random_state=42
    )

    # Train final model on full training data
    best_model.fit(X_train_vec, y_train)

    # Evaluate on test data
    y_pred = best_model.predict(X_test_vec)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

# Run the experiment
run_optuna_experiment()


[I 2025-02-10 23:45:50,838] A new study created in memory with name: no-name-38863503-39fa-4f5b-9bb6-d213e8f4bd65
[I 2025-02-10 23:45:53,463] Trial 0 finished with value: 0.878925174303404 and parameters: {'C': 2.2943012059790817, 'penalty': 'l1'}. Best is trial 0 with value: 0.878925174303404.
[I 2025-02-10 23:45:55,086] Trial 1 finished with value: 0.6919089212939509 and parameters: {'C': 0.007244749055966219, 'penalty': 'l2'}. Best is trial 0 with value: 0.878925174303404.
[I 2025-02-10 23:45:56,851] Trial 2 finished with value: 0.7914689561618983 and parameters: {'C': 0.2696153201010838, 'penalty': 'l1'}. Best is trial 0 with value: 0.878925174303404.
[I 2025-02-10 23:45:57,111] Trial 3 finished with value: 0.4301544554731719 and parameters: {'C': 0.0014493741104949058, 'penalty': 'l1'}. Best is trial 0 with value: 0.878925174303404.
[I 2025-02-10 23:45:58,157] Trial 4 finished with value: 0.840328571399505 and parameters: {'C': 2.7943140231269403, 'penalty': 'l2'}. Best is trial 0

Best Parameters: {'C': 3.0460529788597395, 'penalty': 'l1'}
Test Accuracy: 0.8803


In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Ensure df is defined (Replace with actual DataFrame loading)
# df = pd.read_csv("your_dataset.csv")  # Uncomment if needed

# Step 1: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 2: TF-IDF vectorization setup
ngram_range = (1, 3)  # Trigram
max_features = 10000  # Set max_features to 10000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

# **Step 3: Train-test split**
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# **Step 4: Fit TF-IDF only on training data**
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# **Step 5: Compute Class Weights Correctly**
all_classes = np.unique(np.concatenate([y_train, y_test]))  # Ensure all labels are considered
class_weights = compute_class_weight(class_weight="balanced", classes=all_classes, y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(all_classes, class_weights)}

# **Step 6: Train the Best Model**
best_model = LogisticRegression(
    C=3.04, penalty='l1', solver='liblinear', class_weight=class_weight_dict, random_state=42
)

# Train the model
best_model.fit(X_train_vec, y_train)

# Make predictions
y_pred = best_model.predict(X_test_vec)

# **Step 7: Evaluate the Model**
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8803
Classification Report:
              precision    recall  f1-score   support

          -1       0.84      0.78      0.81      1650
           0       0.86      0.96      0.91      2529
           1       0.92      0.87      0.90      3154

    accuracy                           0.88      7333
   macro avg       0.87      0.87      0.87      7333
weighted avg       0.88      0.88      0.88      7333



In [4]:
df = pd.read_csv('../reddit_preprocessing.csv').dropna()
df.shape

(36662, 2)

In [6]:
import optuna
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 1: Remove rows where the target labels (category) are NaN
df = df.dropna(subset=['category'])

# Step 2: TF-IDF vectorization setup
ngram_range = (1, 3)  # Trigram
max_features = 10000  # Set max_features to 1000
vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)

# **Step 3: Train-test split**
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
)

# **Step 4: Fit TF-IDF only on training data**
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 5: Optuna objective function with Stratified K-Fold CV
def objective_logreg(trial):
    C = trial.suggest_float('C', 1e-4, 10.0, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []

    # Perform cross-validation manually
    for train_idx, val_idx in skf.split(X_train_vec, y_train):
        X_fold_train, X_fold_val = X_train_vec[train_idx], X_train_vec[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # ✅ Compute Class Weights ONLY on the training fold (Fix Data Leakage)
        fold_classes = np.unique(y_fold_train)
        fold_class_weights = compute_class_weight(class_weight="balanced", classes=fold_classes, y=y_fold_train)
        fold_class_weight_dict = {cls: fold_class_weights[i] for i, cls in enumerate(fold_classes)}

        # Train Logistic Regression model on this fold
        model = LogisticRegression(
            C=C, penalty=penalty, solver=solver, class_weight=fold_class_weight_dict, random_state=42
        )
        model.fit(X_fold_train, y_fold_train)

        # Evaluate on validation fold
        y_val_pred = model.predict(X_fold_val)
        fold_accuracy = accuracy_score(y_fold_val, y_val_pred)
        scores.append(fold_accuracy)

    return np.mean(scores)  # Return mean accuracy across folds

# Step 6: Run Optuna with early stopping
def run_optuna_experiment():
    study = optuna.create_study(direction="maximize")
    study.optimize(objective_logreg, n_trials=50)  # 50 trials

    # Get the best parameters
    best_params = study.best_params
    best_model = LogisticRegression(
        C=best_params['C'], penalty=best_params['penalty'], 
        solver='liblinear' if best_params['penalty'] == 'l1' else 'lbfgs',
        class_weight="balanced",  # Now safe to compute globally on `y_train`
        random_state=42
    )

    # Train final model on full training data
    best_model.fit(X_train_vec, y_train)

    # Evaluate on test data
    y_pred = best_model.predict(X_test_vec)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"Best Parameters: {best_params}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

# Run the experiment
run_optuna_experiment()


[I 2025-02-11 00:13:10,095] A new study created in memory with name: no-name-9e0ca7c6-b117-42b6-9856-9a7946591b7b
[I 2025-02-11 00:13:11,167] Trial 0 finished with value: 0.8785842264684227 and parameters: {'C': 2.1613827154073753, 'penalty': 'l1'}. Best is trial 0 with value: 0.8785842264684227.
[I 2025-02-11 00:13:11,402] Trial 1 finished with value: 0.6807936381982491 and parameters: {'C': 0.00121267920942315, 'penalty': 'l2'}. Best is trial 0 with value: 0.8785842264684227.
[I 2025-02-11 00:13:11,843] Trial 2 finished with value: 0.6985917372033383 and parameters: {'C': 0.011992363614706344, 'penalty': 'l2'}. Best is trial 0 with value: 0.8785842264684227.
[I 2025-02-11 00:13:12,036] Trial 3 finished with value: 0.6779977554994188 and parameters: {'C': 0.00010415126926001264, 'penalty': 'l2'}. Best is trial 0 with value: 0.8785842264684227.
[I 2025-02-11 00:13:13,333] Trial 4 finished with value: 0.878481936304666 and parameters: {'C': 1.9786901345650996, 'penalty': 'l1'}. Best is 

Best Parameters: {'C': 2.873718841872781, 'penalty': 'l1'}
Test Accuracy: 0.8800
