In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [2]:
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    
    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
# Load the dataset
df = pd.read_csv('Features_For_Traditional_ML_Techniques.csv')
statements = df['statement']

# Drop non-feature columns (adjust this list based on your dataset's column names)
X = df.drop(columns=['BinaryNumTarget', 'majority_target', 'statement', 'tweet'])

# Check and handle non-numeric features by one-hot encoding categorical features
X = pd.get_dummies(X, drop_first=True)

# Check for non-numeric values in the target variables
y_binary = df['BinaryNumTarget']
y_multiclass = df['majority_target']

# Convert target variables into numeric format using LabelEncoder if they are not already
label_encoder = LabelEncoder()
y_binary = label_encoder.fit_transform(y_binary)
y_multiclass = label_encoder.fit_transform(y_multiclass)

In [3]:
# For binary classification
# Split the data (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42, stratify=y_binary)

# Remove constant features (features with zero variance)
constant_filter = VarianceThreshold(threshold=0)
X_train_filtered = constant_filter.fit_transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

# Apply SelectKBest on the filtered data
selector = SelectKBest(f_classif, k=20)
X_train_selected = selector.fit_transform(X_train_filtered, y_train)
X_test_selected = selector.transform(X_test_filtered)

# Update the parameter grid with more regularization for RandomForestClassifier
param_dist = {
    'n_estimators': randint(100, 300),
    'max_depth': [10, 20, None],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(2, 10),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
rf_binary = RandomForestClassifier(random_state=42, class_weight='balanced')

# Initialize RandomizedSearchCV with updated parameters
random_search = RandomizedSearchCV(
    estimator=rf_binary,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
    random_state=42
)

# Fit the model to the selected training data
random_search.fit(X_train_selected, y_train)

# Print the best hyperparameters found
print(f"Best Hyperparameters for Binary Classification: {random_search.best_params_}")

# Get the best model
best_rf_binary = random_search.best_estimator_

# Train and evaluate the model
print("Training and Evaluating on 2-Class Target (BinaryNumTarget):")
train_and_evaluate(X_train_selected, X_test_selected, y_train, y_test, best_rf_binary)

# For Multi-class classification
# Split the data (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y_multiclass, test_size=0.2, random_state=42, stratify=y_multiclass)

# Remove constant features
X_train_filtered = constant_filter.fit_transform(X_train)
X_test_filtered = constant_filter.transform(X_test)

# Apply SelectKBest on the filtered data
X_train_selected = selector.fit_transform(X_train_filtered, y_train)
X_test_selected = selector.transform(X_test_filtered)

# Initialize RandomForestClassifier
rf_multiclass = RandomForestClassifier(random_state=42, class_weight='balanced')

# Initialize RandomizedSearchCV with updated parameters
random_search_multiclass = RandomizedSearchCV(
    estimator=rf_multiclass,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
    random_state=42
)

# Fit the model to the selected training data
random_search_multiclass.fit(X_train_selected, y_train)

# Print the best hyperparameters found
print(f"Best Hyperparameters for Multiclass Classification: {random_search_multiclass.best_params_}")

# Get the best model
best_rf_multiclass = random_search_multiclass.best_estimator_

# Train and evaluate the model
print("Training and Evaluating on Multiclass Target (majority_target):")
train_and_evaluate(X_train_selected, X_test_selected, y_train, y_test, best_rf_multiclass)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters for Binary Classification: {'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 220}
Training and Evaluating on 2-Class Target (BinaryNumTarget):
Accuracy: 0.9948

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     13054
           1       0.99      1.00      0.99     13786

    accuracy                           0.99     26840
   macro avg       0.99      0.99      0.99     26840
weighted avg       0.99      0.99      0.99     26840

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters for Multiclass Classification: {'bootstrap': False, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Training and Evaluating on Multiclass Target (majority_target):
Accuracy: 0.9532

Cl

In [12]:
# Predicts for all the training and test data for binary and multiclass
y_train_pred_binary = best_rf_binary.predict(X_train_selected)
y_test_pred_binary = best_rf_binary.predict(X_test_selected)

y_train_pred_multiclass = best_rf_multiclass.predict(X_train_selected)
y_test_pred_multiclass = best_rf_multiclass.predict(X_test_selected)

# Adds predicted labels to the original dataset for train and test for binary and multiclass
df_train_binary = X_train.copy()  # Copy the training data for binary
df_train_binary['predicted'] = y_train_pred_binary  # Add the binary predictions
df_train_binary['truth_value'] = y_train  # Add the actual truth values

df_test_binary = X_test.copy()  # Copy the test data for binary
df_test_binary['predicted'] = y_test_pred_binary  # Add the binary predictions
df_test_binary['truth_value'] = y_test  # Add the actual truth values

df_train_multiclass = X_train.copy()  # Copy the training data for multiclass
df_train_multiclass['predicted'] = y_train_pred_multiclass  # Add the multiclass predictions
df_train_multiclass['truth_value'] = y_train  # Add the actual truth values

df_test_multiclass = X_test.copy()  # Copy the test data for multiclass
df_test_multiclass['predicted'] = y_test_pred_multiclass  # Add the multiclass predictions
df_test_multiclass['truth_value'] = y_test  # Add the actual truth values

# Group by statement and calculate the majority of predictions (majority voting) for binary and multiclass
# Binary classification (majority voting for predicted values)
statement_preds_train_binary = df_train_binary.groupby(statements)['predicted'].agg(lambda x: 1 if x.sum() / len(x) > 0.5 else 0).reset_index()
statement_preds_test_binary = df_test_binary.groupby(statements)['predicted'].agg(lambda x: 1 if x.sum() / len(x) > 0.5 else 0).reset_index()

# Adds the truth_value column to the grouped dataframe for binary
statement_preds_train_binary['truth_value'] = df_train_binary.groupby(statements)['truth_value'].first().values
statement_preds_test_binary['truth_value'] = df_test_binary.groupby(statements)['truth_value'].first().values

# Multiclass classification (majority voting for predicted values)
statement_preds_train_multiclass = df_train_multiclass.groupby(statements)['predicted'].agg(lambda x: x.mode()[0]).reset_index()
statement_preds_test_multiclass = df_test_multiclass.groupby(statements)['predicted'].agg(lambda x: x.mode()[0]).reset_index()

# Adds the truth_value column to the grouped dataframe for multiclass
statement_preds_train_multiclass['truth_value'] = df_train_multiclass.groupby(statements)['truth_value'].first().values
statement_preds_test_multiclass['truth_value'] = df_test_multiclass.groupby(statements)['truth_value'].first().values



In [13]:
def display_totals(df, class_type="Binary"):
    truth_value_totals = df['truth_value'].apply(lambda x: 1 if x == 1 else 0).value_counts()
    predicted_totals = df['predicted'].apply(lambda x: 1 if x == 1 else 0).value_counts()
    
    print(f"Total True/False for {class_type} Classification - Actual (Truth Values):")
    print(f"True: {truth_value_totals.get(1, 0)}, False: {truth_value_totals.get(0, 0)}")
    
    print(f"Total True/False for {class_type} Classification - Predicted Values:")
    print(f"True: {predicted_totals.get(1, 0)}, False: {predicted_totals.get(0, 0)}")
    
    print("-" * 50)

# Display the totals for binary classification (training and test data)
print("Binary Classification - Training Data Totals:")
display_totals(df_train_binary, class_type="Binary")

print("Binary Classification - Test Data Totals:")
display_totals(df_test_binary, class_type="Binary")

# Display the totals for multiclass classification (training and test data)
print("Multiclass Classification - Training Data Totals:")
display_totals(df_train_multiclass, class_type="Multiclass")

print("Multiclass Classification - Test Data Totals:")
display_totals(df_test_multiclass, class_type="Multiclass")

Binary Classification - Training Data Totals:
Total True/False for Binary Classification - Actual (Truth Values):
True: 55188, False: 52170
Total True/False for Binary Classification - Predicted Values:
True: 55238, False: 52120
--------------------------------------------------
Binary Classification - Test Data Totals:
Total True/False for Binary Classification - Actual (Truth Values):
True: 13797, False: 13043
Total True/False for Binary Classification - Predicted Values:
True: 13836, False: 13004
--------------------------------------------------
Multiclass Classification - Training Data Totals:
Total True/False for Multiclass Classification - Actual (Truth Values):
True: 55188, False: 52170
Total True/False for Multiclass Classification - Predicted Values:
True: 55021, False: 52337
--------------------------------------------------
Multiclass Classification - Test Data Totals:
Total True/False for Multiclass Classification - Actual (Truth Values):
True: 13797, False: 13043
Total Tr