In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Function to train and evaluate the model
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Args:
        X_train: Training features, handed to ``model.fit``.
        X_test: Held-out features, handed to ``model.predict``.
        y_train: Training labels, handed to ``model.fit``.
        y_test: Held-out labels that the predictions are scored against.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. The return
            value of ``fit`` is ignored; the same object is used to predict.

    Returns:
        None. The accuracy (formatted to four decimals) and the
        classification report are written to stdout.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Headline metric first, then the detailed report under its own header.
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))

# Experiment settings shared by both tasks, so the two splits and both models
# are guaranteed to use the same test fraction and seed.
DATA_PATH = 'Features_For_Traditional_ML_Techniques.csv'
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Build the feature matrix: remove the two label columns (used as targets
# below) together with 'statement' and 'tweet' (presumably free-text fields —
# confirm against the CSV schema).
# NOTE(review): every remaining column becomes a model input. If the CSV
# carries an index- or identifier-like column, it is included too — confirm.
X = df.drop(columns=['BinaryNumTarget', 'majority_target', 'statement', 'tweet'])

# One-hot encode whatever non-numeric columns remain; drop_first=True drops
# the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# Encode each target with its OWN LabelEncoder. Re-fitting a single encoder
# for the second target would overwrite the first mapping, so predictions for
# the first task could no longer be mapped back to their original labels.
binary_encoder = LabelEncoder()
y_binary = binary_encoder.fit_transform(df['BinaryNumTarget'])

multiclass_encoder = LabelEncoder()
y_multiclass = multiclass_encoder.fit_transform(df['majority_target'])

# Backward-compatible alias: this cell previously left `label_encoder` fitted
# on 'majority_target'; keep that binding for any later cell that uses it.
label_encoder = multiclass_encoder

# --- Binary classification -------------------------------------------------
# Stratified 80/20 split so both partitions keep the class balance.
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X,
    y_binary,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_binary,
)

# Random forest with default hyperparameters (only the seed is fixed).
rf_binary_default = RandomForestClassifier(random_state=RANDOM_STATE)

print("Binary Classification with Default Hyperparameters:")
train_and_evaluate(X_train_bin, X_test_bin, y_train_bin, y_test_bin, rf_binary_default)

# --- Multiclass classification ---------------------------------------------
# NOTE(review): the recorded output for this task lists only classes 0 and 1,
# so 'majority_target' appears to be two-valued as well — confirm that a
# multiclass setup is really what is intended here.
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X,
    y_multiclass,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_multiclass,
)

# Random forest with default hyperparameters (only the seed is fixed).
rf_multiclass_default = RandomForestClassifier(random_state=RANDOM_STATE)

print("\nMulticlass Classification with Default Hyperparameters:")
train_and_evaluate(X_train_multi, X_test_multi, y_train_multi, y_test_multi, rf_multiclass_default)

# Keep the generic split names bound to the multiclass split, exactly as this
# cell left them before, so later cells relying on them see the same data.
X_train, X_test, y_train, y_test = (
    X_train_multi,
    X_test_multi,
    y_train_multi,
    y_test_multi,
)



Binary Classification with Default Hyperparameters:
Accuracy: 0.9575

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     13054
           1       0.95      0.96      0.96     13786

    accuracy                           0.96     26840
   macro avg       0.96      0.96      0.96     26840
weighted avg       0.96      0.96      0.96     26840


Multiclass Classification with Default Hyperparameters:
Accuracy: 0.9205

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92     13043
           1       0.92      0.93      0.92     13797

    accuracy                           0.92     26840
   macro avg       0.92      0.92      0.92     26840
weighted avg       0.92      0.92      0.92     26840

