In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from imblearn.over_sampling import SMOTE


# Load the dataset
df = pd.read_csv('train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road', 1: 'candle', 2: 'light', 3: 'spice', 4: 'ride', 5: 'train', 6: 'boat'
}
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Feature extraction using TF-IDF for the text (fit on the full dataset)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['text']).toarray()

# One-hot encode the 'metaphor_word' column
onehot_encoder = OneHotEncoder(sparse=False)
X_metaphor_word = onehot_encoder.fit_transform(df[['metaphor_word']])

# Combine TF-IDF features with one-hot encoded metaphor_word features
X = np.hstack((X_tfidf, X_metaphor_word))
y = df['label_boolean'].values

# Applying SMOTE for balancing the dataset
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

# Define models with best parameters
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')),
    ("AdaBoost", AdaBoostClassifier(learning_rate=0.1, n_estimators=200, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(learning_rate=0.1, max_depth=10, n_estimators=200, random_state=42)),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(learning_rate=0.01, max_depth=6, n_estimators=300, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# Perform cross-validation
cv = StratifiedKFold(n_splits=5)
for name, model in models:
    print(f"Model: {name}")
    accuracy_scores = []
    metrics = {'precision_0': [], 'recall_0': [], 'f1_0': [],
               'precision_1': [], 'recall_1': [], 'f1_1': []}

    for train_idx, test_idx in cv.split(X_resampled, y_resampled):
        X_train_cv, X_test_cv = X_resampled[train_idx], X_resampled[test_idx]
        y_train_cv, y_test_cv = y_resampled[train_idx], y_resampled[test_idx]

        model.fit(X_train_cv, y_train_cv)
        y_pred = model.predict(X_test_cv)

        # Metrics calculation for each class
        accuracy_scores.append(accuracy_score(y_test_cv, y_pred))
        precision, recall, f1, _ = precision_recall_fscore_support(y_test_cv, y_pred)
        
        # Storing metrics for each class
        metrics['precision_0'].append(precision[0])
        metrics['recall_0'].append(recall[0])
        metrics['f1_0'].append(f1[0])
        metrics['precision_1'].append(precision[1])
        metrics['recall_1'].append(recall[1])
        metrics['f1_1'].append(f1[1])

    # Calculate and print the mean of each metric for both classes
    mean_accuracy = np.mean(accuracy_scores)
    print(f"Mean Accuracy: {mean_accuracy}")
    for metric, values in metrics.items():
        print(f"Mean {metric}: {np.mean(values)}")
    print("\n")



Model: Random Forest
Mean Accuracy: 0.8914411940589952
Mean precision_0: 0.9418459092174694
Mean recall_0: 0.8293755025462343
Mean f1_0: 0.8651047722106664
Mean precision_1: 0.8794087043764863
Mean recall_1: 0.9532126410175191
Mean f1_1: 0.9073554411066436


Model: AdaBoost
Mean Accuracy: 0.8809559550397246
Mean precision_0: 0.918635363685517
Mean recall_0: 0.8336176019102848
Mean f1_0: 0.8636564538934218
Mean precision_1: 0.869059338828514
Mean recall_1: 0.9280597451329158
Mean f1_1: 0.8922683562459988


Model: Gradient Boosting
Mean Accuracy: 0.8952727028643259
Mean precision_0: 0.9238828955820029
Mean recall_0: 0.8615457713018688
Mean f1_0: 0.8797715806423685
Mean precision_1: 0.8952258525621115
Mean recall_1: 0.9287541726566116
Mean f1_1: 0.905241107284614


Model: Extra Trees
Mean Accuracy: 0.9190117038284578
Mean precision_0: 0.9474680050959247
Mean recall_0: 0.8881118881118881
Mean f1_0: 0.9091897008400129
Mean precision_1: 0.9123817651660622
Mean recall_1: 0.9496990814063985
Me