In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Import classifiers
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv('train.csv')

# Mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road',
    1: 'candle',
    2: 'light',
    3: 'spice',
    4: 'ride',
    5: 'train',
    6: 'boat'
}

# Preprocessing functions
def preprocess_with_tfidf(text_series, max_features=1000):
    tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
    return tfidf_vectorizer.fit_transform(text_series).toarray()

# One-hot encode the 'metaphor_word' column
def one_hot_encode(series):
    onehot_encoder = OneHotEncoder(sparse=False)
    return onehot_encoder.fit_transform(series.values.reshape(-1, 1))

# Combine features
def combine_features(*args):
    return np.hstack(args)

# Model training and evaluation function
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    results = {}
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        results[name] = (report, accuracy)
        print(f"Model: {name}")
        print(report)
        print(f"Accuracy: {accuracy}\n")
    return results

# Apply the mapping to create a new 'metaphor_word' column
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers
df['label_boolean'] = df['label_boolean'].astype(int)

# Feature extraction
X_tfidf = preprocess_with_tfidf(df['text'])
X_metaphor_word = one_hot_encode(df['metaphor_word'])

# Combine TF-IDF features with one-hot encoded metaphor_word features
X = combine_features(X_tfidf, X_metaphor_word)
y = df['label_boolean'].values

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to train
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100, random_state=42)),
    # Add other models as needed
]

# Train and evaluate each model
results = train_and_evaluate(models, X_train, y_train, X_test, y_test)


Model: Random Forest
              precision    recall  f1-score   support

           0       0.88      0.23      0.37        99
           1       0.78      0.99      0.87       275

    accuracy                           0.79       374
   macro avg       0.83      0.61      0.62       374
weighted avg       0.81      0.79      0.74       374

Accuracy: 0.7887700534759359

Model: AdaBoost
              precision    recall  f1-score   support

           0       0.72      0.55      0.62        99
           1       0.85      0.92      0.89       275

    accuracy                           0.82       374
   macro avg       0.78      0.73      0.75       374
weighted avg       0.82      0.82      0.82       374

Accuracy: 0.8235294117647058

