In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Importing models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier


# Load the dataset
df = pd.read_csv('train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road',
    1: 'candle',
    2: 'light',
    3: 'spice',
    4: 'ride',
    5: 'train',
    6: 'boat'
}

# Apply the mapping to the 'metaphorID' column to create a new 'metaphor_word' column
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Feature extraction using TF-IDF for the text
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['text']).toarray()

# One-hot encode the 'metaphor_word' column
onehot_encoder = OneHotEncoder(sparse=False)
X_metaphor_word = onehot_encoder.fit_transform(df[['metaphor_word']])

# Combine TF-IDF features with one-hot encoded metaphor_word features
X = np.hstack((X_tfidf, X_metaphor_word))
y = df['label_boolean'].values

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to train
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")

# Print results
print(models)



Model: Random Forest
              precision    recall  f1-score   support

           0       0.88      0.23      0.37        99
           1       0.78      0.99      0.87       275

    accuracy                           0.79       374
   macro avg       0.83      0.61      0.62       374
weighted avg       0.81      0.79      0.74       374

Accuracy: 0.7887700534759359

Model: AdaBoost
              precision    recall  f1-score   support

           0       0.72      0.55      0.62        99
           1       0.85      0.92      0.89       275

    accuracy                           0.82       374
   macro avg       0.78      0.73      0.75       374
weighted avg       0.82      0.82      0.82       374

Accuracy: 0.8235294117647058

Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.88      0.36      0.51        99
           1       0.81      0.98      0.89       275

    accuracy                           0.82       374
   mac