In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
import numpy as np

# Load the dataset
df = pd.read_csv('train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road',
    1: 'candle',
    2: 'light',
    3: 'spice',
    4: 'ride',
    5: 'train',
    6: 'boat'
}

# Apply the mapping to the 'metaphorID' column to create a new 'metaphor_word' column
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Feature extraction using TF-IDF for the text
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['text']).toarray()

# One-hot encode the 'metaphor_word' column
onehot_encoder = OneHotEncoder(sparse=False)
X_metaphor_word = onehot_encoder.fit_transform(df[['metaphor_word']])

# Combine TF-IDF features with one-hot encoded metaphor_word features
X = np.hstack((X_tfidf, X_metaphor_word))
y = df['label_boolean'].values

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grids for each model
param_grid = {
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20]},
    "AdaBoost": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 1]},
    "Gradient Boosting": {"n_estimators": [100, 200, 300], "learning_rate": [0.01, 0.1, 1], "max_depth": [3, 5, 10]},
    "Extra Trees": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20]},
    "XGBoost": {"n_estimators": [100, 200, 300], "learning_rate": [0.01, 0.1, 1], "max_depth": [3, 6, 10]}
}

# Models
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Extra Trees": ExtraTreesClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# Hyperparameter tuning
best_params = {}
for name in models:
    grid_search = GridSearchCV(models[name], param_grid[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params[name] = grid_search.best_params_
    print(f"Best parameters for {name}: {best_params[name]}")




Best parameters for Random Forest: {'max_depth': None, 'n_estimators': 300}
Best parameters for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 200}
Best parameters for Gradient Boosting: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}
Best parameters for Extra Trees: {'max_depth': None, 'n_estimators': 100}




Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300}
