In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road', 1: 'candle', 2: 'light', 3: 'spice', 4: 'ride', 5: 'train', 6: 'boat'
}
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Separate the dataset by classes
df_true = df[df['label_boolean'] == 1]
df_false = df[df['label_boolean'] == 0]

# Downsample the 'True' class and Upsample the 'False' class
df_true_downsampled = df_true.sample(n=800, random_state=42)
df_false_upsampled = df_false.sample(n=400, replace=True, random_state=42)

# Combine the resampled datasets
df_resampled = pd.concat([df_true_downsampled, df_false_upsampled])

# Splitting the dataset into training and testing sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df_resampled[['text', 'metaphor_word']], df_resampled['label_boolean'], test_size=0.2, random_state=42, stratify=df_resampled['label_boolean'])

# Feature extraction using TF-IDF for the text (fit on training data only)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw['text']).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw['text']).toarray()

# One-hot encode the 'metaphor_word' column
onehot_encoder = OneHotEncoder(sparse=False)
X_train_metaphor = onehot_encoder.fit_transform(X_train_raw[['metaphor_word']])
X_test_metaphor = onehot_encoder.transform(X_test_raw[['metaphor_word']])

# Combine TF-IDF features with one-hot encoded metaphor_word features
X_train = np.hstack((X_train_tfidf, X_train_metaphor))
X_test = np.hstack((X_test_tfidf, X_test_metaphor))

# Define models with best parameters
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')),
    ("AdaBoost", AdaBoostClassifier(learning_rate=0.1, n_estimators=200, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(learning_rate=0.1, max_depth=10, n_estimators=200, random_state=42)),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(learning_rate=0.01, max_depth=6, n_estimators=300, random_state=42, use_label_encoder=False, eval_metric='logloss'))
]

# Train and evaluate each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Model: {name}")
    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}\n")




Model: Random Forest
              precision    recall  f1-score   support

           0       0.93      0.51      0.66        80
           1       0.80      0.98      0.88       160

    accuracy                           0.82       240
   macro avg       0.87      0.75      0.77       240
weighted avg       0.84      0.82      0.81       240

Accuracy: 0.825

Model: AdaBoost
              precision    recall  f1-score   support

           0       0.78      0.56      0.65        80
           1       0.81      0.92      0.86       160

    accuracy                           0.80       240
   macro avg       0.79      0.74      0.76       240
weighted avg       0.80      0.80      0.79       240

Accuracy: 0.8

Model: Gradient Boosting
              precision    recall  f1-score   support

           0       0.84      0.65      0.73        80
           1       0.84      0.94      0.89       160

    accuracy                           0.84       240
   macro avg       0.84      0.79 

  if is_sparse(data):


Model: XGBoost
              precision    recall  f1-score   support

           0       0.82      0.53      0.64        80
           1       0.80      0.94      0.87       160

    accuracy                           0.80       240
   macro avg       0.81      0.73      0.75       240
weighted avg       0.81      0.80      0.79       240

Accuracy: 0.8041666666666667



In [2]:
df_resampled

Unnamed: 0,metaphorID,label_boolean,text,metaphor_word
155,0,1,Hi - it 's like a century since I was going th...,road
1559,2,1,"hey kayla , i 'm new , my name is cherie , alt...",light
599,0,1,"Redheadace , i can identify with you too . I w...",road
456,0,1,I am early stage and on hormonals . I was prem...,road
1383,0,1,I 'm having neoadjuvany chemo & surgery in ear...,road
...,...,...,...,...
688,0,0,Well we have been to our second opinion consul...,road
221,2,0,I have just come back from Paris -LRB- I have ...,light
1572,4,0,"Wow , I am really happy to read how well every...",ride
12,2,0,And this is why I post here ... because it is ...,light
