In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Load the dataset
df = pd.read_csv('train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road',
    1: 'candle',
    2: 'light',
    3: 'spice',
    4: 'ride',
    5: 'train',
    6: 'boat'
}

# Apply the mapping to the 'metaphorID' column to create a new 'metaphor_word' column
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Tokenize the text
df['tokenized'] = df['text'].apply(word_tokenize)

# Train a Word2Vec model or load a pre-trained one
# For this example, we'll train a new model on the dataset
word2vec_model = Word2Vec(sentences=df['tokenized'], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.train(df['tokenized'], total_examples=len(df['tokenized']), epochs=10)

# Create a function to vectorize a sentence based on the word vectors
def vectorize_sentence(sentence, model):
    vector = np.zeros(model.vector_size)
    num_words = 0
    for word in sentence:
        if word in model.wv:
            vector += model.wv[word]
            num_words += 1
    if num_words > 0:
        vector /= num_words
    return vector

# Vectorize each sentence in the dataset
X_word2vec = np.array([vectorize_sentence(sent, word2vec_model) for sent in df['tokenized']])

# One-hot encode the 'metaphor_word' column
onehot_encoder = OneHotEncoder(sparse=False)
X_metaphor_word = onehot_encoder.fit_transform(df[['metaphor_word']])

# Combine Word2Vec features with one-hot encoded metaphor_word features
X = np.hstack((X_word2vec, X_metaphor_word))
y = df['label_boolean'].values

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model selection and training
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)

# Predicting the test set results
y_pred = classifier.predict(X_test)

# Evaluation of the results
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ayushlodha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


              precision    recall  f1-score   support

           0       0.75      0.27      0.40        99
           1       0.79      0.97      0.87       275

    accuracy                           0.78       374
   macro avg       0.77      0.62      0.63       374
weighted avg       0.78      0.78      0.74       374

Accuracy: 0.7834224598930482
