In [4]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import numpy as np

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/Project/train.csv')

# Create a mapping from metaphorID to the actual metaphor words
metaphor_mapping = {
    0: 'road', 1: 'candle', 2: 'light', 3: 'spice', 4: 'ride', 5: 'train', 6: 'boat'
}
df['metaphor_word'] = df['metaphorID'].map(metaphor_mapping)

# Convert the 'label_boolean' column to integers (True to 1, False to 0)
df['label_boolean'] = df['label_boolean'].astype(int)

# Split the dataset
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df['text'], df['label_boolean'], test_size=0.2, random_state=42, stratify=df['label_boolean'])

# Tokenize the text
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train_raw)
X_train_seq = tokenizer.texts_to_sequences(X_train_raw)
X_test_seq = tokenizer.texts_to_sequences(X_test_raw)

# Pad sequences
max_len = max([len(x) for x in X_train_seq]) # You can also choose a fixed max length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Oversample using SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_pad, y_train)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=max_len))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_resampled, y_train_resampled, epochs=5, batch_size=64)

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Loss: {loss}")
print(f"Accuracy: {accuracy}")



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.593923807144165
Accuracy: 0.7459893226623535


In [5]:
from sklearn.metrics import classification_report
# Predict the labels for the test set
y_pred = model.predict(X_test_pad)

# Convert probabilities to binary format
y_pred_binary = (y_pred > 0.5).astype('int32')

# Generate a classification report
report = classification_report(y_test, y_pred_binary, target_names=['Class 0', 'Class 1'])
print(report)


              precision    recall  f1-score   support

     Class 0       0.46      0.48      0.47        88
     Class 1       0.84      0.83      0.83       286

    accuracy                           0.75       374
   macro avg       0.65      0.65      0.65       374
weighted avg       0.75      0.75      0.75       374



In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
