In [10]:
'''Subtask 1'''
import pandas as pd
import numpy as np
import os


df = pd.read_json('./SubtaskA/datasets/subtaskA_train_monolingual.jsonl', lines=True)
# Just interested so far in text and label
df = df[['text', 'label']]

'So testing doesnt takes too much time processing, 10k seems ok'
df=df.sample(10000)

print('\nExample of dataframe (text|label)\n')
print(df.sample(5))
print(f'\nSize{df.shape}')
print('\nValue count\n')
print(df['label'].value_counts())


Example of dataframe (text|label)

                                                     text  label
58515    When you fall asleep, your body temperature n...      0
114540    A key challenge to deploying reinforcement l...      0
43617   \n\nThis paper addresses the question of how h...      1
29586   Young Dan’l Boone is a television series that ...      1
74753   Conal Holmes O'Connell O'Riordan (pseudonym No...      0

Size(10000, 2)

Value count

label
0    5204
1    4796
Name: count, dtype: int64


In [11]:
'''Quick model to test dataset with glove and RF/Dense/RNN'''

#Glove Embedding, we will try first with just 200 dimensions
"""Load the Glove vectors in a dictionay"""
# Download from here https://nlp.stanford.edu/projects/glove/
embeddings_index={}
with open(os.getcwd()+'/OtherData/glove.6B.200d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embeddings_index[word]=vectors
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [12]:
from imblearn.under_sampling import RandomUnderSampler

'''Balance the dataset so its easier to run'''

print(f'Dataset size before balancing: {df.shape}')
counts = df['label'].value_counts()

sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(df[['text']], df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
balanced_df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(balanced_df['label'].value_counts())

Dataset size before balancing: (10000, 2)
Dataset size after balancing: (9592, 1)
Entried dropped: 408

Balanced DataFrame:
label
0    4796
1    4796
Name: count, dtype: int64


In [13]:
'''Creating embedings'''

from nltk.tokenize import word_tokenize
from tqdm import tqdm 

def sent2vec(s):
    """ Function Creates a normalized vector for the whole sentence"""
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(200)
    return v / np.sqrt((v ** 2).sum())

x_glove = np.array([sent2vec(x) for x in tqdm(x_text['text'])])
x_glove.shape


100%|██████████| 9592/9592 [00:14<00:00, 663.67it/s]


(9592, 200)

In [15]:
'''Preparing for training'''

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_glove, y, test_size=0.2, random_state=42)
# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform the data
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)  # Note that we use the same scaler to transform the test data
# If this scaler is going to be used later on for prediction it must be saved, for example with pickle


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, recall_score, precision_score, auc, f1_score
from transformers import Trainer, TrainingArguments

# Sample Data, replace these with actual numpy arrays of your data
import numpy as np

class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return {
            'x': self.features[idx],  # changed 'features' to 'x'
            'label': self.labels[idx]  # changed 'labels' to 'label'
        }

# Verify the shape and type of the data
print(x_train_scaled.shape, y_train.shape, x_test_scaled.shape, y_test.shape)

# Convert numpy arrays to PyTorch tensors
x_train_scaled = torch.tensor(x_train_scaled, dtype=torch.float32)
x_test_scaled = torch.tensor(x_test_scaled, dtype=torch.float32)

y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32) if isinstance(y_train, pd.Series) else torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32) if isinstance(y_test, pd.Series) else torch.tensor(y_test, dtype=torch.float32)


# Create dataset and dataloaders
train_dataset = MyDataset(x_train_scaled, y_train)
test_dataset = MyDataset(x_test_scaled, y_test)

# Add prints to check the length of datasets
print(len(train_dataset), len(test_dataset))

import torch.nn.functional as F

# PyTorch model
class MyModel(nn.Module):
    def __init__(self, input_dim):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 200)
        self.dropout1 = nn.Dropout(0.2)
        self.batchnorm1 = nn.BatchNorm1d(200)
        self.fc2 = nn.Linear(200, 100)
        self.dropout2 = nn.Dropout(0.2)
        self.batchnorm2 = nn.BatchNorm1d(100)
        self.fc3 = nn.Linear(100, 100)
        self.dropout3 = nn.Dropout(0.2)
        self.batchnorm3 = nn.BatchNorm1d(100)
        self.fc4 = nn.Linear(100, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, labels=None):
        x = F.relu(self.batchnorm1(self.dropout1(self.fc1(x))))
        x = F.relu(self.batchnorm2(self.dropout2(self.fc2(x))))
        x = F.relu(self.batchnorm3(self.dropout3(self.fc3(x))))
        logits = self.fc4(x)
        outputs = self.sigmoid(logits)

        if labels is not None:
            loss = F.binary_cross_entropy(outputs, labels.unsqueeze(-1))
            return loss, outputs
        return outputs

# Instantiate the model
model = MyModel(input_dim=x_train_scaled.shape[1])

# Create dataset and dataloaders
train_dataset = MyDataset(x_train_scaled, y_train)
test_dataset = MyDataset(x_test_scaled, y_test)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1000,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

torch.Size([7673, 200]) torch.Size([7673]) torch.Size([1919, 200]) torch.Size([1919])
7673 1919


  x_train_scaled = torch.tensor(x_train_scaled, dtype=torch.float32)
  x_test_scaled = torch.tensor(x_test_scaled, dtype=torch.float32)
  y_train = torch.tensor(y_train.to_numpy(), dtype=torch.float32) if isinstance(y_train, pd.Series) else torch.tensor(y_train, dtype=torch.float32)
  y_test = torch.tensor(y_test.to_numpy(), dtype=torch.float32) if isinstance(y_test, pd.Series) else torch.tensor(y_test, dtype=torch.float32)


In [26]:
trainer.train()

                                          
  0%|          | 0/120000 [00:43<?, ?it/s]           


{'loss': 0.6201, 'learning_rate': 4.979166666666667e-05, 'epoch': 4.17}


                                          376.55it/s][A
  0%|          | 0/120000 [00:44<?, ?it/s]           

{'loss': 0.5279, 'learning_rate': 4.958333333333334e-05, 'epoch': 8.33}


                                          
  0%|          | 0/120000 [00:45<?, ?it/s]            

{'loss': 0.4745, 'learning_rate': 4.937500000000001e-05, 'epoch': 12.5}


                                          
  0%|          | 0/120000 [00:47<?, ?it/s]            

{'loss': 0.4459, 'learning_rate': 4.9166666666666665e-05, 'epoch': 16.67}


                                          
  0%|          | 0/120000 [00:48<?, ?it/s]            

{'loss': 0.4264, 'learning_rate': 4.8958333333333335e-05, 'epoch': 20.83}


                                          
  0%|          | 0/120000 [00:50<?, ?it/s]            

{'loss': 0.4038, 'learning_rate': 4.875e-05, 'epoch': 25.0}


                                          
  0%|          | 0/120000 [00:51<?, ?it/s]            

{'loss': 0.3885, 'learning_rate': 4.854166666666667e-05, 'epoch': 29.17}


                                          
  0%|          | 0/120000 [00:52<?, ?it/s]            

{'loss': 0.376, 'learning_rate': 4.8333333333333334e-05, 'epoch': 33.33}


                                          
  0%|          | 0/120000 [00:54<?, ?it/s]            

{'loss': 0.3606, 'learning_rate': 4.8125000000000004e-05, 'epoch': 37.5}


                                          
  0%|          | 0/120000 [00:55<?, ?it/s]            

{'loss': 0.3512, 'learning_rate': 4.791666666666667e-05, 'epoch': 41.67}


                                          
  0%|          | 0/120000 [00:57<?, ?it/s]            


{'loss': 0.3341, 'learning_rate': 4.770833333333334e-05, 'epoch': 45.83}


                                           357.32it/s][A
  0%|          | 0/120000 [00:58<?, ?it/s]            

{'loss': 0.327, 'learning_rate': 4.75e-05, 'epoch': 50.0}


                                          
  0%|          | 0/120000 [00:59<?, ?it/s]            

{'loss': 0.3162, 'learning_rate': 4.7291666666666666e-05, 'epoch': 54.17}


                                          
  0%|          | 0/120000 [01:01<?, ?it/s]            

{'loss': 0.3054, 'learning_rate': 4.708333333333334e-05, 'epoch': 58.33}


                                          
  0%|          | 0/120000 [01:02<?, ?it/s]            

{'loss': 0.2991, 'learning_rate': 4.6875e-05, 'epoch': 62.5}


                                          
  0%|          | 0/120000 [01:03<?, ?it/s]            

{'loss': 0.2899, 'learning_rate': 4.666666666666667e-05, 'epoch': 66.67}


                                          
  0%|          | 0/120000 [01:05<?, ?it/s]            

{'loss': 0.281, 'learning_rate': 4.6458333333333335e-05, 'epoch': 70.83}


                                          
  0%|          | 0/120000 [01:06<?, ?it/s]            

{'loss': 0.271, 'learning_rate': 4.6250000000000006e-05, 'epoch': 75.0}


                                          
  0%|          | 0/120000 [01:07<?, ?it/s]            

{'loss': 0.2636, 'learning_rate': 4.604166666666666e-05, 'epoch': 79.17}


                                          
  0%|          | 0/120000 [01:09<?, ?it/s]            

{'loss': 0.2588, 'learning_rate': 4.5833333333333334e-05, 'epoch': 83.33}


                                          
  0%|          | 0/120000 [01:10<?, ?it/s]             

{'loss': 0.2526, 'learning_rate': 4.5625e-05, 'epoch': 87.5}


                                          
  0%|          | 0/120000 [01:12<?, ?it/s]             

{'loss': 0.2451, 'learning_rate': 4.541666666666667e-05, 'epoch': 91.67}


                                          
  0%|          | 0/120000 [01:13<?, ?it/s]             


{'loss': 0.2399, 'learning_rate': 4.520833333333334e-05, 'epoch': 95.83}


                                          , 369.50it/s][A
  0%|          | 0/120000 [01:15<?, ?it/s]             


{'loss': 0.2272, 'learning_rate': 4.5e-05, 'epoch': 100.0}


                                          , 380.27it/s][A
  0%|          | 0/120000 [01:16<?, ?it/s]             

{'loss': 0.2263, 'learning_rate': 4.4791666666666673e-05, 'epoch': 104.17}


                                          
  0%|          | 0/120000 [01:17<?, ?it/s]             

{'loss': 0.2209, 'learning_rate': 4.458333333333334e-05, 'epoch': 108.33}


                                          
  0%|          | 0/120000 [01:19<?, ?it/s]             

{'loss': 0.2146, 'learning_rate': 4.4375e-05, 'epoch': 112.5}


                                          
  0%|          | 0/120000 [01:20<?, ?it/s]             

{'loss': 0.2084, 'learning_rate': 4.4166666666666665e-05, 'epoch': 116.67}


                                          
  0%|          | 0/120000 [01:22<?, ?it/s]             

{'loss': 0.2025, 'learning_rate': 4.3958333333333336e-05, 'epoch': 120.83}


                                          
  0%|          | 0/120000 [01:23<?, ?it/s]             

{'loss': 0.1911, 'learning_rate': 4.375e-05, 'epoch': 125.0}


                                          
  0%|          | 0/120000 [01:25<?, ?it/s]             

{'loss': 0.1882, 'learning_rate': 4.354166666666667e-05, 'epoch': 129.17}




KeyboardInterrupt: 



In [14]:
'''Dense Network'''

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Activation, Dropout, BatchNormalization, SimpleRNN
from tensorflow.keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import backend as K

def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

model = Sequential()

model.add(Dense(200, input_dim=x_train_scaled.shape[1], activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(100, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(1))
model.add(Activation('sigmoid'))
# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy','Recall','Precision','AUC',f1_score])
model.summary()

stop_early = EarlyStopping(monitor='val_loss', patience=20)
model.fit(x_train_scaled, y=y_train, batch_size=64, 
              epochs=1000, verbose=1, 
              validation_data=(x_test_scaled, y_test),
              callbacks=[stop_early])

metrics=model.evaluate(x_test_scaled, y_test)

loss, accuracy, recall, precision, auc, f1 = metrics

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"AUC: {auc}")
print(f"F1 Score: {f1}")

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 200)               40200     
                                                                 
 dropout_5 (Dropout)         (None, 200)               0         
                                                                 
 batch_normalization_5 (Bat  (None, 200)               800       
 chNormalization)                                                
                                                                 
 dense_6 (Dense)             (None, 100)               20100     
                                                                 
 dropout_6 (Dropout)         (None, 100)               0         
                                                                 
 batch_normalization_6 (Bat  (None, 100)               400       
 chNormalization)                                     

In [15]:
'''Random Forest'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score

# Train Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(x_train_scaled, y_train)

# Predict
y_pred = model.predict(x_test_scaled)
y_proba = model.predict_proba(x_test_scaled)[:, 1]  # Get the probability of class 1

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"AUC: {auc}")
print(f"F1 Score: {f1}")

Accuracy: 0.7535059331175836
Recall: 0.7406181015452539
Precision: 0.7513997760358343
AUC: 0.8421592803718296
F1 Score: 0.7459699833240689
