In [2]:
'''Import libraries'''
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import wandb
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import os
from torch.utils.data import Dataset
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, Trainer, TrainingArguments
from tqdm import tqdm
from torch.nn import functional as F
import torch.nn as nn

wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33malberto-rodero557[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
'''Variables and parameters'''

SAMPLES_TO_TRAIN=10000
DIMENSIONS=200

N_LABELS=2
MAX_LEN = 256
EPOCHS=50
PATIENCE=10
LEARNING_RATE=.00005
WEIGHT_DECAY=.01
BATCH_SIZE=16
METRIC_FOR_BEST_MODEL='eval_loss'
if METRIC_FOR_BEST_MODEL=='eval_loss':
    GREATER_IS_BETTER = False
else:
    GREATER_IS_BETTER = True

In [4]:
'''Preparing dataset'''

df = pd.read_json(os.getcwd()+'/datasets/subtaskA_train_monolingual.jsonl', lines=True)
df = df[['text', 'label']]

df=df.sample(round(SAMPLES_TO_TRAIN))
# test_train_df=df.sample(round(SAMPLES_TO_TRAIN*.2))

# df = pd.read_json(os.getcwd()+'/datasets/subtaskA_dev_monolingual.jsonl', lines=True)
# df = df[['text', 'label']]

# val_df= df.sample(round(SAMPLES_TO_TRAIN*.2))
# test_dev_df= df.sample(round(SAMPLES_TO_TRAIN*.2))

# we balance the training set
print(f'Dataset size before balancing: {df.shape}')
counts = df['label'].value_counts()
sampler = RandomUnderSampler(random_state=42)
x_text, y = sampler.fit_resample(df[['text']], df['label'])

print(f'Dataset size after balancing: {x_text.shape}')
print(f'Entried dropped: {df.shape[0]-x_text.shape[0]}')

# Create a new balanced DataFrame
df = pd.DataFrame({'text': x_text['text'], 'label': y})

# Print the balanced DataFrame
print("\nBalanced DataFrame:")
print(df['label'].value_counts())

Dataset size before balancing: (10000, 2)
Dataset size after balancing: (9624, 1)
Entried dropped: 376

Balanced DataFrame:
label
0    4812
1    4812
Name: count, dtype: int64


In [5]:
'''loading glove'''
embeddings_index={}
with open('../0 playground and indoor/OtherData/glove.6B.200d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embeddings_index[word]=vectors
f.close()
print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [6]:
'''glove building'''

from nltk.tokenize import word_tokenize
from tqdm import tqdm 

def sent2vec(s):
    """ Function Creates a normalized vector for the whole sentence"""
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(200)
    return v / np.sqrt((v ** 2).sum())

print('Training df:')
df_x = np.array([sent2vec(x) for x in tqdm(df['text'])])
print(df_x.shape)
train_y=df['label']


Training df:


100%|██████████| 9624/9624 [00:14<00:00, 670.22it/s] 

(9624, 200)





In [7]:
'''Preparing for training'''

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize the StandardScaler
scaler = StandardScaler()
# Fit the scaler to the training data and transform the data
train_x = scaler.fit_transform(df_x)

import pickle

# Save the trained scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)


In [8]:
'''metrics'''

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'auc': auc,
        'precision': precision,
        'recall': recall,
    }

In [14]:
class Data(Dataset):
    def __init__(self, X_train, y_train):
        self.X = torch.from_numpy(X_train.astype(np.float32))
        self.y = torch.from_numpy(y_train).type(torch.LongTensor)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return {'input_ids': self.X[index], 'labels': self.y[index]}

    def __len__(self):
        return self.len

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y.values, test_size=0.2, random_state=42)
traindata = Data(X_train, y_train)
testdata = Data(X_test, y_test)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

# Train Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute Metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

metrics = {
    'accuracy': acc,
    'f1': f1,
    'auc': auc,
    'precision': precision,
    'recall': recall,
}

print(metrics)

{'accuracy': 0.7407792207792208, 'f1': 0.7402394586153046, 'auc': 0.7410130401354551, 'precision': 0.7277379733879222, 'recall': 0.753177966101695}


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a base model
rf = RandomForestClassifier(random_state=42)

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='f1')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_params_)

# Train and predict using the best model
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)


Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, roc_auc_score

# Train Random Forest
clf = RandomForestClassifier(n_estimators=200, random_state=42, min_samples_split=10,min_samples_leaf=2,max_depth=None)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Compute Metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

metrics = {
    'accuracy': acc,
    'f1': f1,
    'auc': auc,
    'precision': precision,
    'recall': recall,
}

print(metrics)

{'accuracy': 0.758961038961039, 'f1': 0.7615621788283659, 'auc': 0.7594512906235422, 'precision': 0.7395209580838323, 'recall': 0.784957627118644}


In [None]:
# first LSTM
# 'eval_loss': 0.4162973463535309, 'eval_accuracy': 0.8286620835536753, 'eval_f1': 0.8205980066445182
