# In [None]:
import pickle
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from transformers import RobertaTokenizer, TFRobertaModel, BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.session import SparkSession

def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a bucket and print a confirmation line.

    Args:
        bucket_name: Name passed to the storage client's ``get_bucket``.
        source_file_name: Path of the local file handed to
            ``upload_from_filename``.
        destination_blob_name: Name of the blob created inside the bucket.
    """
    # NOTE(review): `storage` is not imported in the visible part of this
    # file -- presumably `from google.cloud import storage`; confirm it is
    # brought into scope before this function is called.
    target_bucket = storage.Client().get_bucket(bucket_name)
    target_blob = target_bucket.blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    message = 'File {} uploaded to {}.'.format(source_file_name, destination_blob_name)
    print(message)


# Load the three dataset splits (paths are relative to the working directory).
train_df, test_df, validation_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Downloads/train.csv",
        "Downloads/test.csv",
        "Downloads/validation.csv",
    )
)

# Define preprocessing function
def preprocess_data(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Steps, in order:

    1. Missing ``topic_name`` values are replaced by that column's most
       frequent value, computed over every input row.
    2. Rows with a missing ``question`` or ``exp`` are dropped.
    3. Missing ``opa``/``opb``/``opc``/``opd`` values are replaced by each
       column's most frequent value among the remaining rows.
    4. ``cop`` is replaced by the integer codes of a freshly fitted
       ``LabelEncoder``.

    Args:
        df: DataFrame containing the columns ``topic_name``, ``question``,
            ``exp``, ``opa``, ``opb``, ``opc``, ``opd`` and ``cop``.

    Returns:
        A new DataFrame with the transformations above applied.
    """
    option_cols = ['opa', 'opb', 'opc', 'opd']

    # Take the mode over all input rows, i.e. before any row is dropped.
    topic_modes = df['topic_name'].mode()

    # .copy() yields an independent frame, so the assignments below neither
    # modify the caller's data nor rely on chained in-place assignment
    # (which does nothing under pandas Copy-on-Write).
    df = df.dropna(subset=["question", "exp"]).copy()

    # mode() is empty when a column has no non-null value; in that case
    # there is nothing to fill with, so the NaNs are left as they are.
    if not topic_modes.empty:
        df['topic_name'] = df['topic_name'].fillna(topic_modes.iloc[0])

    option_modes = df[option_cols].mode()
    if not option_modes.empty:
        df[option_cols] = df[option_cols].fillna(option_modes.iloc[0])

    # NOTE: a new encoder is fitted on every call, so two frames only get the
    # same label -> code mapping when they contain the same set of labels.
    label_encoder = LabelEncoder()
    df['cop'] = label_encoder.fit_transform(df['cop'])
    return df

# Clean the training and validation splits with the same routine
train_df = preprocess_data(train_df)
validation_df = preprocess_data(validation_df)

# Training split: the question text is the model input; the remaining
# columns are kept as separate target series
X_train, y_cop_train, y_exp_train, y_topic_name_train = (
    train_df[column] for column in ('question', 'cop', 'exp', 'topic_name')
)

# Validation split: only the question text and the encoded answer label
X_val, y_cop_val = (validation_df[column] for column in ('question', 'cop'))

# BERT: checkpoint name, tokenizer and sequence-classification model
PRETRAINED_LM = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_LM, do_lower_case=True)

# One output class per distinct value of the encoded `cop` column
N_labels = len(train_df['cop'].unique())

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_LM,
    num_labels=N_labels,
    output_attentions=False,
    output_hidden_states=False,
)

# Tokenize and encode data for BERT
def encode(docs):
    """Tokenize *docs* with the module-level ``tokenizer``.

    Every document is padded or truncated to 128 tokens (special tokens
    included) and the result is requested as PyTorch tensors.

    Args:
        docs: List of input texts.

    Returns:
        Tuple ``(input_ids, attention_masks)`` taken from the tokenizer
        output.
    """
    batch = tokenizer.batch_encode_plus(
        docs,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return batch['input_ids'], batch['attention_mask']

# Tokenize both splits once, up front
train_input_ids, train_att_masks = encode(X_train.tolist())
valid_input_ids, valid_att_masks = encode(X_val.tolist())

# Build DataLoaders for BERT
BATCH_SIZE = 16

# Training batches are drawn in random order
train_dataset = TensorDataset(
    train_input_ids,
    train_att_masks,
    torch.LongTensor(y_cop_train.tolist()),
)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)

# Validation batches keep the original row order
valid_dataset = TensorDataset(
    valid_input_ids,
    valid_att_masks,
    torch.LongTensor(y_cop_val.tolist()),
)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, sampler=valid_sampler)

# Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

# AdamW optimizer; StepLR multiplies the learning rate by `gamma` after
# every `step_size` calls to scheduler.step()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)



# Train BERT model
#
# Each epoch runs one optimisation pass over `train_dataloader`, then one
# gradient-free pass over `valid_dataloader`, and prints the mean loss of both.
EPOCHS = 15
for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)

    # Training
    model.train()
    train_loss = 0.0
    for step_num, batch_data in enumerate(tqdm(train_dataloader, desc='Training')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]

        # Clear gradients left over from the previous batch before the new
        # forward/backward pass.
        optimizer.zero_grad()
        output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()

        # Accumulate so the reported figure is the epoch mean rather than
        # whatever the last batch happened to produce.
        train_loss += loss.item()

    # Step the scheduler once per epoch. Stepping it per batch would make
    # StepLR(step_size=5, gamma=0.1) cut the learning rate tenfold every five
    # batches, driving it to effectively zero within the first epoch.
    scheduler.step()

    # Validation
    model.eval()
    valid_loss = 0.0
    valid_pred = []
    with torch.no_grad():
        for batch_data in tqdm(valid_dataloader, desc='Validation'):
            input_ids, att_mask, labels = [data.to(device) for data in batch_data]
            output = model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
            valid_loss += output.loss.item()
            # Predicted class index per example, kept on the CPU
            valid_pred.append(torch.argmax(output.logits.cpu().detach(), axis=-1))

    # Calculate and print losses
    # `len(...) or 1` guards against an empty loader. The built-in max() is
    # deliberately avoided: `from pyspark.sql.functions import *` at the top
    # of the file shadows it.
    train_loss /= (len(train_dataloader) or 1)
    valid_loss /= (len(valid_dataloader) or 1)
    print(f'Training Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')
