In [None]:
import os
import pandas as pd

In [None]:
# Create DataFrame
# Load the 'Paragraph' and 'IC' columns of every .xlsx workbook found under
# `directory` (searched recursively) into a single DataFrame, `coded_data`.
directory = '../Data/Coded/'
coded_columns = ['Paragraph', 'IC']

# Collect the per-file frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
frames = []

# Iterate through the files
for root, dirs, files in os.walk(directory):
    # os.walk yields entries in arbitrary order; sorting `dirs` in place fixes
    # the traversal order so the row order of coded_data is reproducible.
    dirs.sort()
    for file in sorted(files):
        # Skip '~$...' owner/lock files that Excel creates next to an open
        # workbook: they end in .xlsx but are not readable workbooks.
        if file.endswith('.xlsx') and not file.startswith('~$'):
            file_path = os.path.join(root, file)
            # Read in the xlsx file
            frames.append(pd.read_excel(file_path, usecols=coded_columns))

if frames:
    coded_data = pd.concat(frames, ignore_index=True)
else:
    # No workbook found: keep an empty frame, but with the expected columns so
    # that later column lookups do not fail with a KeyError.
    coded_data = pd.DataFrame(columns=coded_columns)

In [None]:
# Train BERT
from tqdm import tqdm 
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

In [None]:
# Tokenize data
# Turn the coded paragraphs into fixed-length BERT inputs:
#   input_ids       - token ids, one row per paragraph, padded to max_len
#   attention_masks - mask tensor returned by the tokenizer for the same rows
#   labels          - the 'IC' value of each kept row as a float tensor
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 512  # paragraphs longer than this many tokens are truncated

# Empty spreadsheet cells are read by pandas as NaN. A NaN paragraph makes the
# tokenizer raise, and a NaN label would end up as NaN in `labels`, so rows
# missing either value are left out. Text and labels are both taken from the
# same filtered frame so they stay aligned row-for-row.
labeled_rows = coded_data.dropna(subset=['Paragraph', 'IC'])
if labeled_rows.empty:
    raise ValueError('No rows with both a Paragraph and an IC value to tokenize.')

# Cast to str so cells that Excel stored as numbers are still accepted by the
# tokenizer, which only takes strings.
paragraphs = labeled_rows['Paragraph'].astype(str).tolist()

# One batched call replaces the per-row encode_plus loop followed by torch.cat;
# with padding='max_length' every row has the same length, so the tokenizer
# returns the stacked (n_rows, max_len) tensors directly.
encoded_text = tokenizer(
    paragraphs,
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_text['input_ids']
attention_masks = encoded_text['attention_mask']
labels = torch.tensor(labeled_rows['IC'].to_numpy(), dtype=torch.float)