## Load the data and perform an initial cleaning

In [2]:
from Models import FileManagement
from Models import DataManagement
import pandas as pd
import warnings
from transformers import AutoTokenizer
import numpy as np

# Silence the pandas warnings that the cleaning steps below would otherwise
# emit (chained-assignment and PerformanceWarning).
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Google Drive base directory; not referenced anywhere in this cell.
kati = "/content/drive/MyDrive/Σχολή/Επιλογής/Υπολογιστική Νοημοσύνη/"

file_path: str = "data/iphi2802.csv"

data: pd.DataFrame = FileManagement.Import.data(file_path=file_path).from_csv()

# NOTE(review): the return values of the ClearData helpers are ignored, so
# they presumably mutate `data` in place -- confirm against
# Models.DataManagement.

# remove unnecessary columns
DataManagement.ClearData().from_columns(data, column_names=['metadata', 'id', 'region_main', 'region_sub', 'date_str', 'date_circa'])
# remove whitespace
DataManagement.ClearData().from_whitespace(data, column_names=['text'])
# make all characters uppercase
DataManagement.ClearData().from_uppercase_letters(data, column_names=['text'])
# remove unneeded characters; raw strings keep the backslashes literal
# without relying on Python's deprecated handling of invalid escape
# sequences such as '\[' (the resulting string values are unchanged)
DataManagement.ClearData().from_characters(data, r_characters=[r'\[', r'\]', '-', r'\.'], character='', column_names=["text"])
# remove single-character words
DataManagement.ClearData().from_single_characters(data, column_names=['text'])

# Drop the rows whose text is missing. Assigning `data["text"].dropna()` back
# to the column does nothing: pandas re-aligns the shorter Series on the
# index and the dropped rows reappear as NaN.
data.dropna(subset=["text"], inplace=True)
print("Initial clear done and bert tokenized model is ready")

Initial clear done and bert tokenized model is ready


## Generate BERT Tokenized Text

In [None]:
# Break every text into a list of words, record how many words each row has,
# and keep only the rows that contain at least one word.
data["words"] = DataManagement.Preprocess().split_column_to_words(data["text"])
data['words_length'] = data['words'].apply(len)
data = data[data["words_length"] > 0]

# The raw text column is not used past this point.
DataManagement.ClearData().from_columns(dataframe=data, column_names=["text"])

# Load the pretrained Ancient Greek BERT tokenizer.
tokenizer = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
text_list = data["words"].to_list()

# Every encoded row is padded up to this many token ids.
MAX_LENGTH = 4469

# Encode each pre-split word list; `[0]` takes the first (only) row of the
# array returned for a single input.
encoded_text = [
    tokenizer.encode(
        words,
        add_special_tokens=True,
        is_split_into_words=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors="np",
    )[0]
    for words in text_list
]

final_text = np.stack(encoded_text, axis=0)

# Append the region ids and date bounds as the trailing columns of the
# dataset that is normalized in the next step.
trailing_columns = ['region_main_id', 'region_sub_id', 'date_min', 'date_max']
norm_dataset = np.hstack((final_text, data[trailing_columns].to_numpy()))
print("Bert Tokenization done!")

In [3]:
# Row layout of norm_dataset (and therefore of X_y):
#   [text_vectorized_0, ..., text_vectorized_n, region_main_id, region_sub_id, date_min, date_max]
# Number of trailing columns handed to the slicing helper; presumably these
# become y (date_min, date_max) -- confirm against Models.DataManagement.
N_TARGET_COLUMNS = 2

X_y = DataManagement.Preprocess().MinMaxScaler(norm_dataset)

X, y = DataManagement.Preprocess().slice_from_end_2d_array(
    X_y,
    N_TARGET_COLUMNS,
)
print("Data split into X and y")

Data split into X and y


# Create and Test Model

In [None]:
from Models import PyTorch_NN
import torch
import torch.nn as nn
from collections import OrderedDict

# Layer sizes. The input width follows the number of feature columns in X;
# the first hidden layer has 2*n - n/8 nodes and the second half of that.
INPUT_NODES = np.shape(X)[1]
OUTPUT_NODES = 2
HIDDEN_LAYER_NODES: int = int((2 * INPUT_NODES) - (INPUT_NODES/8))

# Dropout rates for the input layer and for the hidden layers.
R_IN = 0.5
R_H = 0.5

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of calling .cuda() unconditionally on every layer (which raises on
# CPU-only hosts).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

multilayer_dropout_model = PyTorch_NN.PyTorchModel(
    hidden_layer_nodes=HIDDEN_LAYER_NODES,
    epochs=200,
    batch_size=16,
    X=X, y=y,
    momentum=0.2,
    learning_rate=0.001,
    with_dropout=True,
    r_in=R_IN,
    r_h=R_H
)

# Replace the wrapper's network with an explicit two-hidden-layer MLP.
# NOTE(review): if PyTorchModel builds its optimizer in __init__ from the
# parameters of its default model, the optimizer will not see this
# replacement -- confirm against Models.PyTorch_NN.
multilayer_dropout_model.model = nn.Sequential(OrderedDict([
    ('dropoutIn', nn.Dropout(R_IN)),
    ('dense1', nn.Linear(INPUT_NODES, HIDDEN_LAYER_NODES)),
    ('act1', nn.ReLU()),
    ('dropoutH1', nn.Dropout(R_H)),
    ('dense2', nn.Linear(HIDDEN_LAYER_NODES, HIDDEN_LAYER_NODES // 2)),
    ('act2', nn.ReLU()),
    ('dropoutH2', nn.Dropout(R_H)),
    ('output', nn.Linear(HIDDEN_LAYER_NODES // 2, OUTPUT_NODES)),
    ('outAct', nn.Sigmoid()),
])).to(DEVICE)  # one move for the whole network; same placement as per-layer .cuda() on a GPU host

multilayer_dropout_model.print_model()

In [None]:
# Run the wrapper's train/test routine on the dropout model built in the
# previous cell, with early stopping switched off.
multilayer_dropout_model.train_test(with_early_stopping=False)