In [13]:
import os
import functools

import numpy as np
import pandas as pd
import torch

In [21]:
DATA_PATH = "./data/"

class Config:
    TRANSFORMER_CHECKPOINT = "allenai/longformer-base-4096"
    BATCH_SIZE = 2
    MAX_LENGTH = 1024
    STRIDE = 64    
    RANDOM_STATE = 42
    NUM_WORKERS = 2    
    NUM_FOLDS = 5
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [15]:
df_train = pd.read_csv(DATA_PATH + "train.csv")
df_train_orig = df_train.copy(deep = True)
df_train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [16]:
df_train["predictionstring"] = df_train.predictionstring.apply(lambda str: [int(item) for item in str.split()])
df_train["discoursetype"] = df_train.loc[:, "discourse_type"]
df_train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring,discoursetype
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...",Lead
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,"[45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 5...",Position
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,"[60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 7...",Evidence
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,"[76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 8...",Evidence
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,"[139, 140, 141, 142, 143, 144, 145, 146, 147, ...",Claim


In [17]:
df_train_onehot = pd.get_dummies(df_train, columns=["discoursetype"])
df_train_onehot = df_train_onehot.groupby(["id"], as_index=False).sum()
label_cols = [c for c in df_train_onehot.columns if c.startswith("discoursetype_") or c == "id"]
df_train_onehot = df_train_onehot[label_cols]
df_train_onehot.head()

Unnamed: 0,id,discoursetype_Claim,discoursetype_Concluding Statement,discoursetype_Counterclaim,discoursetype_Evidence,discoursetype_Lead,discoursetype_Position,discoursetype_Rebuttal
0,0000D23A521A,1,1,1,3,0,1,1
1,00066EA9880D,3,1,0,3,1,1,0
2,000E6DE9E817,5,1,1,3,0,1,1
3,001552828BD0,4,0,0,4,1,1,0
4,0016926B079C,7,0,0,3,0,1,0


In [18]:
def create_multilabel_targets(data_row, label_cols):
    targets = []
    for col in label_cols:
        targets.append(data_row[col])
    return targets

In [19]:
# For each essay, there can be multiple discourse_types, the target which is discourse type is thus multilabel
# For each essay this multilabel target column needs to be created first 

if "id" in label_cols:
    label_cols.remove("id")
df_train_onehot["targets"] = df_train_onehot.apply(lambda row: create_multilabel_targets(row, label_cols), axis=1)
df_train_onehot["targets_str"] = df_train_onehot.targets.apply(lambda x: ",".join([str(item) for item in x]))
df_train_onehot["kfold"] = -1
df_train_onehot.head()

Unnamed: 0,id,discoursetype_Claim,discoursetype_Concluding Statement,discoursetype_Counterclaim,discoursetype_Evidence,discoursetype_Lead,discoursetype_Position,discoursetype_Rebuttal,targets,targets_str,kfold
0,0000D23A521A,1,1,1,3,0,1,1,"[1, 1, 1, 3, 0, 1, 1]",1113011,-1
1,00066EA9880D,3,1,0,3,1,1,0,"[3, 1, 0, 3, 1, 1, 0]",3103110,-1
2,000E6DE9E817,5,1,1,3,0,1,1,"[5, 1, 1, 3, 0, 1, 1]",5113011,-1
3,001552828BD0,4,0,0,4,1,1,0,"[4, 0, 0, 4, 1, 1, 0]",4004110,-1
4,0016926B079C,7,0,0,3,0,1,0,"[7, 0, 0, 3, 0, 1, 0]",7003010,-1


In [22]:
import multilabel_stratsplit_utils as mss
df_train_onehot = mss.skml_multilabel_stratified_kfold_cv_split(df_train_onehot, label_cols, Config.NUM_FOLDS)
df_train_onehot.kfold.value_counts()

2    3122
3    3121
4    3121
0    3119
1    3111
Name: kfold, dtype: int64

In [23]:
df_stats = mss.get_train_val_split_stats(df_train_onehot, Config.NUM_FOLDS, label_cols)
df_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,"(6,)","(2,)","(5,)","(1,)","(0,)","(3,)","(4,)"
fold,counts,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,train_count,2921.0,3693.0,12288.0,10735.0,11941.0,12440.0,7441.0
0,val_count,677.0,883.0,3078.0,2683.0,2986.0,3110.0,1860.0
0,val_train_ratio,0.23177,0.239101,0.250488,0.24993,0.250063,0.25,0.249966
1,train_count,2853.0,3642.0,12298.0,10734.0,11942.0,12440.0,7434.0
1,val_count,745.0,934.0,3068.0,2684.0,2985.0,3110.0,1867.0
1,val_train_ratio,0.261129,0.256452,0.249471,0.250047,0.249958,0.25,0.251143
2,train_count,2898.0,3693.0,12300.0,10735.0,11942.0,12440.0,7436.0
2,val_count,700.0,883.0,3066.0,2683.0,2985.0,3110.0,1865.0
2,val_train_ratio,0.241546,0.239101,0.249268,0.24993,0.249958,0.25,0.250807
3,train_count,2838.0,3615.0,12284.0,10734.0,11941.0,12440.0,7455.0


In [24]:
from collections import defaultdict

ner_labels = df_train.discourse_type.unique().tolist()
labels = defaultdict()

for index, lbl in enumerate(ner_labels):
    labels[f"B-{lbl}"] = index
    labels[f"I-{lbl}"] = index + len(ner_labels)

labels[f"O"] = 2 * len(ner_labels)
labels[f"Special"] = -100

ids_to_labels = {value: key for key, value in labels.items()}

In [25]:
def read_text(file_path):
    with open(file_path, "r") as file:
        text = file.read()
        return text

In [26]:
df_train_grouped = df_train.groupby(["id"])
essay_id = pd.Series([*df_train_grouped.groups.keys()])
text = essay_id.apply(lambda id: read_text(f"{DATA_PATH}train/{id}.txt"))
df_text = pd.concat([essay_id, text], axis=1, keys=["id", "text"])
df_text["text_length"] = df_text.text.apply(lambda text: len(text.split()))
df_ner_labelslist = df_train_grouped["discourse_type"].apply(lambda x:list(x.sort_values())).reset_index(name="ner_labelslist")
df_discourse_start = df_train_grouped["discourse_start"].apply(list).reset_index(name="discourse_start")
df_discourse_end = df_train_grouped["discourse_end"].apply(list).reset_index(name="discourse_end")
df_predictionsstring = df_train_grouped["predictionstring"].apply(list).reset_index(name="predictionstring")
df_train_onehot = df_train_onehot[["id", "targets", "targets_str", "kfold"]]
df_list = [df_train_onehot, df_ner_labelslist, df_discourse_start, df_discourse_end, df_predictionsstring, df_text]
df_train_merged = functools.reduce(lambda df1, df2: pd.merge(left=df1, right=df2, on=["id"], how="inner"), df_list)
df_train_merged.head()

Unnamed: 0,id,targets,targets_str,kfold,ner_labelslist,discourse_start,discourse_end,predictionstring,text,text_length
0,0000D23A521A,"[1, 1, 1, 3, 0, 1, 1]",1113011,3,"[Claim, Concluding Statement, Counterclaim, Ev...","[0.0, 170.0, 358.0, 438.0, 627.0, 722.0, 836.0...","[170.0, 357.0, 438.0, 626.0, 722.0, 836.0, 101...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","Some people belive that the so called ""face"" o...",251
1,00066EA9880D,"[3, 1, 0, 3, 1, 1, 0]",3103110,0,"[Claim, Claim, Claim, Concluding Statement, Ev...","[0.0, 456.0, 638.0, 738.0, 1399.0, 1488.0, 231...","[455.0, 592.0, 738.0, 1398.0, 1487.0, 2219.0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",Driverless cars are exaclty what you would exp...,646
2,000E6DE9E817,"[5, 1, 1, 3, 0, 1, 1]",5113011,3,"[Claim, Claim, Claim, Claim, Claim, Concluding...","[17.0, 64.0, 158.0, 310.0, 438.0, 551.0, 776.0...","[56.0, 157.0, 309.0, 422.0, 551.0, 775.0, 961....","[[2, 3, 4, 5, 6, 7, 8], [10, 11, 12, 13, 14, 1...",Dear: Principal\n\nI am arguing against the po...,274
3,001552828BD0,"[4, 0, 0, 4, 1, 1, 0]",4004110,4,"[Claim, Claim, Claim, Claim, Evidence, Evidenc...","[0.0, 161.0, 872.0, 958.0, 1191.0, 1542.0, 161...","[160.0, 872.0, 957.0, 1190.0, 1541.0, 1612.0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",Would you be able to give your car up? Having ...,512
4,0016926B079C,"[7, 0, 0, 3, 0, 1, 0]",7003010,0,"[Claim, Claim, Claim, Claim, Claim, Claim, Cla...","[0.0, 58.0, 94.0, 206.0, 236.0, 272.0, 542.0, ...","[57.0, 91.0, 150.0, 235.0, 271.0, 542.0, 650.0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, ...",I think that students would benefit from learn...,261


In [27]:
def label_words(row):    
    words = row["text"].split()
    word_labels = ["O" for word in words]
    word_label_ids = [labels["O"] for word in words]
    for idx, label in enumerate(row["ner_labelslist"]):
        word_idx = row["predictionstring"][idx]        
        # As per the NER IOB tagging scheme
        # The starting word of the discourse has label B-
        word_labels[word_idx[0]] = f"B-{label}"
        word_label_ids[word_idx[0]] = labels[f"B-{label}"]
        # All other words of the discourse have label I-
        for index in word_idx[1:]:
            word_labels[index] = f"I-{label}"
            word_label_ids[index] = labels[f"I-{label}"]        
    row["word_labels"] = word_labels
    row["word_label_ids"] = word_label_ids
    return row

In [28]:
df_train_final = df_train_merged.apply(lambda row: label_words(row), axis=1)
# numeric id to identify a unique essay
df_train_final["essay_id"] = pd.Series([item for item in range(len(df_train_final))])
df_train_final.head()

Unnamed: 0,id,targets,targets_str,kfold,ner_labelslist,discourse_start,discourse_end,predictionstring,text,text_length,word_labels,word_label_ids
0,0000D23A521A,"[1, 1, 1, 3, 0, 1, 1]",1113011,3,"[Claim, Concluding Statement, Counterclaim, Ev...","[0.0, 170.0, 358.0, 438.0, 627.0, 722.0, 836.0...","[170.0, 357.0, 438.0, 626.0, 722.0, 836.0, 101...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...","Some people belive that the so called ""face"" o...",251,"[B-Claim, I-Claim, I-Claim, I-Claim, I-Claim, ...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
1,00066EA9880D,"[3, 1, 0, 3, 1, 1, 0]",3103110,0,"[Claim, Claim, Claim, Concluding Statement, Ev...","[0.0, 456.0, 638.0, 738.0, 1399.0, 1488.0, 231...","[455.0, 592.0, 738.0, 1398.0, 1487.0, 2219.0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",Driverless cars are exaclty what you would exp...,646,"[B-Claim, I-Claim, I-Claim, I-Claim, I-Claim, ...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
2,000E6DE9E817,"[5, 1, 1, 3, 0, 1, 1]",5113011,3,"[Claim, Claim, Claim, Claim, Claim, Concluding...","[17.0, 64.0, 158.0, 310.0, 438.0, 551.0, 776.0...","[56.0, 157.0, 309.0, 422.0, 551.0, 775.0, 961....","[[2, 3, 4, 5, 6, 7, 8], [10, 11, 12, 13, 14, 1...",Dear: Principal\n\nI am arguing against the po...,274,"[O, O, B-Claim, I-Claim, I-Claim, I-Claim, I-C...","[14, 14, 3, 10, 10, 10, 10, 10, 10, 14, 3, 10,..."
3,001552828BD0,"[4, 0, 0, 4, 1, 1, 0]",4004110,4,"[Claim, Claim, Claim, Claim, Evidence, Evidenc...","[0.0, 161.0, 872.0, 958.0, 1191.0, 1542.0, 161...","[160.0, 872.0, 957.0, 1190.0, 1541.0, 1612.0, ...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13...",Would you be able to give your car up? Having ...,512,"[B-Claim, I-Claim, I-Claim, I-Claim, I-Claim, ...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10..."
4,0016926B079C,"[7, 0, 0, 3, 0, 1, 0]",7003010,0,"[Claim, Claim, Claim, Claim, Claim, Claim, Cla...","[0.0, 58.0, 94.0, 206.0, 236.0, 272.0, 542.0, ...","[57.0, 91.0, 150.0, 235.0, 271.0, 542.0, 650.0...","[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [10, 11, 12, ...",I think that students would benefit from learn...,261,"[B-Claim, I-Claim, I-Claim, I-Claim, I-Claim, ...","[3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 3, 10,..."


In [None]:
df_train_final.to_pickle("./data/df_train_final.pkl")