In [1]:
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
import pandas as pd
from copy import deepcopy
from torch.utils.data import DataLoader, Dataset
import numpy as np
import torch

In [3]:
cleaned_train_context = pd.read_csv("../../input/cleaned_train_context_5fold.csv", lineterminator="\n")

In [4]:
cleaned_train_context.shape

(615170, 19)

In [5]:
cleaned_train_context

Unnamed: 0,topics_ids,content_ids,channel,topic_title,topic_description,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,topic_category,topic_language,content_title,content_description,content_text,content_kind,content_language,target,topic_fold,content_fold
0,t_3d9ad9931021,c_efb73ad83f4b,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,,Orientaciones profesorado Orientaciones profes...,document,es,0,3.0,2.0
1,t_3d9ad9931021,c_77574ef20c1f,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,Recurso Educativo Abierto Ciencias Naturales S...,¿Es magia No reacciones químicas ¿Es magia No ...,document,es,0,3.0,2.0
2,t_3d9ad9931021,c_200ae87baf4d,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,Recurso Educativo Abierto Geografía Historia S...,economía economía Obra publicada Licencia Crea...,document,es,0,3.0,2.0
3,t_3d9ad9931021,c_87e171afe50b,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,,curiosidad saludable curiosidad saludable OMS ...,document,es,0,3.0,2.0
4,t_3d9ad9931021,c_3c070b63a944,ebc86c,,BC Introduction Human Biology Grewal,Butte College [SEP] Campus Courses [SEP] Libre...,,Text [SEP] Introduction to Human Biology [SEP]...,,supplemental,en,,,Palabra caballero Palabra caballero Por nobles...,document,es,0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
615165,t_70da08637930,c_eb6448437b5f,c7ca13,dot cross x diagrams illustrate bonding electr...,dot cross x diagrams illustrate bonding electr...,The role of outer electrons in chemical bondin...,The role of outer electrons in chemical bondin...,Use dot and cross x diagrams to illustrate bon...,Materials in this folder have been reviewed by...,aligned,en,Level formation ionic bond electrovalent bond,,,exercise,en,0,1.0,5.0
615166,t_70da08637930,c_07c1da15995b,c7ca13,dot cross x diagrams illustrate bonding electr...,dot cross x diagrams illustrate bonding electr...,The role of outer electrons in chemical bondin...,The role of outer electrons in chemical bondin...,Use dot and cross x diagrams to illustrate bon...,Materials in this folder have been reviewed by...,aligned,en,Level formation ionic bond electrovalent bond,,,exercise,en,0,1.0,5.0
615167,t_70da08637930,c_17ff16d31106,c7ca13,dot cross x diagrams illustrate bonding electr...,dot cross x diagrams illustrate bonding electr...,The role of outer electrons in chemical bondin...,The role of outer electrons in chemical bondin...,Use dot and cross x diagrams to illustrate bon...,Materials in this folder have been reviewed by...,aligned,en,Level formation ionic bond electrovalent bond,,,exercise,en,0,1.0,5.0
615168,t_70da08637930,c_7cb9a57f2219,c7ca13,dot cross x diagrams illustrate bonding electr...,dot cross x diagrams illustrate bonding electr...,The role of outer electrons in chemical bondin...,The role of outer electrons in chemical bondin...,Use dot and cross x diagrams to illustrate bon...,Materials in this folder have been reviewed by...,aligned,en,Ligações iônicas covalentes metálicas,Existem três tipos principais ligações química...,,video,pt,0,1.0,4.0


In [4]:
tmp = deepcopy(cleaned_train_context)

In [5]:
num_cols = ["target", "topic_fold", "content_fold"]
obj_cols = [c for c in tmp.columns if c not in num_cols]

tmp[obj_cols] = tmp[obj_cols].fillna("")
tmp[num_cols] = tmp[num_cols].fillna(-1)

In [6]:
curr_text_cols = [
     "topic_title", 
     "topic_description", 
     "content_title", 
     "content_description", 
     "content_text"
]

context_text_cols = [
    "topic_parent_title", 
    "topic_parent_description", 
    "topic_child_title", 
    "topic_child_description"
]

In [7]:
def trim_length(x, is_context=False, cap_char_len=150):
    if not is_context:
        x = x[:cap_char_len]
    else:
        "[SEP]".join([i[:cap_char_len] for i in x.split("[SEP]")])
    return x

for col in tqdm(curr_text_cols, position=0, leave=True, total=len(curr_text_cols)):
    tmp[col] = tmp[col].apply(trim_length, cap_char_len=150)
    
for col in tqdm(context_text_cols, position=0, leave=True, total=len(context_text_cols)):
    tmp[col] = tmp[col].apply(trim_length, cap_char_len=150)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
def concatenate_str(x):
    x = list(filter(None, x.values))
    x = " [SEP] ".join(x)
    x = x.replace("[SEP] [SEP]", "[SEP]")  # Remove doubles.
        
    if not x:
        x = "Description does not exist"
        
    return x

tmp["text"] = tmp[curr_text_cols[:-1]+context_text_cols].apply(concatenate_str, axis=1)

In [9]:
tokenizer = AutoTokenizer.from_pretrained("../../input/model23/"+'tokenizer/')

def get_max_length(train):
    lengths = []
    for text in tqdm(train['text'].fillna("").values, total = len(train)):
        length = len(tokenizer(text, add_special_tokens = False)['input_ids'])
        lengths.append(length)
    max_len = max(lengths) + 2 # cls & sep
    print(f"max_len: {max_len}")
    return max_len, lengths

In [10]:
max_len, lengths = get_max_length(tmp)

  0%|          | 0/615170 [00:00<?, ?it/s]

max_len: 650


In [28]:
tmp.to_csv("../../input/prep_cleaned_train_context_5fold.csv", index=False)