In [1]:
!pip install datasets transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Using cached datasets-2.10.1-py3-none-any.whl (469 kB)
Collecting transformers[sentencepiece]
  Using cached transformers-4.26.1-py3-none-any.whl (6.3 MB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Using cached huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess
  Using cached multiprocess-0.70.14-py38-none-any.whl (132 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collectin

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from transformers import AutoTokenizer

In [4]:
# ===============================================================
#  DataLoading
# ===============================================================

def check_contains(row):
    """Label one (topic, candidate) row: 1 if the candidate is a ground-truth id, else 0.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` or a ``dict``)
        Must provide ``row["content_ids"]`` (an iterable of ground-truth ids)
        and ``row["predictions"]`` (a single candidate id).

    Returns
    -------
    int
        ``1`` when ``predictions`` is contained in ``content_ids``; ``0``
        otherwise, including when ``content_ids`` is not iterable
        (e.g. ``NaN`` / ``None`` for a topic without ground truth).

    Raises
    ------
    KeyError
        If ``row`` lacks one of the two required keys. This is a caller
        error and is deliberately not converted into a ``0`` label.
    """
    try:
        # content_ids is present and iterable
        ground_truth = set(row["content_ids"])
        return int(row["predictions"] in ground_truth)
    except TypeError:
        # content_ids is empty/missing (NaN or None is not iterable), or an id
        # is unhashable: there is nothing to match against, so label 0.
        # Only TypeError is caught so that interrupts and genuine bugs are not
        # silently turned into negative labels.
        return 0
    

def determine(df_topics):
    """Expand each topic's candidate list into one labelled row per candidate.

    Parameters
    ----------
    df_topics : pandas.DataFrame
        Needs the columns ``topic_id``, ``predictions`` (whitespace-separated
        candidate ids as one string) and ``content_ids`` (whitespace-separated
        ground-truth ids as one string; may be missing/NaN).

    Returns
    -------
    pandas.DataFrame
        Columns ``topic_id``, ``predictions`` (a single candidate id per row)
        and ``target`` (the value returned by ``check_contains`` for the row).
        The input frame is left unmodified.
    """
    # Work on an explicit copy: assigning into a column slice of the caller's
    # frame is what triggers pandas' SettingWithCopyWarning.
    pairs = df_topics[["topic_id", "predictions"]].copy()
    pairs["predictions"] = pairs["predictions"].str.split()
    pairs = pairs.explode("predictions", ignore_index=True)
    pairs = pd.merge(pairs,
                     df_topics[["topic_id", "content_ids"]],
                     on="topic_id", how="left")
    # A topic without ground truth has NaN here; map it to an empty id list
    # instead of failing on ``NaN.split()``.
    pairs["content_ids"] = pairs["content_ids"].fillna("").str.split()
    tqdm.pandas()  # registers DataFrame.progress_apply
    pairs["target"] = pairs.progress_apply(check_contains, axis=1)
    return pairs.drop(columns="content_ids")


def tokenize(cfg, text, text_pair):
    """Encode a ``(text, text_pair)`` pair and return only its ``input_ids``.

    The result is used to measure token length for sorting. The pair is
    encoded with ``cfg.tokenizer.encode_plus``, special tokens included and
    truncation enabled with ``cfg.max_len`` as the maximum length; no padding
    argument is passed.
    """
    encode_options = dict(
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg.max_len,
        truncation=True,
    )
    encoded = cfg.tokenizer.encode_plus(text, text_pair, **encode_options)
    return encoded["input_ids"]


def prepare_df(cfg, df_corr, df_topics, df_content, data_type):
    """Build the labelled (topic, candidate content) frame for one split.

    Parameters
    ----------
    cfg : object
        Reads ``cfg.input_dir`` (directory prefix of the candidate pickles),
        ``cfg.debug`` (if true, only 2 sampled rows of the pickle are used)
        and, for validation, whatever ``tokenize`` reads from ``cfg``.
    df_corr : pandas.DataFrame
        Needs ``topic_id`` and ``content_ids`` (whitespace-separated ids).
    df_topics : pandas.DataFrame
        Needs ``topic_id``, ``topic_sentence`` and ``topic_language``.
    df_content : pandas.DataFrame
        Needs ``id``, ``content_sentence`` and ``content_language``.
    data_type : str
        ``"train"`` or ``"validation"``.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, candidate) with ``topic_id``, ``predictions``,
        ``target`` and the topic/content sentence and language columns.
        For validation a ``tokenize_length`` column is added and rows are
        sorted by it in ascending order.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``"train"`` nor ``"validation"``.
    """
    print(f"{data_type} loading...")
    candidate_files = {
        "train": "train_top_50_ver2.pkl",
        "validation": "validation_top_50.pkl",
    }
    if data_type not in candidate_files:
        # Previously this fell through and failed later with an unbound `path`.
        raise ValueError(
            f"data_type must be 'train' or 'validation', got {data_type!r}"
        )
    path = cfg.input_dir + candidate_files[data_type]

    # NOTE(security): pickle.load can execute arbitrary code; only point
    # cfg.input_dir at files you produced yourself.
    with open(path, "rb") as f:
        loaded_list = pickle.load(f)
    # presumably a mapping topic_id -> candidate content ids — TODO confirm
    df = pd.DataFrame.from_dict(loaded_list, orient='index').reset_index()
    df = df.rename(columns={"index": "topic_id"})

    if cfg.debug:
        df = df.sample(n=2, random_state=42).reset_index(drop=True)

    # Join every non-null value of a row except the topic id into one
    # space-separated string. The topic column position is looked up once
    # instead of once per cell.
    topic_col = df.columns.get_loc("topic_id")
    df["predictions"] = df.apply(
        lambda row: " ".join(
            str(val) for idx, val in enumerate(row)
            if pd.notna(val) and idx != topic_col
        ),
        axis=1,
    )
    df = df[["topic_id", "predictions"]]
    df = pd.merge(df, df_corr[["topic_id", "content_ids"]], on="topic_id", how="left")

    if data_type == "train":
        # Add the positive examples only for train. A missing content_ids is
        # treated as "" so it cannot turn the whole predictions string into NaN.
        positives = df["content_ids"].fillna("")
        merged_ids = (df["predictions"] + " " + positives).str.split()
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # resulting row order is the same on every run (a set's is not).
        df["predictions"] = merged_ids.apply(lambda ids: " ".join(dict.fromkeys(ids)))

    df = determine(df)

    df = pd.merge(df, df_topics[["topic_id", "topic_sentence", "topic_language"]], on="topic_id", how="left")
    df = pd.merge(df, df_content[["id", "content_sentence", "content_language"]].rename(columns={"id":"predictions"}),
                  on="predictions", how="left")

    # Show up to two example pairs (fewer if the frame is shorter).
    for i in range(min(2, len(df))):
        print(f'Input Example[{i}]: \n    topic  {df["topic_sentence"].values[i]}\n content  {df["content_sentence"].values[i]}')

    if data_type == "validation":
        # Sort by encoded length.
        topic_iter = tqdm(df['topic_sentence'].values, desc="encode sentence to sort by token length")
        df['tokenize_length'] = [
            len(tokenize(cfg, text, text_pair))
            for text, text_pair in zip(topic_iter, df["content_sentence"].values)
        ]
        df = df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
    return df


def load_data(cfg):
    """Read the raw CSVs, build the sentence columns and return both splits.

    Parameters
    ----------
    cfg : object
        ``cfg.tokenizer`` must already be set, because its ``sep_token`` is
        used to join the text fields. ``cfg`` is also passed on to
        ``prepare_df``. An optional ``cfg.data_dir`` (directory prefix ending
        in a separator) overrides where ``topics.csv``, ``content.csv`` and
        ``correlations.csv`` are read from; when absent the original
        Google Drive location is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_valid)`` as produced by ``prepare_df``. The ``target``
        value counts of both frames are printed before returning.
    """
    # Single place for the data root instead of three repeated literals.
    data_dir = getattr(cfg, "data_dir", "/content/drive/MyDrive/KAGGLE-LECR/")

    df_topics = pd.read_csv(data_dir + "topics.csv").fillna({"title":"", "description":""})
    df_content = pd.read_csv(data_dir + "content.csv").fillna({"title":" ", "description":"", "text":""})
    df_corr = pd.read_csv(data_dir + "correlations.csv")

    sep = cfg.tokenizer.sep_token

    # language
    df_topics["topic_language"] = df_topics["language"]
    df_content["content_language"] = df_content["language"]

    # content sentence: title <sep> description
    df_content["content_sentence"] = df_content["title"] + sep + df_content["description"]

    # topic sentence: title <sep> description <sep> context
    df_topics["topic_sentence"] = (
        df_topics["title"] + sep + df_topics["description"] + sep + df_topics["context"]
    )

    # " >> " is a literal separator, not a pattern; regex=False makes that
    # explicit and independent of the pandas version's default.
    df_topics["topic_sentence"] = df_topics["topic_sentence"].str.replace(" >> ", " ", regex=False)
    df_topics = pd.merge(df_topics, df_corr, on="topic_id", how="left")

    df_train = prepare_df(cfg, df_corr, df_topics, df_content, data_type="train")
    df_valid = prepare_df(cfg, df_corr, df_topics, df_content, data_type="validation")

    print("train: \n", df_train["target"].value_counts())
    print("valid: \n", df_valid["target"].value_counts())
    return df_train, df_valid


def tokenizer(cfg):
    """Load the tokenizer for ``cfg.model`` and attach it to ``cfg``.

    Side effects: sets ``cfg.tokenizer`` and saves the tokenizer files to
    ``cfg.output_dir + 'tokenizer/'``.

    Returns
    -------
    None
    """
    # `use_fast` is the keyword AutoTokenizer.from_pretrained documents for
    # requesting the fast implementation; `is_fast` is an attribute of an
    # already-loaded tokenizer, not a loader option.
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model, use_fast=True)
    cfg.tokenizer.save_pretrained(cfg.output_dir + 'tokenizer/')
    return None

In [5]:
class CFG:
    """Static settings for building the exp006 / fold0 train and validation frames."""

    # Prefix for files written by this notebook's helpers (e.g. 'tokenizer/').
    output_dir = ""
    # Directory holding the candidate pickles; also used as the CSV output prefix.
    input_dir = "/content/drive/MyDrive/KAGGLE-LECR/last_data/1st/exp006/fold0/"
    # Fine-tuned model directory, located inside input_dir.
    model = input_dir + "sentence-transformers-all-mpnet-base-v2_fine-tuned"
    # When True, prepare_df samples only 2 rows of each candidate pickle.
    debug = False
    # Maximum encoded length passed to the tokenizer (truncation limit).
    max_len = 256

In [6]:
tokenizer(CFG)

In [7]:
import pickle
from tqdm.auto import tqdm

In [8]:
df_train, df_valid = load_data(CFG)

train loading...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


  0%|          | 0/2741517 [00:00<?, ?it/s]

Input Example[0]: 
    topic  Откриването на резисторите</s>Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервателен уред. </s>Khan Academy (български език) Наука Физика Открития и проекти Откриването на резисторите
 content  Предизвикателство с компас</s>Отговори на няколко въпроса за откритията, свързани с компаса
Input Example[1]: 
    topic  Откриването на резисторите</s>Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервателен уред. </s>Khan Academy (български език) Наука Физика Открития и проекти Откриването на резисторите
 content  Отблъскване на електронните двойки във валентния слой при 4 електронни облака</s>Отблъскване на електронните двойки във валентния слой при 4 електронни облака
и примери за тетраедрични, тригонално-пирамидални и огънати молекули.


validation loading...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


  0%|          | 0/359500 [00:00<?, ?it/s]

Input Example[0]: 
    topic  Quiz: materials and techniques</s></s>Khan Academy (English - CBSE India Curriculum) Arts and humanities Special topics in art history Creating & conserving  Quiz: materials and techniques
 content  Decoding art: the Staff God relief from Chavín de Huántar (part 2)</s>This video supports the Smarthistory essay here:
https://smarthistory.org/staff-god-chavin/ Speaker: Dr. Naraelle Hohensee


Input Example[1]: 
    topic  Quiz: materials and techniques</s></s>Khan Academy (English - CBSE India Curriculum) Arts and humanities Special topics in art history Creating & conserving  Quiz: materials and techniques
 content  Decoding art: the Staff God relief from Chavín de Huántar (part 1)</s>This video supports the Smarthistory essay here:
https://smarthistory.org/staff-god-chavin/ Speaker: Dr. Naraelle Hohensee




encode sentence to sort by token length:   0%|          | 0/359500 [00:00<?, ?it/s]

train: 
 0    2495837
1     245680
Name: target, dtype: int64
valid: 
 0    332563
1     26937
Name: target, dtype: int64


In [9]:
df_train

Unnamed: 0,topic_id,predictions,target,topic_sentence,topic_language,content_sentence,content_language
0,t_00004da3a1b2,c_0e4fd950a390,0,Откриването на резисторите</s>Изследване на ма...,bg,Предизвикателство с компас</s>Отговори на няко...,bg
1,t_00004da3a1b2,c_2bd2dc99af8a,0,Откриването на резисторите</s>Изследване на ма...,bg,Отблъскване на електронните двойки във валентн...,bg
2,t_00004da3a1b2,c_17428d1c4cea,0,Откриването на резисторите</s>Изследване на ма...,bg,"Големина на електричното поле, създадено от за...",bg
3,t_00004da3a1b2,c_76ee9707f381,0,Откриването на резисторите</s>Изследване на ма...,bg,Елекромагнитно поле (отгоре и отдолу)</s>Прове...,bg
4,t_00004da3a1b2,c_a1ff66b3434f,0,Откриването на резисторите</s>Изследване на ма...,bg,Проводимост на електролити (зависимост от разс...,bg
...,...,...,...,...,...,...,...
2741512,t_fffe14f1be1e,c_0b15daf069f2,0,Lección 7</s></s>CREE Para el Estudiante I Cic...,es,Quién es caperucita roja</s>,es
2741513,t_fffe14f1be1e,c_fa214876a5e2,0,Lección 7</s></s>CREE Para el Estudiante I Cic...,es,¡A jugar con las letras!</s>,es
2741514,t_fffe14f1be1e,c_cece166bad6a,1,Lección 7</s></s>CREE Para el Estudiante I Cic...,es,Juego con las palabras</s>,es
2741515,t_fffe14f1be1e,c_bc3f72d46bf3,0,Lección 7</s></s>CREE Para el Estudiante I Cic...,es,Me instruyo para saber más</s>,es


In [10]:
df_valid

Unnamed: 0,topic_id,predictions,target,topic_sentence,topic_language,content_sentence,content_language,tokenize_length
0,t_30dd476279c8,c_38ac2babd9bd,0,Medicine</s></s>Medicine,en,Cancer</s>,en,10
1,t_30dd476279c8,c_909a428742d6,0,Medicine</s></s>Medicine,en,AIDS</s>,en,10
2,t_30dd476279c8,c_ee401e1ae04b,0,Medicine</s></s>Medicine,en,Education</s>,en,10
3,t_30dd476279c8,c_d9394a93b4bb,0,Medicine</s></s>Medicine,en,Diabetes</s>,es,10
4,t_30dd476279c8,c_bd164013582b,0,Medicine</s></s>Medicine,en,Cancer</s>,en,10
...,...,...,...,...,...,...,...,...
359495,t_0d5145117aa3,c_1f3039ae9038,0,B4.2.2.2 Use reading readiness and word identi...,en,Document 2</s>This Unit 2 Workbook contains wo...,en,256
359496,t_0d5145117aa3,c_1953d977aca9,0,B4.2.2.2 Use reading readiness and word identi...,en,Document 1</s>This Unit 4 Teacher Guide contai...,en,256
359497,t_0d5145117aa3,c_1565fbeb6b32,0,B4.2.2.2 Use reading readiness and word identi...,en,Document 3</s>This Unit 2 Teacher Guide contai...,en,256
359498,t_d5d3bd78d83c,c_b06cca00b5ea,0,গাণিতিক বাক্য এবং হিসাবের ধারাবাহিকতা</s></s>খ...,bn,পূর্ণ সংখ্যা বিশিষ্ট বহু ধাপের সমস্যা</s>যেকোন...,bn,256


In [11]:
# Write the prepared validation and training frames as CSV files under
# CFG.input_dir; index=False leaves the row index out of the files.
df_valid.to_csv(CFG.input_dir+"df_valid_for_exp006.csv", index=False)
df_train.to_csv(CFG.input_dir+"df_train_for_exp006.csv", index=False)