In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

In [8]:
# ===============================================================
#  DataLoading
# ===============================================================

def check_contains(row):
    """Check whether the content_id is included in the label. Set to 1 if included, 0 if not."""
    try: # row["content_ids"]が空でないとき
        ground_truth = set(row["content_ids"])
        pred = row["predictions"]
        if pred in ground_truth:
            return 1
        else:
            return 0
    except: # row["content_ids"]が空の時
        return 0
    

def determine(df_topics):
    """
    Determine if the extracted content is correct
    """
    df = df_topics[["topic_id", "predictions"]]
    df.loc[:, "predictions"] = df["predictions"].str.split()
    df = df.explode("predictions", ignore_index=True)
    df = pd.merge(df,
                    df_topics[["topic_id", "content_ids"]],
                    on="topic_id", how="left")
    df.loc[:, "content_ids"] = df["content_ids"].apply(lambda x: x.split())
    tqdm.pandas()
    df.loc[:, "target"] = df.progress_apply(check_contains, axis=1)
    df.drop("content_ids", axis=1, inplace=True)
    return df


def tokenize(cfg, text, text_pair):
    """encode to sort by token_length"""
    inputs = cfg.tokenizer.encode_plus(
        text,
        text_pair,
        return_tensors = None, 
        add_special_tokens = True, 
        # pad_to_max_length = True,
        max_length = cfg.max_len,
        truncation = True
    )["input_ids"]
    return inputs


def prepare_df(cfg, df_corr, df_topics, df_content, data_type):
    print(f"{data_type} loading...")
    if data_type == "train":
        path = cfg.input_dir+f"train_top_50_ver3.pkl"
    elif data_type == "validation":
        path = cfg.input_dir+f"validation_top_50_ver2.pkl"

    with open(path, "rb") as f:
        loaded_list = pickle.load(f)

    df = pd.DataFrame(
        [(query_id, corpus_id, score) for query_id, pairs in loaded_list.items() for corpus_id, score in pairs], 
        columns=['topic_id', 'predictions', 'score']
        )
    df_1 = df.copy(deep=False)
    df = df.groupby("topic_id")[["predictions"]].agg(list).reset_index()
    df["predictions"] = df["predictions"].apply(lambda x:" ".join(x))
    df = pd.merge(df, df_corr[["topic_id", "content_ids"]], on="topic_id", how="left")
    df = determine(df)
    
    df = pd.merge(df, 
                  df_topics[
                      ["topic_id", 
                       "topic_language", 
                       "topic_channel",
                       "topic_category",
                       "topic_level"]
                       ], 
                  on="topic_id", 
                  how="left")
    df["predictions"] = df["predictions"].str.split()
    df = df.explode("predictions", ignore_index=True)

    df = pd.merge(df,
                  df_1,
                  on=["topic_id", "predictions"], how="left")

    df = pd.merge(df, 
                  df_content[
                      ["id", 
                       "content_language",
                       "content_kind"]
                       ].rename(columns={"id":"predictions"}),
                  on="predictions", 
                  how="left")
    return df


def load_data(cfg):
    df_topics = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/topics.csv").fillna({"title":"", "description":""})
    df_content = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/content.csv").fillna({"title":" ", "description":"", "text":""})

    df_corr = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/correlations.csv")

    df_topics.rename(columns={
        "language":"topic_language",
        "channel":"topic_channel",
        "category":"topic_category",
        "level":"topic_level"
        }, inplace=True)
    
    df_content.rename(columns={
        "language":"content_language",
        "kind":"content_kind",
        }, inplace=True)

    df_valid = prepare_df(cfg, df_corr, df_topics, df_content, data_type="validation")

    print("valid: \n", df_valid["target"].value_counts())
    return  df_valid

In [9]:
class CFG:
    max_len = 256
    debug = False
    input_dir = "/content/drive/MyDrive/KAGGLE-LECR/last_data/1st/exp006/fold0/"

In [10]:
import pickle
from tqdm.auto import tqdm

In [11]:
df_valid = load_data(CFG)

validation loading...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


  0%|          | 0/359500 [00:00<?, ?it/s]

valid: 
 0    332563
1     26937
Name: target, dtype: int64


In [12]:
df_valid

Unnamed: 0,topic_id,predictions,target,topic_language,topic_channel,topic_category,topic_level,score,content_language,content_kind
0,t_0012a45fa09c,c_dde078b8ea7a,1,en,2ee29d,aligned,4,0.743472,en,exercise
1,t_0012a45fa09c,c_139907ad9cf0,0,en,2ee29d,aligned,4,0.669632,en,video
2,t_0012a45fa09c,c_05a5aed8ccec,0,en,2ee29d,aligned,4,0.613423,en,document
3,t_0012a45fa09c,c_26fd56fd6382,0,en,2ee29d,aligned,4,0.597120,en,video
4,t_0012a45fa09c,c_92153d6a566d,0,en,2ee29d,aligned,4,0.595151,en,exercise
...,...,...,...,...,...,...,...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,1,es,71fd51,supplemental,2,0.551677,es,html5
359496,t_fff9e5407d13,c_b894de7e8725,0,es,71fd51,supplemental,2,0.551287,pt,html5
359497,t_fff9e5407d13,c_f7e15fcee474,0,es,71fd51,supplemental,2,0.550654,pt,html5
359498,t_fff9e5407d13,c_4fee67386c4a,0,es,71fd51,supplemental,2,0.550607,es,html5


In [13]:
df_topics = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/topics.csv").fillna({"title":"", "description":""})
df_content = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/content.csv").fillna({"title":" ", "description":"", "text":""})

df_corr = pd.read_csv("/content/drive/MyDrive/KAGGLE-LECR/correlations.csv")

df_topics.rename(columns={
    "language":"topic_language",
    "channel":"topic_channel",
    "category":"topic_category",
    "level":"topic_level"
    }, inplace=True)

df_content.rename(columns={
    "language":"content_language",
    "kind":"content_kind",
    }, inplace=True)


In [14]:
data_type = "validation"
print(f"{data_type} loading...")
if data_type == "train":
    path = CFG.input_dir+f"train_top_50_ver3.pkl"
elif data_type == "validation":
    path = CFG.input_dir+f"validation_top_50_ver2.pkl"

with open(path, "rb") as f:
    loaded_list = pickle.load(f)

df = pd.DataFrame(
    [(query_id, corpus_id, score) for query_id, pairs in loaded_list.items() for corpus_id, score in pairs], 
    columns=['topic_id', 'predictions', 'score']
    )

df_1 = df.copy(deep=False)
df = df.groupby("topic_id")[["predictions", "score"]].agg(list).reset_index()
df["predictions"] = df["predictions"].apply(lambda x:" ".join(x))

validation loading...


In [15]:
df_1

Unnamed: 0,topic_id,predictions,score
0,t_0012a45fa09c,c_dde078b8ea7a,0.743472
1,t_0012a45fa09c,c_139907ad9cf0,0.669632
2,t_0012a45fa09c,c_05a5aed8ccec,0.613423
3,t_0012a45fa09c,c_26fd56fd6382,0.597120
4,t_0012a45fa09c,c_92153d6a566d,0.595151
...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,0.551677
359496,t_fff9e5407d13,c_b894de7e8725,0.551287
359497,t_fff9e5407d13,c_f7e15fcee474,0.550654
359498,t_fff9e5407d13,c_4fee67386c4a,0.550607


In [16]:
df = pd.merge(df, df_corr[["topic_id", "content_ids"]], on="topic_id", how="left")
df = determine(df)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


  0%|          | 0/359500 [00:00<?, ?it/s]

Unnamed: 0,topic_id,predictions,target
0,t_0012a45fa09c,c_dde078b8ea7a,1
1,t_0012a45fa09c,c_139907ad9cf0,0
2,t_0012a45fa09c,c_05a5aed8ccec,0
3,t_0012a45fa09c,c_26fd56fd6382,0
4,t_0012a45fa09c,c_92153d6a566d,0
...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,1
359496,t_fff9e5407d13,c_b894de7e8725,0
359497,t_fff9e5407d13,c_f7e15fcee474,0
359498,t_fff9e5407d13,c_4fee67386c4a,0


In [17]:
df = pd.merge(df, 
                df_topics[
                    ["topic_id", 
                    "topic_language", 
                    "topic_channel",
                    "topic_category",
                    "topic_level"]
                    ], 
                on="topic_id", 
                how="left")
df

Unnamed: 0,topic_id,predictions,target,topic_language,topic_channel,topic_category,topic_level
0,t_0012a45fa09c,c_dde078b8ea7a,1,en,2ee29d,aligned,4
1,t_0012a45fa09c,c_139907ad9cf0,0,en,2ee29d,aligned,4
2,t_0012a45fa09c,c_05a5aed8ccec,0,en,2ee29d,aligned,4
3,t_0012a45fa09c,c_26fd56fd6382,0,en,2ee29d,aligned,4
4,t_0012a45fa09c,c_92153d6a566d,0,en,2ee29d,aligned,4
...,...,...,...,...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,1,es,71fd51,supplemental,2
359496,t_fff9e5407d13,c_b894de7e8725,0,es,71fd51,supplemental,2
359497,t_fff9e5407d13,c_f7e15fcee474,0,es,71fd51,supplemental,2
359498,t_fff9e5407d13,c_4fee67386c4a,0,es,71fd51,supplemental,2


In [18]:
df = pd.merge(df, df_1, on=["topic_id", "predictions"], how="left")

In [19]:
df = pd.merge(df, 
                df_content[
                    ["id", 
                    "content_language",
                    "content_kind"]
                    ].rename(columns={"id":"predictions"}),
                on="predictions", 
                how="left")
df

Unnamed: 0,topic_id,predictions,target,topic_language,topic_channel,topic_category,topic_level,score,content_language,content_kind
0,t_0012a45fa09c,c_dde078b8ea7a,1,en,2ee29d,aligned,4,0.743472,en,exercise
1,t_0012a45fa09c,c_139907ad9cf0,0,en,2ee29d,aligned,4,0.669632,en,video
2,t_0012a45fa09c,c_05a5aed8ccec,0,en,2ee29d,aligned,4,0.613423,en,document
3,t_0012a45fa09c,c_26fd56fd6382,0,en,2ee29d,aligned,4,0.597120,en,video
4,t_0012a45fa09c,c_92153d6a566d,0,en,2ee29d,aligned,4,0.595151,en,exercise
...,...,...,...,...,...,...,...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,1,es,71fd51,supplemental,2,0.551677,es,html5
359496,t_fff9e5407d13,c_b894de7e8725,0,es,71fd51,supplemental,2,0.551287,pt,html5
359497,t_fff9e5407d13,c_f7e15fcee474,0,es,71fd51,supplemental,2,0.550654,pt,html5
359498,t_fff9e5407d13,c_4fee67386c4a,0,es,71fd51,supplemental,2,0.550607,es,html5


In [20]:
from sklearn.model_selection import StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
for i, (_, val) in enumerate(sgkf.split(X=df, y=df["target"], groups=df["topic_channel"])):
    df.loc[val, "fold"] = int(i)
print(df.groupby("fold")["target"].value_counts())

fold  target
0.0   0           9436
      1            664
1.0   0         203340
      1          18510
2.0   0          35270
      1           2180
3.0   0          49765
      1           2835
4.0   0          34752
      1           2748
Name: target, dtype: int64


In [21]:
from sklearn.model_selection import GroupKFold
gk = GroupKFold(n_splits=5)
for i, (_, val) in enumerate(gk.split(X=df, groups=df["topic_channel"])):
    df.loc[val, "fold"] = int(i)
print(df.groupby("fold")["target"].value_counts())

fold  target
0.0   0         165102
      1          16048
1.0   0          41934
      1           2416
2.0   0          42147
      1           2453
3.0   0          41533
      1           3267
4.0   0          41847
      1           2753
Name: target, dtype: int64


In [22]:
df.to_csv("/content/drive/MyDrive/KAGGLE-LECR/last_data/3rd/df_ver2.csv", index=False)

In [23]:
df

Unnamed: 0,topic_id,predictions,target,topic_language,topic_channel,topic_category,topic_level,score,content_language,content_kind,fold
0,t_0012a45fa09c,c_dde078b8ea7a,1,en,2ee29d,aligned,4,0.743472,en,exercise,0.0
1,t_0012a45fa09c,c_139907ad9cf0,0,en,2ee29d,aligned,4,0.669632,en,video,0.0
2,t_0012a45fa09c,c_05a5aed8ccec,0,en,2ee29d,aligned,4,0.613423,en,document,0.0
3,t_0012a45fa09c,c_26fd56fd6382,0,en,2ee29d,aligned,4,0.597120,en,video,0.0
4,t_0012a45fa09c,c_92153d6a566d,0,en,2ee29d,aligned,4,0.595151,en,exercise,0.0
...,...,...,...,...,...,...,...,...,...,...,...
359495,t_fff9e5407d13,c_026db653a269,1,es,71fd51,supplemental,2,0.551677,es,html5,2.0
359496,t_fff9e5407d13,c_b894de7e8725,0,es,71fd51,supplemental,2,0.551287,pt,html5,2.0
359497,t_fff9e5407d13,c_f7e15fcee474,0,es,71fd51,supplemental,2,0.550654,pt,html5,2.0
359498,t_fff9e5407d13,c_4fee67386c4a,0,es,71fd51,supplemental,2,0.550607,es,html5,2.0
