In [1]:
%env TOKENIZERS_PARALLELISM=false

env: TOKENIZERS_PARALLELISM=false


In [2]:
%%writefile get_topic_context.py
"""
 get_topic_context.py:
     Collect the titles of each topic's ancestors and join them into one context string.
     ref: 
         https://www.kaggle.com/code/jamiealexandre/tips-and-recommendations-from-hosts/notebook
     ex:
         [grandparent topic title] + "  " + [parent topic title] + "  " + [topic title]
"""

# ================================================================
#  Library
# ================================================================
import os
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm


# ================================================================
#  CFG
# ================================================================
def parse_args():
    """Read the command-line options of the topic-context script.

    Returns:
        argparse.Namespace: holds ``seed``, ``competition_dir``,
        ``output_dir`` and the boolean ``debug`` flag, parsed from ``sys.argv``.
    """
    # (flag, type, default) for every option that takes a value.
    valued_options = (
        ("--seed", int, 42),
        ("--competition_dir", str, "/kaggle/input/learning-equality-curriculum-recommendations/"),
        ("--output_dir", str, "/kaggle/working/get_topic_context/"),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, default_value in valued_options:
        parser.add_argument(flag, type=value_type, required=False, default=default_value)
    parser.add_argument("--debug", action="store_true", required=False)
    return parser.parse_args()

# Parse the options once at import time; the rest of the script reads CFG.
CFG = parse_args()
output_dir_missing = not os.path.exists(CFG.output_dir)
if output_dir_missing:
    os.makedirs(CFG.output_dir)
# Echo the resolved configuration, one "name: value" line per option.
for option_name, option_value in vars(CFG).items():
    print(f"{option_name}: {option_value}")

# ================================================================
#  Utils
# ================================================================
def seed_everything(cfg):
    """Export ``cfg.seed`` as PYTHONHASHSEED and seed NumPy's global RNG with it."""
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG)

# ================================================================
#  DataLoading
# ================================================================
# Missing title / description / parent values are replaced with empty strings.
topic_text_defaults = {"title": "", "description": "", "parent": ""}
df_topics = pd.read_csv(CFG.competition_dir+"topics.csv").fillna(topic_text_defaults)
df_topics = df_topics.rename(columns={"id":"topic_id"})

# Normal runs take the topic ids from sample_submission.csv; debug runs use a
# seeded random sample of 10,000 topic ids instead.
if not CFG.debug:
    sample_submission = pd.read_csv(CFG.competition_dir+"sample_submission.csv")
else:
    sample_submission = df_topics.copy(deep=False).sample(n=10_000, random_state=CFG.seed)[["topic_id"]]
topic_columns = ["topic_id", "channel", "title", "description"]
sample_submission = sample_submission.merge(df_topics[topic_columns], on="topic_id", how="left")


# ================================================================
#  get_context
# ================================================================
# Single-slot cache: (frame, title_by_id, parent_by_id) for the last frame used.
_TOPIC_LOOKUP_CACHE = {}


def _topic_lookup(topics):
    """Return ``(title_by_id, parent_by_id)`` dicts for ``topics``.

    The dicts are built once per frame object and reused on later calls, so the
    frame must not be modified between calls.
    """
    cached = _TOPIC_LOOKUP_CACHE.get("entry")
    if cached is not None and cached[0] is topics:
        return cached[1], cached[2]
    # Keep the first row per id, matching a ``values[0]`` lookup on the frame.
    unique_topics = topics.drop_duplicates("topic_id")
    ids = unique_topics["topic_id"].tolist()
    title_by_id = dict(zip(ids, unique_topics["title"].tolist()))
    parent_by_id = dict(zip(ids, unique_topics["parent"].tolist()))
    _TOPIC_LOOKUP_CACHE["entry"] = (topics, title_by_id, parent_by_id)
    return title_by_id, parent_by_id


def get_ancestors(topic_id, topics=None):
    """Join the titles on the path from the root topic down to ``topic_id``.

    Args:
        topic_id: id of the topic whose context is wanted.
        topics: optional DataFrame with ``topic_id``, ``title`` and ``parent``
            columns; defaults to the module-level ``df_topics``.

    Returns:
        str: titles ordered root first, ``topic_id`` last, joined by two spaces.

    Raises:
        IndexError: ``topic_id`` or one of its ancestors is not in the table.
        ValueError: the parent links form a cycle.
    """
    if topics is None:
        topics = df_topics
    title_by_id, parent_by_id = _topic_lookup(topics)

    titles = []
    visited = set()
    current_id = topic_id
    while True:
        if current_id not in title_by_id:
            raise IndexError(f"topic id not found in topics table: {current_id!r}")
        if current_id in visited:
            # Without this guard a cyclic hierarchy would never terminate.
            raise ValueError(f"cycle in topic hierarchy at: {current_id!r}")
        visited.add(current_id)
        titles.append(title_by_id[current_id])
        parent_id = parent_by_id[current_id]
        # "" marks a topic without parent; NaN is treated the same way in case
        # the frame was not NaN-filled.
        if parent_id == "" or pd.isna(parent_id):
            break
        current_id = parent_id
    titles.reverse()
    return "  ".join(titles)

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
topic_ids = sample_submission["topic_id"]
sample_submission["context"] = topic_ids.progress_apply(get_ancestors)
# Quick sanity output: null counts per column and the first two contexts.
null_counts = sample_submission.isnull().sum()
print(null_counts)
print(sample_submission["context"].values[:2])
output_path = CFG.output_dir+"topics.csv"
sample_submission.to_csv(output_path, index=False)

Writing get_topic_context.py


In [3]:
%%writefile 1st_stage_model.py
"""
1st_stage_model.py
    Retrieve candidate contents for each topic from the full content corpus.
"""
# ================================================================
#  Library
# ================================================================
import os
import gc
import pickle
import heapq
import random
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.simplefilter("ignore")

import torch 
from torch import Tensor

import transformers
from transformers import AutoTokenizer
transformers.logging.set_verbosity_error()

import sys
sys.path.append("/kaggle/input/sentence-transformers/")
from sentence_transformers import models, SentenceTransformer

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

# ================================================================
#  args
# ================================================================
def parse_args(argv=None):
    """Parse the 1st-stage options and prepare the output directory.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: the parsed options, with ``data_dir`` extended by
        ``"<filename>/"``.

    Side effects:
        Creates ``output_dir`` if it does not exist yet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, required=False, 
                        default=42)
    parser.add_argument("--competition_dir", type=str, required=False,
                        default="/kaggle/input/learning-equality-curriculum-recommendations/")
    parser.add_argument("--data_dir", type=str, required=False,
                        default="/kaggle/input/1st-stage-")
    parser.add_argument("--topic_dir", type=str, required=False,
                       default="/kaggle/working/get_topic_context/")
    parser.add_argument("--output_dir", type=str, required=False,
                        default="/kaggle/working/1st_stage_model/")
    parser.add_argument("--base_model", type=str, required=False,
                        default="sentence-transformers/all-mpnet-base-v2")
    parser.add_argument("--filename", type=str,required=False, choices=["exp004", "exp006", "exp007"],
                        default="exp006")
    parser.add_argument("--max_len", type=int, required=False,
                       default=128)
    parser.add_argument("--n_neighbors", type=int, required=False,
                       default=50)
    parser.add_argument("--corpus_chunk_size", type=int, required=False,
                       default=40_000)
    parser.add_argument("--batch_size", type=int, required=False,
                       default=96)
    parser.add_argument("--n_fold", type=int, required=False, nargs="*",
                       default=[0, 1, 2])
    parser.add_argument("--debug", action="store_true", required=False)
    args = parser.parse_args(argv)

    # The data directory is "<data_dir prefix><experiment name>/".
    args.data_dir = args.data_dir + f"{args.filename}/"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    return args


# ================================================================
#  Utils
# ================================================================
def seed_everything(cfg):
    """Seed Python, NumPy and torch (CPU and CUDA) RNGs from ``cfg.seed``.

    Also exports the seed as PYTHONHASHSEED and switches cuDNN to
    deterministic mode.
    """
    seed = cfg.seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    

def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Non-tensor inputs are converted with ``torch.tensor`` and 1-D inputs are
    treated as a single row.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])

    cited: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py
    """
    def _as_matrix(x):
        # Tensor conversion first, then promote a vector to a 1-row matrix.
        t = x if isinstance(x, torch.Tensor) else torch.tensor(x)
        return t.unsqueeze(0) if len(t.shape) == 1 else t

    a_unit = torch.nn.functional.normalize(_as_matrix(a), p=2, dim=1)
    b_unit = torch.nn.functional.normalize(_as_matrix(b), p=2, dim=1)
    return torch.mm(a_unit, b_unit.transpose(0, 1))


# ================================================================
#  Data Loading
# ================================================================
def data_load(cfg):
    """Load content and topic tables and build the sentences to embed.

    Reads ``content.csv`` from ``cfg.competition_dir`` and ``topics.csv`` from
    ``cfg.topic_dir``, loads a tokenizer into ``cfg.tokenizer`` and adds
    ``sentence`` / ``content_sentence`` / ``topic_sentence`` columns.

    Args:
        cfg: options namespace; reads ``competition_dir``, ``topic_dir``,
            ``debug`` and ``seed``; ``tokenizer`` is set on it.

    Returns:
        (df_content, df_topics) as pandas DataFrames.
    """
    print("========== Data Loading ==========")
    df_content = pd.read_csv(cfg.competition_dir+"content.csv").fillna({"title": "", 
                                                                        "description": "", 
                                                                        "text":""})
    if cfg.debug:
        df_content = df_content.sample(n=100, random_state=cfg.seed).reset_index(drop=True)
    
    df_content.rename(columns={"id":"content_id"}, inplace=True)
    # "context" is filled as well: an empty context string comes back from the
    # CSV as NaN, and NaN would turn the whole concatenated sentence into NaN.
    df_topics = pd.read_csv(cfg.topic_dir+"topics.csv").fillna({"title": "", 
                                                                  "description": "",
                                                                  "parent": "",
                                                                  "context": ""
                                                                 })
    # NOTE(review): the tokenizer path is hard-coded to the exp006 / fold0
    # model and does not follow cfg.data_dir or cfg.filename -- confirm intended.
    cfg.tokenizer = AutoTokenizer.from_pretrained(
        "/kaggle/input/1st-stage-exp006/fold0/sentence-transformers-all-mpnet-base-v2_fine-tuned", 
        use_fast=True)
            
    # Sentences are "<title><sep><description>" (plus "<sep><context>" for topics).
    df_content["sentence"] = df_content["title"] + cfg.tokenizer.sep_token + df_content["description"]
    df_topics["sentence"] = df_topics["title"] + cfg.tokenizer.sep_token +  df_topics["description"] +\
    cfg.tokenizer.sep_token + df_topics["context"]
    
    df_content["content_sentence"] = df_content["sentence"]
    df_topics["topic_sentence"] = df_topics["sentence"]    
    print("df_topics: ", df_topics.shape)
    print("df_content: ", df_content.shape)
    print("Input Sentence Example:")
    print("========== Topics ==========")
    print(df_topics["sentence"].values.tolist()[:2])
    print("========== Content ==========")
    print(df_content["sentence"].values.tolist()[:2])
    
    print(df_topics.isnull().sum())

    return df_content, df_topics


def prepare_valid(df_content: pd.DataFrame, df_topics: pd.DataFrame):
    """
    Build the query and corpus lookups used for retrieval.

    queries maps topic_id -> topic_sentence and corpus maps
    content_id -> content_sentence, e.g.

    queries = {'q1': 'What is machine learning?',
               'q2': 'How does deep learning work?'}
    corpus = {'d1': 'Machine learning is a method of data analysis.', 
              'd2': 'Deep learning is a subfield of machine learning.', 
              'd3': 'Neural networks are used in deep learning.'}
    """
    topic_sentences = df_topics.set_index('topic_id')['topic_sentence']
    content_sentences = df_content.set_index('content_id')['content_sentence']
    queries = topic_sentences.to_dict()
    corpus = content_sentences.to_dict()
    return queries, corpus


# ================================================================
#  Convert embeddings
# ================================================================
def FeatureExtractor(cfg):
    """Assemble a SentenceTransformer from ``cfg.model`` with mean pooling.

    The transformer module is created with ``max_seq_length=cfg.max_len`` and
    its word-embedding dimension sizes the pooling module.
    """
    transformer = models.Transformer(cfg.model, max_seq_length=cfg.max_len)
    embedding_dim = transformer.get_word_embedding_dimension()
    mean_pooling = models.Pooling(embedding_dim, pooling_mode='mean')
    return SentenceTransformer(modules=[transformer, mean_pooling])

def get_pair(cfg, queries: dict, corpus: dict, model, device):
    """
    Score every query against the corpus and keep the best matches per query.

    The corpus is encoded in chunks of ``cfg.corpus_chunk_size``; for each
    chunk the top cosine-similarity hits per query are merged into a per-query
    heap that holds at most ``cfg.n_neighbors`` entries. ``device`` is accepted
    but not used inside this function.

    Returns:
        (queries_ids, queries_result_list) where ``queries_result_list[i]`` is
        a list of ``{'corpus_id', 'score'}`` dicts for ``queries_ids[i]``, in
        heap order (not sorted by score).

    https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/evaluation/InformationRetrievalEvaluator.py
    """
    model.eval()
    queries_ids = list(queries.keys())
    query_texts = [queries[qid] for qid in queries_ids]

    corpus_ids = list(corpus.keys())
    corpus_texts = [corpus[cid] for cid in corpus_ids]

    query_embeddings = model.encode(query_texts,
                                    batch_size=cfg.batch_size,
                                    convert_to_tensor=True)

    # One heap per query; entries are (score, corpus_id) so the heap orders by score.
    heaps = [[] for _ in range(len(query_embeddings))]

    chunk_starts = range(0, len(corpus_texts), cfg.corpus_chunk_size)
    for chunk_start in tqdm(chunk_starts, desc="encode corpus & keep pairs"):
        chunk_end = min(chunk_start + cfg.corpus_chunk_size, len(corpus_texts))

        chunk_embeddings = model.encode(corpus_texts[chunk_start:chunk_end],
                                        show_progress_bar=False,
                                        batch_size=cfg.batch_size,
                                        convert_to_tensor=True)

        # Cosine similarity of every query against this chunk.
        pair_scores = cos_sim(query_embeddings, chunk_embeddings)

        # Best k columns per query within the chunk (k capped by the chunk size).
        k = min(cfg.n_neighbors, len(pair_scores[0]))
        top_values, top_indices = torch.topk(pair_scores, k, dim=1, largest=True, sorted=False)
        top_values = top_values.cpu().tolist()
        top_indices = top_indices.cpu().tolist()

        for heap, index_row, score_row in zip(heaps, top_indices, top_values):
            for local_idx, score in zip(index_row, score_row):
                entry = (score, corpus_ids[chunk_start + local_idx])
                if len(heap) < cfg.n_neighbors:
                    heapq.heappush(heap, entry)
                else:
                    # Heap is full: push the new entry and drop the smallest one.
                    heapq.heappushpop(heap, entry)

    queries_result_list = [
        [{'corpus_id': corpus_id, 'score': score} for score, corpus_id in heap]
        for heap in heaps
    ]
    return queries_ids, queries_result_list


# ================================================================
#  save_pair
# ================================================================
def save_pair(cfg, fold, queries_ids, queries_result_list):
    """Pickle the retrieved candidates of one fold.

    Writes ``{query_id: [(corpus_id, score), ...]}`` with each list sorted by
    descending score and cut to ``cfg.n_neighbors`` entries, to
    ``<output_dir><filename>_fold<fold>_top<n_neighbors>.pkl``.
    """
    pair = {}
    for query_itr, hits in enumerate(queries_result_list):
        query_id = queries_ids[query_itr]
        ranked_hits = sorted(hits, key=lambda hit: hit['score'], reverse=True)
        pair[query_id] = [(hit['corpus_id'], hit['score'])
                          for hit in ranked_hits[:cfg.n_neighbors]]

    path = cfg.output_dir+f"{cfg.filename}_fold{fold}_top{cfg.n_neighbors}.pkl"
    with open(path, "wb") as f:
        pickle.dump(pair, f)
    print(f"{path} saved!")
    
# ===============================================================
#  main
# ===============================================================
def main(cfg):
    """Run candidate retrieval once per fold listed in ``cfg.n_fold``.

    For every fold the fine-tuned model directory is stored in ``cfg.model``,
    the model is built and moved to ``device``, pairs are retrieved and
    saved, and the model is released before the next fold.
    """
    seed_everything(cfg)
    df_content, df_topics = data_load(cfg)
    queries, corpus = prepare_valid(df_content, df_topics)

    model_dir_name = cfg.base_model.replace("/", "-") + "_fine-tuned/"
    for fold in cfg.n_fold:
        # Model directory for this fold.
        cfg.model = cfg.data_dir + f"fold{fold}/" + model_dir_name
        print(cfg.model)
        model = FeatureExtractor(cfg)
        model.to(device)
        # Retrieve and persist the candidates.
        queries_ids, queries_result_list = get_pair(cfg, queries, corpus, model, device)
        save_pair(cfg, fold, queries_ids, queries_result_list)
        # Drop the references before collecting so memory can be released.
        del model, queries_ids, queries_result_list
        torch.cuda.empty_cache()
        gc.collect()
        print('\033[32m'+f"{cfg.model} finish."+'\033[0m')

# ===============================================================
#  Execute
# ===============================================================
if __name__ == "__main__":
    args = parse_args()
    # Echo the resolved options, one "name: value" line each, before running.
    for option_name, option_value in vars(args).items():
        print(f"{option_name}: {option_value}")
    main(args)

Writing 1st_stage_model.py


In [4]:
%%writefile 2nd_stage_model.py

# ===============================================================
#  Library
# ===============================================================
import os
import gc
import math
import time
import json
import pickle
import random
import requests
import argparse
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.simplefilter("ignore")

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader


import transformers
transformers.logging.set_verbosity_error()
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"device: {device}")

# ===============================================================
#  args
# ===============================================================
def parse_args(argv=None):
    """Parse the 2nd-stage options and prepare the output directory.

    Args:
        argv: optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: the parsed options, with ``data_dir`` extended by
        ``"<filename>/"``.

    Side effects:
        Creates ``output_dir`` if it does not exist yet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, required=False, 
                        default=42)
    parser.add_argument("--competition_dir", type=str, required=False,
                        default="/kaggle/input/learning-equality-curriculum-recommendations/")
    parser.add_argument("--data_dir", type=str, required=False,
                        default="/kaggle/input/2nd-stage-")
    parser.add_argument("--topic_dir", type=str, required=False,
                       default="/kaggle/working/1st_stage_model/")
    parser.add_argument("--output_dir", type=str, required=False,
                        default="/kaggle/working/2nd_stage_model/")
    parser.add_argument("--filename", type=str, required=False, 
                        default="exp004")    
    parser.add_argument("--max_len", type=int, required=False, 
                        default=256)    
    parser.add_argument("--base_model", type=str, required=False, 
                        default="sentence-transformers/all-MiniLM-L6-v2")  
    parser.add_argument("--batch_size", type=int, required=False, 
                        default=64)    
    parser.add_argument("--target_cols", type=str, required=False, nargs="*",
                        default=["target"])    
    parser.add_argument("--n_fold", type=int, required=False, nargs="*",
                        default=[0])    
    parser.add_argument("--best_epoch", type=int, required=False,
                        default=3)    
    args = parser.parse_args(argv)
    # The data directory is "<data_dir prefix><experiment name>/".
    args.data_dir = args.data_dir + f"{args.filename}/"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)
    return args
    
# ===============================================================
#  Utils
# ===============================================================
def seed_everything(cfg):
    """Make runs repeatable: seed every RNG in use with ``cfg.seed``.

    Covers Python's ``random``, NumPy and torch (CPU and CUDA), exports
    PYTHONHASHSEED and turns on deterministic cuDNN.
    """
    seed_value = cfg.seed
    random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    np.random.seed(seed_value)
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed):
        seed_torch(seed_value)
    torch.backends.cudnn.deterministic = True
    
# ===============================================================
#  tokenizer
# ===============================================================
def tokenizer(cfg):
    """Load the tokenizer saved under ``<cfg.data_dir>fold0/tokenizer``.

    The tokenizer is stored on ``cfg.tokenizer``; nothing is returned.
    """
    # `use_fast` is the documented from_pretrained switch for the fast
    # tokenizer; `is_fast` is not an option of from_pretrained.
    cfg.tokenizer = AutoTokenizer.from_pretrained(
        f'{cfg.data_dir}fold0/tokenizer',
        use_fast=True)
    return 

# ===============================================================
#  DataLoading
# ===============================================================
def prepare_df(cfg, df_topics, df_content, pair_filename="exp006_fold0_top50.pkl"):
    """Build the (topic, candidate content) table scored by the 2nd stage.

    Args:
        cfg: options namespace; ``topic_dir`` is the directory holding the
            1st-stage pickle.
        df_topics: frame with ``topic_id``, ``topic_sentence``, ``topic_length``.
        df_content: frame with ``id``, ``content_sentence``, ``content_length``.
        pair_filename: name of the 1st-stage pickle inside ``cfg.topic_dir``;
            it maps topic id -> list of ``(content_id, score)``.

    Returns:
        DataFrame with one row per candidate pair and the columns ``topic_id``,
        ``predictions`` (the content id), ``topic_sentence``, ``topic_length``,
        ``content_sentence``, ``content_length`` and ``length``, sorted by
        ascending ``length``.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only use it on
    # pickles produced by this pipeline.
    path = cfg.topic_dir + pair_filename
    with open(path, "rb") as f:
        loaded_list = pickle.load(f)
    df = pd.DataFrame(
        [(query_id, corpus_id) for query_id, pairs in loaded_list.items() for corpus_id, _score in pairs],
        columns=['topic_id', 'predictions']
        )
    # One row per (topic, candidate), grouped by topic id with the candidate
    # order inside each topic preserved. The stable sort replaces the former
    # join -> row-wise apply -> split -> explode round trip.
    df = df.sort_values(by="topic_id", kind="mergesort", ignore_index=True)

    df = pd.merge(df, df_topics[["topic_id", "topic_sentence", "topic_length"]], on="topic_id", how="left")
    df = pd.merge(df, df_content[["id", "content_sentence", "content_length"]].rename(columns={"id":"predictions"}),
                  on="predictions", how="left")

    # Order rows by combined token count (topic + content), shortest first.
    df["length"] = df["topic_length"] + df["content_length"]
    df.sort_values(by="length", ascending=True, ignore_index=True, inplace=True)

    return df


def load_data(cfg):
    """Load topics and contents, build their sentences and the pair table.

    Reads the topic table written by the topic-context step and
    ``content.csv`` from ``cfg.competition_dir``, builds the sentence columns,
    measures their token counts with ``cfg.tokenizer`` and hands both frames
    to ``prepare_df``.

    Returns:
        The DataFrame produced by ``prepare_df``.
    """
    # "context" is filled as well: an empty context string comes back from the
    # CSV as NaN, which would make the whole topic sentence NaN.
    # NOTE(review): this path is hard-coded rather than taken from cfg.
    df_topics = pd.read_csv("/kaggle/working/get_topic_context/topics.csv").fillna(
        {"title":"", "description":"", "context":""})
    df_content = pd.read_csv(cfg.competition_dir+"content.csv").fillna(
        {"title":" ", "description":"", "text":""})

    # content sentence
    df_content["content_sentence"] = df_content["title"] + cfg.tokenizer.sep_token + df_content["description"]
    print(df_content.isnull().sum())

    # topic sentence
    df_topics["topic_sentence"] = df_topics["title"] + cfg.tokenizer.sep_token +  df_topics["description"] +\
    cfg.tokenizer.sep_token + df_topics["context"]

    # " >> " is a literal separator, so match it as plain text.
    df_topics["topic_sentence"] = df_topics["topic_sentence"].str.replace(" >> ",  " ", regex=False)
    
    # token count of every sentence
    df_content['content_length'] = [len(cfg.tokenizer(text)['input_ids']) \
                              for text in tqdm(df_content['content_sentence'].values, desc="encode content_sentence")]
    df_topics['topic_length'] = [len(cfg.tokenizer(text)['input_ids']) \
                             for text in tqdm(df_topics['topic_sentence'].values, desc="encode topic_sentence")]
    
    df = prepare_df(cfg, df_topics, df_content)
    print("df.shape: ", df.shape)
    print(df.isnull().sum())
    return df


# ===============================================================
#  Dataset
# ===============================================================
def prepare_input(cfg, text, text_pair):
    """Tokenize a (text, text_pair) pair into fixed-length long tensors.

    Args:
        cfg: options namespace providing ``tokenizer`` and ``max_len``.
        text: first sequence.
        text_pair: second sequence.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor, padded / truncated to ``cfg.max_len``.
    """
    inputs = cfg.tokenizer.encode_plus(
        text, 
        text_pair,
        return_tensors = None, 
        add_special_tokens = True, 
        max_length = cfg.max_len,
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        padding = "max_length",
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

class TestDataset(Dataset):
    """Inference dataset over (topic sentence, content sentence) pairs.

    Item ``i`` is the tokenized pair built by ``prepare_input`` from row ``i``
    of the ``topic_sentence`` and ``content_sentence`` columns.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Keep plain arrays so items are looked up by position.
        self.content_sentence = df["content_sentence"].values
        self.topic_sentence= df["topic_sentence"].values

    def __len__(self):
        return len(self.topic_sentence)

    def __getitem__(self, item):
        topic_text = self.topic_sentence[item]
        content_text = self.content_sentence[item]
        return prepare_input(self.cfg, topic_text, text_pair=content_text)

def collate(inputs):
    """Trim every tensor of the batch to the longest attended sequence.

    The cut-off is the largest per-row sum of ``attention_mask``; the mapping
    is modified in place and also returned.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :mask_len]
    return inputs


# ===============================================================
#  Model
# ===============================================================
class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it zeroes masked tokens.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = torch.sum(last_hidden_state * mask, 1)
        # Clamp the token count so a fully masked row does not divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / token_count


class CustomModel(nn.Module):
    """Transformer backbone + mean pooling + linear head.

    Args:
        cfg: options namespace; reads ``model`` (only when the config or the
            pretrained weights are fetched), ``base_model`` and ``target_cols``.
        config_path: path to a saved config object loaded with ``torch.load``;
            when ``None`` the config is fetched from ``cfg.model`` and its
            dropout settings are set to zero.
        pretrained: load pretrained weights from ``cfg.model`` when True,
            otherwise build the backbone from the config only.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Switch off dropout in the backbone.
            self.config.hidden_dropout = 0
            self.config.hidden_dropout_prob = 0
            self.config.attention_dropout = 0
            self.config.attention_probs_dropout_prob = 0
        else:
            # NOTE: torch.load unpickles the file; only load trusted configs.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        try:
            self.model.gradient_checkpointing_enable()
        except Exception:
            # Best effort: the model is used without gradient checkpointing.
            print(f'{cfg.base_model} does not support gradient checkpoint.')
        # `pooler` is not used by forward(); it is kept so the attribute stays available.
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, len(cfg.target_cols))
        self._init_weights(self.fc)
        self.pooling = MeanPooling()

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # The padding row is reset to zeros after the random init.
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
    def forward(self, inputs):
        """Return the head output for a batch of tokenizer tensors.

        ``inputs`` is passed to the backbone as keyword arguments and must
        contain ``attention_mask``.
        """
        transformer_out = self.model(**inputs)
        last_hidden_state = transformer_out.last_hidden_state
        feature = self.pooling(last_hidden_state, inputs['attention_mask'])
        output = self.fc(feature)
        return output
    
    
# ===============================================================
#  _loop_fn
# ===============================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over ``test_loader`` without gradients.

    Each batch is trimmed by ``collate`` and moved to ``device``; the outputs
    are gathered on the CPU.

    Returns:
        numpy array with the per-batch outputs concatenated in loader order.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for inputs in tqdm(test_loader, total=len(test_loader)):
        inputs = collate(inputs)
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        batch_outputs.append(y_preds.to('cpu').numpy())
    return np.concatenate(batch_outputs)


# ===============================================================
#  loop_fn
# ===============================================================
def inference_loop(cfg, df, device):
    """Score every row of ``df`` with each fold model and write the results.

    For each fold in ``cfg.n_fold`` the saved config and the
    ``epoch<best_epoch>`` checkpoint are loaded from ``<data_dir>fold<fold>/``,
    predictions are stored in ``df["target"]`` and the frame is written to
    ``<filename>_fold<fold>_submission.csv`` in the current working directory.

    Args:
        cfg: options namespace; reads ``batch_size``, ``n_fold``, ``data_dir``,
            ``base_model``, ``best_epoch``, ``filename`` and the fields used by
            ``CustomModel`` / ``TestDataset``.
        df: pair table with ``topic_sentence`` and ``content_sentence`` columns.
        device: torch device used for inference.
    """
    df_dataset = TestDataset(cfg, df)
    df_loader = DataLoader(df_dataset,
                          batch_size=cfg.batch_size,
                          shuffle=False,
                          num_workers=0,
                          pin_memory=True,
                          drop_last=False)
    
    for fold in cfg.n_fold:
        # Build from the `cfg` argument; the previous code read a module-level
        # `args`, which only exists when the file is run as a script.
        model = CustomModel(cfg, 
                            config_path=f'{cfg.data_dir}fold{fold}/'+'config.pth',
                            pretrained=False)
        state = torch.load(
            f'{cfg.data_dir}fold{fold}/'+f"{cfg.base_model.replace('/', '-')}_fold{fold}_epoch{cfg.best_epoch}_best.pth",
                        map_location=torch.device('cpu'))   
        model.load_state_dict(state['model'])
        prediction = inference_fn(df_loader, model, device)
        torch.cuda.empty_cache()
        df["target"] = prediction
        del model, state, prediction; gc.collect()
        # NOTE(review): written relative to the working directory, not to
        # cfg.output_dir -- confirm that downstream steps read it from there.
        df.to_csv(f'{cfg.filename}_fold{fold}_submission.csv', index=False)
        print('\033[32m'+f"{cfg.filename} fold{fold} finish."+'\033[0m')

    del df, df_dataset, df_loader
    gc.collect()
    torch.cuda.empty_cache()
    

# ===============================================================
#  main
# ===============================================================
def main(cfg):
    """Seed the run, load the tokenizer and pair table, then run inference."""
    seed_everything(cfg)
    tokenizer(cfg)
    pair_df = load_data(cfg)
    inference_loop(cfg, pair_df, device)
                
# ===============================================================
#  Execution
# ===============================================================
if __name__ == "__main__":
    args = parse_args()
    # Echo the resolved options, one "name: value" line each, before running.
    for option_name, option_value in vars(args).items():
        print(f"{option_name}: {option_value}")
    main(args)

Writing 2nd_stage_model.py


In [5]:
%%writefile 3rd_stage_model.py

# ================================================================
#  Library
# ================================================================
import os
import pickle
import random
import argparse
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from collections import OrderedDict
import warnings
warnings.simplefilter('ignore')

from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error as mse

import torch

import lightgbm as lgb

# ================================================================
#  args
# ================================================================
def parse_args():
    """Read the command-line options of the 3rd-stage script.

    Returns:
        argparse.Namespace: holds ``seed``, the five directory options,
        ``filenames`` (list of strings), ``threshold`` and ``add_top``,
        parsed from ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add("--seed", type=int, required=False, default=42)
    # Directory options, in the order they appear on the namespace.
    directory_defaults = (
        ("--input_dir", "/kaggle/working/"),
        ("--output_dir", "/kaggle/working/"),
        ("--pair_dir", "/kaggle/working/1st_stage_model/"),
        ("--competition_dir", "/kaggle/input/learning-equality-curriculum-recommendations/"),
        ("--data_dir", "/kaggle/input/3rd-stage-model/"),
    )
    for flag, default_dir in directory_defaults:
        add(flag, type=str, required=False, default=default_dir)
    add("--filenames", type=str, required=False, nargs="*", default=[])
    add("--threshold", type=float, required=False, default=0)
    add("--add_top", type=int, required=False, default=0)
    return parser.parse_args()

# ================================================================
#  Utils
# ================================================================
def seed_everything(cfg):
    """Seed ``random``, NumPy and torch (CPU + CUDA) with ``cfg.seed``.

    PYTHONHASHSEED is exported as well and cuDNN is set to deterministic mode.
    """
    seed = cfg.seed
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = f"{seed}"
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
    
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
# ================================================================
#  Data_Loading
# ================================================================
def load_data(cfg):    
    """Assemble the 3rd-stage table from the retrieved pairs and 2nd-stage outputs.

    Starts from the pickled (topic, content, score) pairs in ``cfg.pair_dir``,
    left-joins one ``valid_pred_ver<i>`` column per entry of ``cfg.filenames``
    (read from ``<input_dir><filename>_fold0_submission.csv``) and then the
    topic and content metadata from ``cfg.competition_dir``.

    Returns:
        DataFrame keyed by ``topic_id`` / ``content_ids``.
    """
    # Base pairs: one row per (topic, candidate content) with its retrieval score.
    pair_path = cfg.pair_dir+f"exp006_fold0_top50.pkl"
    with open(pair_path, "rb") as f:
        pairs_by_topic = pickle.load(f)
    pair_rows = []
    for query_id, pairs in pairs_by_topic.items():
        for corpus_id, score in pairs:
            pair_rows.append((query_id, corpus_id, score))
    df = pd.DataFrame(pair_rows, columns=['topic_id', 'content_ids', 'score'])

    # One prediction column per 2nd-stage experiment.
    for i, filename in enumerate(cfg.filenames):
        pred_col = f"valid_pred_ver{i}"
        sub_df = pd.read_csv(cfg.input_dir+f"{filename}_fold0_submission.csv")
        sub_df = sub_df.rename(columns={"predictions":"content_ids", "target":pred_col})
        df = df.merge(sub_df[["topic_id", "content_ids", pred_col]],
                      on=["topic_id", "content_ids"], how="left")

    # Topic metadata.
    topic_renames = {"id":"topic_id",
                     "language":"topic_language",
                     "channel":"topic_channel",
                     "category":"topic_category",
                     "level":"topic_level"}
    df_topics = pd.read_csv(cfg.competition_dir+"topics.csv").rename(columns=topic_renames)
    topic_columns = ["topic_id", "topic_language", "topic_channel","topic_category","topic_level"]
    df = df.merge(df_topics[topic_columns], on="topic_id", how="left")

    # Content metadata.
    content_renames = {"language":"content_language",
                       "kind":"content_kind",
                       "id":"content_ids"}
    df_content = pd.read_csv(cfg.competition_dir+"content.csv").rename(columns=content_renames)
    content_columns = ["content_ids", "content_language","content_kind"]
    df = df.merge(df_content[content_columns], on="content_ids", how="left")
    return df


# ================================================================
#  Feature_Engineering
# ================================================================
def do_fe(cfg, df):
    """Append the model input features to the candidate frame.

    Args:
        cfg: Config object. ``use_features`` and ``cat_features`` are
            assigned on it as a side effect.
        df: Candidate (topic, content) rows. Must contain ``topic_id``,
            ``valid_pred_ver0`` .. ``valid_pred_ver3``, ``topic_category``,
            ``content_kind``, ``topic_language`` and ``content_language``.

    Returns:
        A DataFrame holding the input rows plus the engineered columns.
    """
    # Per-topic summary statistics of each prediction column.
    for ver in range(4):
        suffix = f"_per_topic_ver{ver}"
        topic_stats = df.groupby("topic_id")[f"valid_pred_ver{ver}"].agg(["mean", "min", "max", "std"])
        topic_stats = topic_stats.rename(columns=lambda stat: f"{stat}{suffix}").reset_index()
        topic_stats[f"range{suffix}"] = topic_stats[f"max{suffix}"] - topic_stats[f"min{suffix}"]
        df = df.merge(topic_stats, on="topic_id", how="left")

    # Integer codes for category / kind; values missing from a mapping become NaN.
    category_codes = {"aligned": 0, "supplemental": 1}
    kind_codes = {"document": 0, "video": 1, "exercise": 2, "audio": 3, "html5": 4}
    df["topic_category_"] = df["topic_category"].map(category_codes)
    df["content_kind_"] = df["content_kind"].map(kind_codes)

    # Language indicator flags (topic first, then content; "en" before "es").
    language_flags = []
    for side in ("topic", "content"):
        for lang in ("en", "es"):
            flag = f"{side}_language_is_{lang}"
            df[flag] = np.where(df[f"{side}_language"] == lang, 1, 0)
            language_flags.append(flag)
    df["language_is_the_same"] = np.where(df["topic_language"] == df["content_language"], 1, 0)

    # Feature order matters: lgb_predict passes these columns positionally.
    stat_names = ("mean", "min", "max", "std", "range")
    cfg.use_features = (
        ["topic_level", "score"]
        + [f"valid_pred_ver{ver}" for ver in range(4)]
        + [f"{stat}_per_topic_ver{ver}" for ver in range(4) for stat in stat_names]
        + ["topic_category_", "content_kind_"]
        + language_flags
        + ["language_is_the_same"]
    )
    cfg.cat_features = (
        ["topic_category_", "content_kind_"]
        + language_flags
        + ["topic_level", "language_is_the_same"]
    )
    return df


# ================================================================
#  predict
# ================================================================
def lgb_predict(cfg, df):
    """Score every row with the three pickled fold models and blend by median.

    Args:
        cfg: Config providing ``data_dir`` (prefix of the model files) and
            ``use_features`` (ordered feature column names).
        df: Feature frame; receives a new ``sigmoid`` column in place.

    Returns:
        The same ``df`` object with the blended prediction in ``sigmoid``.
    """
    fold_outputs = []
    for fold_idx in tqdm(range(3)):
        # NOTE: pickle.load can execute arbitrary code -- only point
        # cfg.data_dir at trusted model files.
        with open(cfg.data_dir + f"lgb_fold{fold_idx}_model.pkl", 'rb') as model_file:
            fold_model = pickle.load(model_file)
        # Features are passed as a bare array, so column order must match cfg.use_features.
        fold_outputs.append(fold_model.predict(df[cfg.use_features].values))
    # Element-wise median across the fold predictions.
    blended = np.median(fold_outputs, axis=0)
    df["sigmoid"] = blended
    print(blended)
    return df

# ================================================================
#  get_submission
# ================================================================
def _join_ids_per_topic(frame):
    """Collapse ``frame`` to one row per ``topic_id`` with its ``content_ids`` space-joined."""
    joined = frame.groupby("topic_id")["content_ids"].agg(list).reset_index()
    joined["content_ids"] = joined["content_ids"].apply(" ".join)
    return joined


def get_submission(cfg, df):
    """Turn per-pair probabilities into ``submission.csv``.

    Every content whose ``sigmoid`` exceeds ``cfg.threshold`` is kept. Topics
    with no content above the threshold fall back to their ``cfg.add_top``
    highest-probability contents, so each topic present in ``df`` gets a row.

    Args:
        cfg: Config providing ``threshold`` (float) and ``add_top`` (int).
        df: Frame with ``topic_id``, ``content_ids`` and ``sigmoid`` columns.

    Returns:
        None. Writes ``submission.csv`` (columns ``topic_id``, ``content_ids``)
        to the current working directory and prints a confirmation line.
    """
    # Highest-probability contents for each topic (the fallback pool).
    # A sort followed by groupby().head() replaces the former
    # groupby().apply(sort_values().head()): that form needed the grouping
    # column inside the applied frame, which pandas deprecated in 2.2 and
    # no longer provides in newer releases. mergesort is stable, so ties keep
    # their original row order.
    ranked = df.sort_values("sigmoid", ascending=False, kind="mergesort")
    top_df = _join_ids_per_topic(ranked.groupby("topic_id", sort=False).head(cfg.add_top))

    # Contents whose probability is above the threshold.
    selected = _join_ids_per_topic(df[df["sigmoid"] > cfg.threshold])

    # Make sure every topic is assigned at least its top contents.
    least_df = top_df[~top_df["topic_id"].isin(selected["topic_id"])]
    submission = pd.concat([selected, least_df], ignore_index=True)

    # save
    submission.to_csv("submission.csv", index=False)
    print('\033[32m'+"Save submission.csv"+'\033[0m')
    
# ================================================================
#  main
# ================================================================
def main(cfg):
    """Run the 3rd-stage pipeline: seed, load, build features, predict, write submission."""
    seed_everything(cfg)
    candidates = load_data(cfg)
    featured = do_fe(cfg, candidates)
    scored = lgb_predict(cfg, featured)
    get_submission(cfg, scored)
    
# ================================================================
#  execute
# ================================================================
if __name__ == "__main__":
    # Echo every parsed option so the run log records its configuration,
    # then hand the options to the pipeline.
    args = parse_args()
    for option, value in vars(args).items():
        print(f"{option}: {value}")
    main(args)

Writing 3rd_stage_model.py


In [6]:
# Build the topic context strings (ancestor titles joined onto each topic title); the 1st stage reads them via its topic_dir.
!python get_topic_context.py

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
output_dir: /kaggle/working/get_topic_context/
debug: False
100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 28.09it/s]
topic_id       0
content_ids    0
channel        0
title          0
description    0
context        0
dtype: int64
['Khan Academy (български език)  Наука  Физика  Открития и проекти  Откриването на резисторите'
 'Khan Academy (Português (Brasil))  Matemática por ano (Alinhada à BNCC)  9º Ano  Álgebra: funções  Entradas e saídas de uma função']


In [7]:
# 1st stage: run with 50 neighbours per topic on fold 0. The 3rd stage loads exp006_fold0_top50.pkl from this
# stage's output directory, so these options presumably have to stay in sync with that file name.
!python 1st_stage_model.py\
--n_neighbors 50\
--n_fold 0

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
device: cuda
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/1st-stage-exp006/
topic_dir: /kaggle/working/get_topic_context/
output_dir: /kaggle/working/1st_stage_model/
base_model: sentence-transformers/all-mpnet-base-v2
filename: exp006
max_len: 128
n_neighbors: 50
corpus_chunk_size: 40000
batch_size: 96
n_fold: [0]
debug: False
df_topics:  (5, 8)
df_content:  (154047, 10)
Input Sentence Example:
['Откриването на резисторите</s>Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервателен уред. </s>Khan Academy (български език)  Наука  Физика  Открития и проекти  Откриването на резисторите', 'Entradas e saídas de uma função</s>Entenda um pouco mais sobre funções.</s>Khan Academy (Português (Brasil))  Matemática por ano (Alinhada à BNCC)  9º Ano  Ál

In [8]:
# 2nd stage, run 1 of 4 (exp004, all-MiniLM-L6-v2): the 3rd stage reads its output as feature valid_pred_ver0.
!python 2nd_stage_model.py\
--filename exp004\
--base_model sentence-transformers/all-MiniLM-L6-v2\
--best_epoch 3

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
device: cuda
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/2nd-stage-exp004/
topic_dir: /kaggle/working/1st_stage_model/
output_dir: /kaggle/working/2nd_stage_model/
filename: exp004
max_len: 256
base_model: sentence-transformers/all-MiniLM-L6-v2
batch_size: 64
target_cols: ['target']
n_fold: [0]
best_epoch: 3
id                      0
title                   0
description             0
kind                    0
text                    0
language                0
copyright_holder    82226
license             80012
content_sentence        0
dtype: int64
encode content_sentence: 100%|████████| 154047/154047 [00:28<00:00, 5419.89it/s]
encode topic_sentence: 100%|████████████████████| 5/5 [00:00<00:00, 1230.29it/s]
df.shape:  (250, 7)
topic_id            0
predictions         0
topic_sentence      0
t

In [9]:
# 2nd stage, run 2 of 4 (exp006, all-mpnet-base-v2): the 3rd stage reads its output as feature valid_pred_ver1.
!python 2nd_stage_model.py\
--filename exp006\
--base_model sentence-transformers/all-mpnet-base-v2\
--best_epoch 3

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
device: cuda
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/2nd-stage-exp006/
topic_dir: /kaggle/working/1st_stage_model/
output_dir: /kaggle/working/2nd_stage_model/
filename: exp006
max_len: 256
base_model: sentence-transformers/all-mpnet-base-v2
batch_size: 64
target_cols: ['target']
n_fold: [0]
best_epoch: 3
id                      0
title                   0
description             0
kind                    0
text                    0
language                0
copyright_holder    82226
license             80012
content_sentence        0
dtype: int64
encode content_sentence: 100%|████████| 154047/154047 [00:27<00:00, 5689.53it/s]
encode topic_sentence: 100%|████████████████████| 5/5 [00:00<00:00, 1197.96it/s]
df.shape:  (250, 7)
topic_id            0
predictions         0
topic_sentence      0


In [10]:
# 2nd stage, run 3 of 4 (exp007, xlm-roberta-base): the 3rd stage reads its output as feature valid_pred_ver2.
!python 2nd_stage_model.py\
--filename exp007\
--base_model xlm-roberta-base\
--best_epoch 3

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
device: cuda
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/2nd-stage-exp007/
topic_dir: /kaggle/working/1st_stage_model/
output_dir: /kaggle/working/2nd_stage_model/
filename: exp007
max_len: 256
base_model: xlm-roberta-base
batch_size: 64
target_cols: ['target']
n_fold: [0]
best_epoch: 3
id                      0
title                   0
description             0
kind                    0
text                    0
language                0
copyright_holder    82226
license             80012
content_sentence        0
dtype: int64
encode content_sentence: 100%|████████| 154047/154047 [00:28<00:00, 5343.36it/s]
encode topic_sentence: 100%|████████████████████| 5/5 [00:00<00:00, 1147.36it/s]
df.shape:  (250, 7)
topic_id            0
predictions         0
topic_sentence      0
topic_length        0


In [11]:
# 2nd stage, run 4 of 4 (exp008, paraphrase-multilingual-mpnet-base-v2): the 3rd stage reads its output as feature valid_pred_ver3.
!python 2nd_stage_model.py\
--filename exp008\
--base_model sentence-transformers/paraphrase-multilingual-mpnet-base-v2\
--best_epoch 3

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
device: cuda
seed: 42
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/2nd-stage-exp008/
topic_dir: /kaggle/working/1st_stage_model/
output_dir: /kaggle/working/2nd_stage_model/
filename: exp008
max_len: 256
base_model: sentence-transformers/paraphrase-multilingual-mpnet-base-v2
batch_size: 64
target_cols: ['target']
n_fold: [0]
best_epoch: 3
id                      0
title                   0
description             0
kind                    0
text                    0
language                0
copyright_holder    82226
license             80012
content_sentence        0
dtype: int64
encode content_sentence: 100%|████████| 154047/154047 [00:28<00:00, 5487.73it/s]
encode topic_sentence: 100%|████████████████████| 5/5 [00:00<00:00, 1095.40it/s]
df.shape:  (250, 7)
topic_id            0
predictions         0
top

In [12]:
# 3rd stage: combine the four 2nd-stage outputs (the order of --filenames fixes the valid_pred_ver0..3 feature index),
# keep contents scoring above 0.1, and fall back to the single best content for topics with nothing above it.
!python 3rd_stage_model.py\
--filenames exp004 exp006 exp007 exp008\
--threshold 0.1\
--add_top 1

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
seed: 42
input_dir: /kaggle/working/
output_dir: /kaggle/working/
pair_dir: /kaggle/working/1st_stage_model/
competition_dir: /kaggle/input/learning-equality-curriculum-recommendations/
data_dir: /kaggle/input/3rd-stage-model/
filenames: ['exp004', 'exp006', 'exp007', 'exp008']
threshold: 0.1
add_top: 1
100%|█████████████████████████████████████████████| 3/3 [00:00<00:00, 57.97it/s]
[0.51103137 0.73713227 0.09132739 0.05130519 0.21499051 0.0311109
 0.04670557 0.02644748 0.06166608 0.01587486 0.08213433 0.02167364
 0.0261566  0.02579991 0.03905639 0.02532873 0.02702754 0.03703016
 0.04578561 0.04587744 0.030566   0.02216115 0.0153179  0.02826464
 0.01696076 0.02311777 0.02024259 0.02239036 0.02848879 0.02160788
 0.02094879 0.01812745 0.01574937 0.38935418 0.01325655 0.01673222
 0.01460603 0.01618934 0.03534195 0.02950377 0.01626258 0.01459344
 0.02418695 0.00954749 0.01237

In [13]:
# Preview the written submission: one row per topic, content ids separated by spaces.
import pandas as pd
pd.read_csv("submission.csv")

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_76231f9d0b5e c_376c5a8eb028 c...
1,t_00068291e9a4,c_ac1672cdcd2c c_89ce9367be10 c_ebb7fdf10a7e c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_5e375cf14c47 c_d7a0d7eaf799 c_1c57a1316568 c...
4,t_4054df11a74e,c_f2d184a98231 c_3695c5dc1df6
