In [1]:
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import nvidia_smi
from tqdm.notebook import tqdm

In [2]:
from utils import get_topic_context, get_context_df

# Creating & Testing Function to Extract Topic Parent Nodes from a Topic Node

In [3]:
content = pd.read_csv("../../../input/content.csv")
correlations = pd.read_csv("../../../input/correlations.csv")
topics = pd.read_csv("../../../input/topics.csv")
sub = pd.read_csv("../../../input/sample_submission.csv")

In [9]:
topics

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [5]:
content

Unnamed: 0,id,title,description,kind,text,language,copyright_holder,license
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",video,,es,,
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,video,,it,,
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,video,,es,,
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,document,\nNado de aproximação\nSaber nadar nas ondas ...,pt,Sikana Education,CC BY-NC-ND
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,document,Estándares Comunes del Estado de Nueva York\n\...,es,Engage NY,CC BY-NC-SA
...,...,...,...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,,html5,What will eventually happen to these dyes?\n\n...,en,CSU and Merlot,CC BY-NC-SA
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,video,,it,,
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,video,,pt,,
154045,c_ffff04ba7ac7,SA of a Cone,,video,,en,,


In [4]:
content.shape, correlations.shape, topics.shape, sub.shape

((154047, 8), (61517, 2), (76972, 9), (5, 2))

In [8]:
train = pd.read_csv("../../../input/archive/train.csv")
train.head()

Unnamed: 0,topics_ids,content_ids,title1,title2,target
0,t_3d9ad9931021,c_efb73ad83f4b,,,0
1,t_3d9ad9931021,c_77574ef20c1f,,,0
2,t_3d9ad9931021,c_200ae87baf4d,,,0
3,t_3d9ad9931021,c_87e171afe50b,,,0
4,t_3d9ad9931021,c_3c070b63a944,,,0


In [6]:
train.rename(
    columns=({ 'title1': 'topic_title', 'title2': 'content_title'}), 
    inplace=True,
)

In [7]:
train.insert(3, "topic_description", np.full(len(train), np.nan))
train.insert(5, "content_description", np.full(len(train), np.nan))
train.insert(6, "content_text", np.full(len(train), np.nan))
train.insert(7, "topic_parent_title", np.full(len(train), np.nan))
train.insert(8, "topic_parent_description", np.full(len(train), np.nan))
train.insert(9, "topic_child_title", np.full(len(train), np.nan))
train.insert(10, "topic_child_description", np.full(len(train), np.nan))

In [8]:
train.head()

Unnamed: 0,topics_ids,content_ids,topic_title,topic_description,content_title,content_description,content_text,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,target
0,t_3d9ad9931021,c_efb73ad83f4b,,,,,,,,,,0
1,t_3d9ad9931021,c_77574ef20c1f,,,,,,,,,,0
2,t_3d9ad9931021,c_200ae87baf4d,,,,,,,,,,0
3,t_3d9ad9931021,c_87e171afe50b,,,,,,,,,,0
4,t_3d9ad9931021,c_3c070b63a944,,,,,,,,,,0


In [9]:
train.topic_description = train.merge(topics[["id", "description"]], 
                                      how="left", left_on="topics_ids", 
                                      right_on="id")["description"]
train.head()

Unnamed: 0,topics_ids,content_ids,topic_title,topic_description,content_title,content_description,content_text,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,target
0,t_3d9ad9931021,c_efb73ad83f4b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
1,t_3d9ad9931021,c_77574ef20c1f,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
2,t_3d9ad9931021,c_200ae87baf4d,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
3,t_3d9ad9931021,c_87e171afe50b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
4,t_3d9ad9931021,c_3c070b63a944,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0


In [10]:
train.content_description = train.merge(content[["id", "description"]], 
                                      how="left", left_on="content_ids", 
                                      right_on="id")["description"]
train.head()

Unnamed: 0,topics_ids,content_ids,topic_title,topic_description,content_title,content_description,content_text,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,target
0,t_3d9ad9931021,c_efb73ad83f4b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
1,t_3d9ad9931021,c_77574ef20c1f,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,Recurso Educativo Abierto para Ciencias Natura...,,,,,,0
2,t_3d9ad9931021,c_200ae87baf4d,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,Recurso Educativo Abierto para Geografía e His...,,,,,,0
3,t_3d9ad9931021,c_87e171afe50b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0
4,t_3d9ad9931021,c_3c070b63a944,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,,,,,,0


In [11]:
train.content_text = train.merge(content[["id", "text"]], 
                                      how="left", left_on="content_ids", 
                                      right_on="id")["text"]
train.head()

Unnamed: 0,topics_ids,content_ids,topic_title,topic_description,content_title,content_description,content_text,topic_parent_title,topic_parent_description,topic_child_title,topic_child_description,target
0,t_3d9ad9931021,c_efb73ad83f4b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,Orientaciones para el profesorado\nOrientacion...,,,,,0
1,t_3d9ad9931021,c_77574ef20c1f,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,Recurso Educativo Abierto para Ciencias Natura...,"¿Es magia? No, son reacciones químicas\n¿Es ma...",,,,,0
2,t_3d9ad9931021,c_200ae87baf4d,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,Recurso Educativo Abierto para Geografía e His...,La economía y yo\nLa economía y yo\nObra publi...,,,,,0
3,t_3d9ad9931021,c_87e171afe50b,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,La curiosidad es saludable\nLa curiosidad es s...,,,,,0
4,t_3d9ad9931021,c_3c070b63a944,,BC: BIOL 2 - Introduction to Human Biology (Gr...,,,"Palabra de caballero\nPalabra de caballero\n""P...",,,,,0


In [12]:
def get_topic_context(topic_id, topics, max_parent_nodes=-1, max_child_nodes=-1):
    parents, children = [], []
    
    # Traverse upwards.
    cnt = 0
    tmp = topics[topics["id"]==topic_id]
    while not tmp.parent.isna().values[0]:
        tmp = topics[topics["id"]==tmp.parent.values[0]]
        parents.append((tmp.title.values[0], tmp.description.values[0]))
        
        if max_parent_nodes > 0:
            cnt += 1
            if cnt == max_parent_nodes: break
        
    # Traverse downwards.
    cnt = 0
    tmp = topics[topics["parent"]==topic_id]
    
    stack = []

    # Populate initial stack.
    for i in range(len(tmp)-1, -1, -1):
        stack.append(tmp.iloc[i])
            
    # Traverse.
    while len(stack) > 0:
        row = stack.pop()
        children.append((row.title, row.description))

        if max_child_nodes > 0:
            cnt += 1
            if cnt == max_child_nodes: break
        
        tmp = topics[topics["parent"]==row["id"]]
        if not tmp.empty:
            for i in range(len(tmp)-1, -1, -1):
                stack.append(tmp.iloc[i])
            
    # Current topic node.
    tmp = topics[topics["id"]==topic_id]
    curr = [(tmp.title.values[0], tmp.description.values[0])]
            
    return parents, curr, children

In [13]:
parents, curr, children = get_topic_context("t_b7da63fc32b8", topics)
len(parents), len(curr), len(children)

(0, 1, 19)

In [14]:
def get_context_df(topics, max_parent_nodes=-1, max_child_nodes=-1):
    context_df = {
        "topics_ids": [],
        "topic_parent_title": [],
        "topic_parent_description": [],
        "topic_child_title": [],
        "topic_child_description": [] 
    }

    for topic_id in tqdm(topics.id, leave=True, position=0, total=len(topics.id)):

        parents, _, children = get_topic_context(topic_id, topics, max_parent_nodes, max_child_nodes)

        # Add parent to df.
        parent_title_str = ""
        parent_desc_str = ""
        for title, desc in parents:
            if title is not np.nan:
                parent_title_str += title + " [SEP] "
            if desc is not np.nan:
                parent_desc_str += desc + " [SEP] "

        parent_title_str = parent_title_str.strip()
        parent_desc_str = parent_desc_str.strip()

        parent_title_str = np.nan if parent_title_str == "" else parent_title_str
        parent_desc_str = np.nan if parent_desc_str == "" else parent_desc_str

        # Add children to df.
        child_title_str = ""
        child_desc_str = ""
        for title, desc in children:
            if title is not np.nan:
                child_title_str += title + " [SEP] "
            if desc is not np.nan:
                child_desc_str += desc + " [SEP] "

        child_title_str = child_title_str.strip()
        child_desc_str = child_desc_str.strip()

        child_title_str = np.nan if child_title_str == "" else child_title_str
        child_desc_str = np.nan if child_desc_str == "" else child_desc_str

        context_df["topics_ids"].append(topic_id)
        context_df["topic_parent_title"].append(parent_title_str)
        context_df["topic_parent_description"].append(parent_desc_str)
        context_df["topic_child_title"].append(child_title_str)
        context_df["topic_child_description"].append(child_desc_str)

    return pd.DataFrame(context_df)

In [16]:
context_df = get_context_df(topics, max_parent_nodes=10, max_child_nodes=5)

  0%|          | 0/76972 [00:00<?, ?it/s]

In [17]:
context_df.to_csv("../../input/topic_context.csv", index=False)

In [15]:
# SKIP THIS CELL

child_title_str = ""
child_desc_str = ""
for title, desc in children:
    if title is not np.nan:
        child_title_str += title + " [SEP] "
    if desc is not np.nan:
        child_desc_str += desc + " [SEP] "

child_title_str = child_title_str.strip()
child_desc_str = child_desc_str.strip()

# Go from "[SEP]" token delimiter to just a space.
child_title_str.replace("[SEP] ", "").strip()

'Physics Motion Heat and Thermodynamics Quantum Phenomena Work Energy and Power Electricity Magnets and Circuits Light and Radiation Sound and Waves By Level Elementary School High School University Middle School Earth Science Biology Chemistry Math Applications Concepts [SEP]'

In [23]:
train[["topic_parent_title", 
       "topic_parent_description", 
       "topic_child_title", 
       "topic_child_description"]] = train.merge(context_df, how="left", on="topics_ids")[
    ["topic_parent_title_y", 
     "topic_parent_description_y", 
     "topic_child_title_y", 
     "topic_child_description_y"]]

In [25]:
train.to_csv("../../input/context_train.csv", index=False)