# Explore the Tree

In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown
from pathlib import Path
from tqdm.notebook import tqdm

In [2]:
topics_df = pd.read_csv("topics.csv", index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv("content.csv", index_col=0).fillna("")
correlations_df = pd.read_csv("correlations.csv", index_col=0)

In [3]:
# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    display(Markdown(md))

class Topic:
    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        else:
            return Topic(parent_id)

    @property
    def ancestors(self):
        ancestors = []
        parent = self.parent
        while parent is not None:
            ancestors.append(parent)
            parent = parent.parent
        return ancestors

    @property
    def siblings(self):
        if not self.parent:
            return []
        else:
            return [topic for topic in self.parent.children if topic != self]

    @property
    def content(self):
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        else:
            return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" | ", include_self=False, include_root=True):
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __getattr__(self, name):
        return topics_df.loc[self.id][name]

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        return [Topic(topic_id) for topic_id in topics_df.loc[correlations_df[correlations_df.content_ids.str.contains(self.id)].index].index]

    def __getattr__(self, name):
        return content_df.loc[self.id][name]

    def __str__(self):
        return self.title
    
    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs

In [4]:
topic = Topic("t_5b97d30723c2")
print("Content title:\t'" + topic.content[0].title + "' [kind: " + topic.content[0].kind + "]")
print("Topic title:\t'" + topic.title + "'")
print("Breadcrumbs:\t" + topic.get_breadcrumbs())

Content title:	'Documento 3' [kind: document]
Topic title:	'Dominio disperso durante el año: Gente importante'
Breadcrumbs:	EngageNY (es) | Prekindergarten Artes del lenguaje en inglés | Escuchar y aprender Strand


In [16]:
breadcrumbs_df = pd.DataFrame()
breadcrumbs_df['topic_id'] = correlations_df.index

In [21]:
breadcrumbs = []
for index, row in tqdm(breadcrumbs_df.iterrows(), total=breadcrumbs_df.shape[0]):
    tmp = Topic(row['topic_id'])
    breadcrumbs.append(tmp.get_breadcrumbs())

  0%|          | 0/61517 [00:00<?, ?it/s]

In [22]:
breadcrumbs_df['tree_context'] = breadcrumbs
breadcrumbs_df

Unnamed: 0,topic_id,tree_context
0,t_00004da3a1b2,Khan Academy (български език) | Наука | Физика...
1,t_00068291e9a4,Khan Academy (Português (Brasil)) | Matemática...
2,t_00069b63a70a,MIT Blossoms | Engineering | Flow Charts: Logi...
3,t_0006d41a73a8,Khan Academy (български език) | Математика | А...
4,t_0008768bdee6,DIGITAL EDUCATION WITH MEHUL | ધોરણ ૩ | પ્રારં...
...,...,...
61512,t_fff830472691,K-12 | Math | Analysis | Vector Analysis
61513,t_fff9e5407d13,PF (Español) | Lengua española
61514,t_fffbe1d5d43c,Khan Academy (Kiswahili) | Hisabati | Class 9 ...
61515,t_fffe14f1be1e,CREE | Para el Estudiante | I Ciclo | 01 Prime...


# Step 1: Get our retriever

Use positive data to finetune a pre-trained sentence transformer model. Sentence transformer model will automatically use in-batch negative sampling.

1. model: paraphrase-multilingual-mpnet-base-v2

2. data features: only title

In [23]:
!pip -qqq install sentence-transformers
!pip -qqq install datasets
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer, models, InputExample, losses
from datasets import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold

In [24]:
topics = pd.read_csv("topics.csv")
content = pd.read_csv("content.csv")
correlations = pd.read_csv("correlations.csv")

In [25]:
def cv_split(train, n_folds, seed):
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for num, (train_index, val_index) in enumerate(kfold.split(train)):
        train.loc[val_index, 'fold'] = int(num)
    train['fold'] = train['fold'].astype(int)
    return train

In [26]:
kfolds = cv_split(correlations, 5, 1006)

In [None]:
# read from file
kfolds = pd.read_csv('kfold_correlations_exp21.csv')
kfolds.head()

In [None]:
correlations = kfolds[kfolds.fold!=0]

In [27]:
topics.rename(columns=lambda x: "topic_" + x, inplace=True)
content.rename(columns=lambda x: "content_" + x, inplace=True)

In [28]:
correlations["content_id"] = correlations["content_ids"].str.split(" ")
corr = correlations.explode("content_id").drop(columns=["content_ids"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  correlations["content_id"] = correlations["content_ids"].str.split(" ")


In [29]:
corr = corr.merge(topics, how="left", on="topic_id")
corr = corr.merge(content, how="left", on="content_id")

In [31]:
corr.head()

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,topic_has_content,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",video,,bg,,
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,video,,bg,,
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,video,,bg,,
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,,bg,,
4,t_00068291e9a4,1,c_639ea2ef9c95,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,exercise,,pt,,


加上tree结构：

- 对于topic：title + description + tree
- 对于content：title + description + text

In [34]:
corr = corr.fillna('')
corr = corr.merge(breadcrumbs_df, how="left", on="topic_id")

In [36]:
corr['topic_full_text'] = corr['topic_title'] + '[SEP]' + corr['topic_description'] + '[SEP]' + corr['tree_context']
corr['content_full_text'] = corr['content_title'] + '[SEP]' + corr['content_description'] + '[SEP]' + corr['content_text']
corr.head()

Unnamed: 0,topic_id,fold,content_id,topic_title,topic_description,topic_channel,topic_category,topic_level,topic_language,topic_parent,...,content_title,content_description,content_kind,content_text,content_language,content_copyright_holder,content_license,tree_context,topic_full_text,content_full_text
0,t_00004da3a1b2,1,c_1108dd0c7a5d,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,...,Молив като резистор,"Моливът причинява промяна в отклонението, подо...",video,,bg,,,Khan Academy (български език) | Наука | Физика...,Откриването на резисторите[SEP]Изследване на м...,Молив като резистор[SEP]Моливът причинява пром...
1,t_00004da3a1b2,1,c_376c5a8eb028,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,...,Да чуем променливото съпротивление,Тук чертаем линия на лист хартия и я използвам...,video,,bg,,,Khan Academy (български език) | Наука | Физика...,Откриването на резисторите[SEP]Изследване на м...,Да чуем променливото съпротивление[SEP]Тук чер...
2,t_00004da3a1b2,1,c_5bc0e1e2cba0,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,...,Променлив резистор (реостат) с графит от молив,Използваме сърцевината на молива (неговия граф...,video,,bg,,,Khan Academy (български език) | Наука | Физика...,Откриването на резисторите[SEP]Изследване на м...,Променлив резистор (реостат) с графит от молив...
3,t_00004da3a1b2,1,c_76231f9d0b5e,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,...,Последователно свързване на галваничен елемент...,"Защо отклонението се променя, когато се свърже...",video,,bg,,,Khan Academy (български език) | Наука | Физика...,Откриването на резисторите[SEP]Изследване на м...,Последователно свързване на галваничен елемент...
4,t_00068291e9a4,1,c_639ea2ef9c95,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,...,Dados e resultados de funções: gráficos,Encontre todas as entradas que correspondem a ...,exercise,,pt,,,Khan Academy (Português (Brasil)) | Matemática...,Entradas e saídas de uma função[SEP]Entenda um...,Dados e resultados de funções: gráficos[SEP]En...


In [37]:
corr["set"] = corr[["topic_full_text", "content_full_text"]].values.tolist()  # use only title here
train_df = pd.DataFrame(corr["set"])

In [38]:
dataset = Dataset.from_pandas(train_df)

In [39]:
train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for i in range(n_examples):
    example = train_data[i]
    if example[0] == None: #remove None
        continue        
    train_examples.append(InputExample(texts=[str(example[0]), str(example[1])]))

In [40]:
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

Downloading (…)9e268/.gitattributes: 100%|██████████| 690/690 [00:00<00:00, 48.9kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 24.7kB/s]
Downloading (…)f2cd19e268/README.md: 100%|██████████| 3.77k/3.77k [00:00<00:00, 494kB/s]
Downloading (…)cd19e268/config.json: 100%|██████████| 723/723 [00:00<00:00, 117kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 26.4kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 1.11G/1.11G [00:57<00:00, 19.4MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 9.44kB/s]
Downloading (…)ncepiece.bpe.model";: 100%|██████████| 5.07M/5.07M [00:00<00:00, 7.19MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 35.6kB/s]
Downloading (…)9e268/tokenizer.json: 100%|██████████| 9.08M/9.08M [01:40<00:00, 90.5kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 402/402 [00:00<00:00, 60.5kB/s]
Downloading (…)d19e268/modules.json: 1

In [41]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)
train_loss = losses.MultipleNegativesRankingLoss(model=model)
num_epochs = 10
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

In [None]:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          save_best_model = True,
          output_path='autodl-tmp/paraphrase-multilingual-mpnet-base-v2-exp21_fold0_epochs10',
          warmup_steps=warmup_steps)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]
Iteration:   0%|          | 0/3523 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/3523 [00:00<35:08,  1.67it/s][A
Iteration:   0%|          | 2/3523 [00:01<33:30,  1.75it/s][A
Iteration:   0%|          | 3/3523 [00:01<32:31,  1.80it/s][A
Iteration:   0%|          | 4/3523 [00:02<32:44,  1.79it/s][A
Iteration:   0%|          | 5/3523 [00:02<32:21,  1.81it/s][A
Iteration:   0%|          | 6/3523 [00:03<31:32,  1.86it/s][A
Iteration:   0%|          | 7/3523 [00:03<31:27,  1.86it/s][A
Iteration:   0%|          | 8/3523 [00:04<31:06,  1.88it/s][A
Iteration:   0%|          | 9/3523 [00:04<30:47,  1.90it/s][A
Iteration:   0%|          | 10/3523 [00:05<30:27,  1.92it/s][A
Iteration:   0%|          | 11/3523 [00:05<30:14,  1.94it/s][A
Iteration:   0%|          | 12/3523 [00:06<29:59,  1.95it/s][A
Iteration:   0%|          | 13/3523 [00:06<30:17,  1.93it/s][A
Iteration:   0%|          | 14/3523 [00:07<30:32,  1.92it/s][A
Iteration:  

Save kfold for the next step, since we use fold=0 for validation.

In [44]:
kfolds.to_csv('kfold_correlations_exp21.csv', index=0)