In [1]:
import os
import pandas as pd
from multiprocessing import Pool
from xml.etree.ElementTree import iterparse


In [2]:
from utils import paths, read, save
from utils.consts import n_procs

# PostTypeId values used to tell questions (1) from answers (2) when
# filtering rows out of Posts.xml.
post_types = {"questions": 1, "answers": 2}

In [3]:
def unpack_gamedev_se():
    """Runs the bash script that unpacks the Game Dev. Stack Exchange 7z dump.

    The script ``unpack_gamedev_se.sh`` is executed from inside the raw data
    directory of the ``gamedev_se`` dataset. Per the original notes it creates
    a Posts.7z and a PostLinks.7z file.

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status (the failure used to be silently ignored).
    """
    import subprocess

    print("- Unpacking Game Dev. Stack Exchange data")
    data_dir = paths.raw_dir("gamedev_se")
    # An argument list plus ``cwd`` replaces the "cd ... ; bash ..." shell
    # string: paths containing spaces keep working and the script is never run
    # from the wrong directory when the ``cd`` would have failed.
    # stdout is discarded to keep the terminal clean.
    subprocess.run(
        ["bash", "unpack_gamedev_se.sh"],
        cwd=data_dir,
        stdout=subprocess.DEVNULL,
        check=True,
    )

    

In [4]:
def extract_posts(ds):
    """Decompresses the Posts.7z archive for a given dataset.

    The archive is extracted into the dataset's raw data directory,
    overwriting existing files without prompting (``-y``).

    Raises:
        subprocess.CalledProcessError: if 7z exits with a non-zero status.
    """
    import subprocess

    print("-- Decompressing Posts.7z")
    data_dir = paths.raw_dir(ds)
    file = data_dir / "Posts.7z"
    # Passing an argument list (no shell) keeps paths with spaces intact.
    # stdout is discarded to keep the terminal clean.
    subprocess.run(
        ["7z", "x", str(file), f"-o{data_dir}", "-y"],
        stdout=subprocess.DEVNULL,
        check=True,
    )

In [5]:
def extract_post_links(ds):
    """Decompresses the PostLinks.7z archive for a given dataset.

    The archive is extracted into the dataset's raw data directory,
    overwriting existing files without prompting (``-y``).

    Raises:
        subprocess.CalledProcessError: if 7z exits with a non-zero status.
    """
    import subprocess

    print("-- Decompressing PostLinks.7z")
    data_dir = paths.raw_dir(ds)
    file = data_dir / "PostLinks.7z"
    # Passing an argument list (no shell) keeps paths with spaces intact.
    # stdout is discarded to keep the terminal clean.
    subprocess.run(
        ["7z", "x", str(file), f"-o{data_dir}", "-y"],
        stdout=subprocess.DEVNULL,
        check=True,
    )

In [6]:
def select_post_type(ds, post_type):
    """Copies the rows of one post type from Posts.xml into their own file.

    ``post_type`` equal to ``post_types["questions"]`` writes the questions
    file; any other value is treated as answers. Posts.7z is decompressed
    first and the extracted Posts.xml is deleted afterwards.
    """
    extract_posts(ds)
    source_xml = paths.posts_xml(ds)

    wants_questions = post_type == post_types["questions"]
    if wants_questions:
        print("-- Selecting questions from Posts.xml")
        target_xml = paths.questions_xml(ds)
    else:
        print("-- Selecting answers from Posts.xml")
        target_xml = paths.answers_xml(ds)

    # grep (fixed-string match) is used instead of an XML parser for speed
    grep_cmd = f"grep -F 'PostTypeId=\"{post_type}\"' {source_xml} > {target_xml}"
    os.system(grep_cmd)

    # Posts.xml is removed to save disk space; the trade-off is that the 7z
    # archive has to be decompressed once per post type.
    source_xml.unlink()

In [7]:

def split_xml_file(ds, post_type):
    """Cuts the questions/answers XML file into chunks of 1,000,000 lines.

    Smaller chunks allow multiprocessing and bound memory use while parsing.
    Each chunk is wrapped in ``<posts>`` tags so it is well-formed XML.
    Returns the list of chunk paths.
    """
    raw_dir = paths.raw_dir(ds)

    is_questions = post_type == post_types["questions"]
    stem = "questions" if is_questions else "answers"
    print(f"-- Splitting {stem} into chunks")
    source_xml = paths.questions_xml(ds) if is_questions else paths.answers_xml(ds)

    # fewer lines per file means less memory needed during parsing
    os.system(f"split -l 1000000 {source_xml} {raw_dir}/{stem}_")
    # the unsplit file is no longer needed
    source_xml.unlink()

    chunks = [*paths.raw_dir(ds).glob(f"{stem}_*")]
    print(f"-- {len(chunks)} splits")

    for chunk in chunks:
        # opening tag is inserted before the first line ...
        os.system(f"sed -i '1s/^/<posts>\\n/' {chunk}")
        # ... and the closing tag is appended at the end
        os.system(f"echo '</posts>' >> {chunk}")

    return chunks

In [8]:
def parse_questions_xml(questions_path):
    """Parses an XML file containing question rows.

    Args:
        questions_path: path of an XML file whose ``<row>`` elements carry the
            question attributes (Id, Title, Body, Tags, AcceptedAnswerId,
            AnswerCount).

    Returns:
        A tuple ``(questions, accepted_answers)``:
        - ``questions``: DataFrame with columns id, n_answers, title, body,
          tags; one row per distinct id, ``n_answers`` as int and ``tags`` as a
          comma separated string.
        - ``accepted_answers``: DataFrame with the single column
          ``accepted_answer`` holding the non-missing AcceptedAnswerId values.

    Both frames keep their columns even when the file contains no rows.
    """

    def split_tags(s):
        # "<tag1><tag2>" -> "tag1,tag2"
        return s.replace("><", ",").replace("<", "").replace(">", "")

    columns = ["id", "title", "body", "tags", "accepted_answer", "n_answers"]
    records = []

    for _, node in iterparse(questions_path, events=("end",)):
        if node.tag == "row":  # ignore the wrapping <posts> element
            records.append(
                {
                    "id": node.attrib.get("Id"),
                    "title": node.attrib.get("Title"),
                    "body": node.attrib.get("Body"),
                    "tags": node.attrib.get("Tags"),
                    "accepted_answer": node.attrib.get("AcceptedAnswerId"),
                    "n_answers": node.attrib.get("AnswerCount"),
                }
            )
        # drop the element's attributes/children once they have been read
        node.clear()

    # explicit columns keep the schema stable when no row was found
    questions = pd.DataFrame(records, columns=columns)

    # accepted answer ids are collected so answers can be flagged later
    accepted_answers = questions[["accepted_answer"]].dropna()

    questions = questions[["id", "n_answers", "title", "body", "tags"]]
    # .copy() so the column assignments below act on an independent frame
    questions = questions.drop_duplicates("id").copy()

    questions["n_answers"] = questions["n_answers"].astype(int)
    questions["tags"] = questions["tags"].apply(split_tags)

    return questions, accepted_answers


def question_parser(i, ds, questions_xml):
    """Worker body for one subprocess parsing a chunk of question XML.

    i -> number of the worker (also used as the suffix of the saved file)

    Saves the parsed questions, deletes the chunk and returns the question
    ids together with the accepted answer ids found in the chunk.
    """
    print(f"--- Worker {i} started")
    parsed_questions, accepted_ids = parse_questions_xml(questions_xml)

    # ids are kept so duplicate pairs can be filtered against them later
    question_ids = parsed_questions[["id"]]

    target = paths.question_texts(ds, i)
    save(parsed_questions, target)

    # the raw XML chunk is deleted to free disk space
    questions_xml.unlink()

    return question_ids, accepted_ids


def extract_questions_xml(ds):
    """Extracts every question of a dataset from its XML archive.

    Besides the per-chunk question files written by the workers, this saves
    the list of all question ids and the list of accepted answer ids.
    """
    print("- Extracting questions")
    kind = post_types["questions"]

    select_post_type(ds, kind)
    chunks = split_xml_file(ds, kind)

    print("-- Parsing questions")

    # one job per chunk; lower n_procs if memory is tight
    jobs = [(worker, ds, chunk) for worker, chunk in enumerate(chunks)]
    with Pool(n_procs) as pool:
        results = pool.starmap(question_parser, jobs)

    # a single chunk (less than 1M questions) does not need a numeric suffix
    if len(chunks) == 1:
        paths.question_texts(ds, 0).rename(paths.question_texts(ds))

    id_frames = []
    accepted_frames = []
    for ids, accepted in results:
        id_frames.append(ids)
        accepted_frames.append(accepted)

    # all question ids
    all_ids = pd.concat(id_frames).reset_index(drop=True).drop_duplicates()
    save(all_ids, paths.all_question_ids(ds))

    # all accepted answer ids, stored under the column name "id"
    all_accepted = (
        pd.concat(accepted_frames)
        .reset_index(drop=True)
        .drop_duplicates()
        .rename(columns={"accepted_answer": "id"})
    )
    save(all_accepted[["id"]], paths.accepted_answer_ids(ds))


def parse_postlinks_xml(ds):
    """Parses PostLinks.xml of a dataset and selects the duplicate relations.

    Returns:
        DataFrame with columns ``dup_id`` (the PostId attribute) and
        ``main_id`` (the RelatedPostId attribute), one row per link whose
        LinkTypeId is "3". The columns are present even when no such link
        exists, so callers can always access ``dup_id`` / ``main_id``.
    """
    pairs = []
    links_xml = paths.post_links_xml(ds)

    for _, node in iterparse(links_xml, events=("end",)):
        # LinkTypeId for duplicate relation is 3
        if node.attrib.get("LinkTypeId") == "3":
            pairs.append(
                {
                    "dup_id": node.attrib.get("PostId"),
                    "main_id": node.attrib.get("RelatedPostId"),
                }
            )
        # drop the element's attributes/children once they have been read
        node.clear()

    # explicit columns keep the schema stable when no duplicate link was found
    return pd.DataFrame(pairs, columns=["dup_id", "main_id"])


def extract_dup_pairs_xml(ds):
    """Extracts the duplicate question pairs of a dataset from its XML archive.

    Only pairs whose two ids both appear in the saved list of question ids are
    kept. The extracted PostLinks.xml is deleted at the end.
    """
    print("- Extracting duplicate pairs")
    extract_post_links(ds)

    print("-- Parsing duplicate pairs")
    all_pairs = parse_postlinks_xml(ds)

    known_ids = read(paths.all_question_ids(ds)).id

    # both sides of a pair must be known questions
    dup_known = all_pairs.dup_id.isin(known_ids)
    main_known = all_pairs.main_id.isin(known_ids)
    kept_pairs = all_pairs[dup_known & main_known].drop_duplicates()

    save(kept_pairs, paths.dup_pairs(ds))

    # PostLinks.xml is deleted to free disk space
    paths.post_links_xml(ds).unlink()


def parse_answers_xml(ds, answers_path):
    """Parses an XML file containing answer rows.

    Args:
        ds: dataset name, used to load the saved question ids and accepted
            answer ids.
        answers_path: path of an XML file whose ``<row>`` elements carry the
            answer attributes (Id, ParentId, Score, Body, CreationDate).

    Returns:
        DataFrame with columns id, question_id, score (int), body, post_date
        and accepted (bool), restricted to answers whose question is in the
        dataset.
    """
    qids = read(paths.all_question_ids(ds)).id
    acc_ids = read(paths.accepted_answer_ids(ds)).id

    columns = ["id", "question_id", "score", "body", "post_date"]
    records = []

    for _, node in iterparse(answers_path, events=("end",)):
        if node.tag == "row":  # ignore the wrapping <posts> element
            records.append(
                {
                    "id": node.attrib.get("Id"),
                    "question_id": node.attrib.get("ParentId"),
                    "score": node.attrib.get("Score"),
                    "body": node.attrib.get("Body"),
                    "post_date": node.attrib.get("CreationDate"),
                }
            )
        # drop the element's attributes/children once they have been read
        node.clear()

    # explicit columns keep the schema stable when no row was found
    answers = pd.DataFrame(records, columns=columns)

    # only keep answers that have a question in the dataset; .copy() so the
    # column assignments below act on an independent frame
    answers = answers[answers.question_id.isin(qids)].copy()

    answers["score"] = answers["score"].astype(int)

    # flag accepted answers directly as a boolean column
    answers["accepted"] = answers["id"].isin(acc_ids)

    return answers


def answer_parser(i, ds, answers_xml):
    """Worker body for one subprocess parsing a chunk of answer XML.

    i -> number of the worker (also used as the suffix of the saved file)
    """
    print(f"--- Worker {i} started")
    parsed_answers = parse_answers_xml(ds, answers_xml)

    target = paths.answer_texts(ds, i)
    save(parsed_answers, target)

    # the raw XML chunk is deleted to free disk space
    answers_xml.unlink()


def extract_answers_xml(ds):
    """Extracts every answer of a dataset from its XML archive.

    The file of accepted answer ids is deleted once all chunks are parsed.
    """
    print("- Extracting answers")

    kind = post_types["answers"]

    select_post_type(ds, kind)
    chunks = split_xml_file(ds, kind)

    print("-- Parsing answers")

    # one job per chunk; lower n_procs if memory is tight
    jobs = [(worker, ds, chunk) for worker, chunk in enumerate(chunks)]
    with Pool(n_procs) as pool:
        pool.starmap(answer_parser, jobs)

    # a single chunk (less than 1M answers) does not need a numeric suffix
    if len(chunks) == 1:
        unsuffixed = paths.answer_texts(ds)
        paths.answer_texts(ds, 0).rename(unsuffixed)

    # accepted answer ids are not needed after this point
    paths.accepted_answer_ids(ds).unlink()


def extract_xml(ds):
    """Runs the three extraction stages for one dataset, in order:
    questions, duplicate pairs, then answers.
    """
    stages = (extract_questions_xml, extract_dup_pairs_xml, extract_answers_xml)
    for stage in stages:
        stage(ds)


def extract_xml_datasets(datasets):
    """Unpacks the Game Dev. Stack Exchange dump, then extracts the XML data
    of every dataset in ``datasets``.
    """
    unpack_gamedev_se()
    for dataset in datasets:
        print(f"Extracting data from {dataset}")
        extract_xml(dataset)


if __name__ == "__main__":
    # Script entry point: extract both raw datasets.
    print("Extracting XML data")
    extract_xml_datasets(datasets=["gamedev_se", "stackoverflow"])


Extracting XML data
- Unpacking Game Dev. Stack Exchange data
Extracting data from gamedev_se
- Extracting questions
-- Decompressing Posts.7z
-- Selecting questions from Posts.xml
-- Splitting questions into chunks
-- 1 splits
-- Parsing questions
--- Worker 0 started
- Extracting duplicate pairs
-- Decompressing PostLinks.7z
-- Parsing duplicate pairs
- Extracting answers
-- Decompressing Posts.7z
-- Selecting answers from Posts.xml
-- Splitting answers into chunks
-- 1 splits
-- Parsing answers
--- Worker 0 started
Extracting data from stackoverflow
- Extracting questions
-- Decompressing Posts.7z
-- Selecting questions from Posts.xml
-- Splitting questions into chunks
-- 1 splits
-- Parsing questions
--- Worker 0 started
- Extracting duplicate pairs
-- Decompressing PostLinks.7z
-- Parsing duplicate pairs
- Extracting answers
-- Decompressing Posts.7z
-- Selecting answers from Posts.xml
-- Splitting answers into chunks
-- 1 splits
-- Parsing answers
--- Worker 0 started


In [9]:
import pandas as pd

from utils import paths, save, read
from utils.consts import gamedev_tags, so_sample_seeds


def sample_stackoverflow(sample_num, seed):
    """Creates a sample from the StackOverflow dataset of similar size to the game dev. datasets

    Args:
        sample_num: number of the sample; results are saved under the dataset
            name ``so_samples/sample_{sample_num}``.
        seed: random state passed to every ``DataFrame.sample`` call.

    The three steps run in order: duplicate pairs, questions, answers. Each
    step reads what the previous one saved.
    """
    sample_ds = f"so_samples/sample_{sample_num}"

    def select_dup_pairs():
        """Selects a number of dup pairs equal to the mean of dup pairs in game dev datasets"""
        print("-- Selecting dup pairs")
        len_pairs_se = len(read(paths.dup_pairs("gamedev_se")))
        len_pairs_so = len(read(paths.dup_pairs("gamedev_so")))
        len_pairs = (len_pairs_se + len_pairs_so) // 2

        pairs = read(paths.dup_pairs("stackoverflow"))
        pairs = pairs.drop_duplicates()
        # DataFrame.sample raises when asked for more rows than exist
        n_pairs = min(len_pairs, len(pairs))
        pairs = pairs.sample(n=n_pairs, random_state=seed).reset_index(drop=True)
        save(pairs, paths.dup_pairs(sample_ds))

    def select_questions():
        """Selects a number of questions equal to the mean questions in game dev datasets"""
        print("-- Selecting questions")
        pairs = read(paths.dup_pairs(sample_ds))

        len_se = len(read(paths.question_texts("gamedev_se")))
        len_so = len(read(paths.question_texts("gamedev_so")))
        sample_size = (len_se + len_so) // 2

        dfs = []

        n_splits = len(list(paths.corpus_dir("stackoverflow").glob("question_texts*")))

        for i in range(n_splits):
            print(i, end="\r")
            # samples to select in this split (the remainder of the division
            # is spread over the first splits)
            n_samples = (sample_size // n_splits) + int(i < sample_size % n_splits)

            # if there is only one split we don't have suffixes
            split_idx = None if n_splits == 1 else i

            df = read(paths.question_texts("stackoverflow", split_idx))

            # questions in the pre-selected dup pairs are always kept
            is_in_pairs = df.id.isin(pairs.main_id) | df.id.isin(pairs.dup_id)

            df_dups = df[is_in_pairs]
            # questions not in pairs
            df = df[~is_in_pairs]

            # random questions to sample from this split; clamped because a
            # split may hold more pair questions than its quota, or fewer
            # remaining questions than requested (sample() raises on both)
            to_sample = min(max(n_samples - len(df_dups), 0), len(df))
            sample = df.sample(n=to_sample, random_state=seed)

            df = pd.concat([sample, df_dups]).reset_index(drop=True)
            dfs.append(df)
            print(" " * 50, end="\r")

        df = pd.concat(dfs)
        df = df.drop_duplicates("id")
        df = df.reset_index(drop=True)

        save(df, paths.question_texts(sample_ds))

    def select_answers():
        """Selects only the answers that have questions in the sampled dataset"""
        print("-- Selecting answers")
        qids = read(paths.question_texts(sample_ds)).id

        df = []
        n_splits = len(list(paths.corpus_dir("stackoverflow").glob("answer_texts*")))

        for i in range(n_splits):
            print(i, end="\r")

            # if there is only one split we don't have suffixes
            split_idx = None if n_splits == 1 else i

            split = read(paths.answer_texts("stackoverflow", split_idx))

            df_split = split[split.question_id.isin(qids)]
            df.append(df_split)
            print(" " * 50, end="\r")

        df = pd.concat(df)
        df = df.reset_index(drop=True)
        save(df, paths.answer_texts(sample_ds))

    select_dup_pairs()
    select_questions()
    select_answers()


def select_gamedev(tags):
    """Builds the ``gamedev_so`` dataset: Stack Overflow posts related to game
    dev, chosen by the given tags.

    Questions are selected first; answers and duplicate pairs are then
    filtered against the saved questions.
    """

    def select_questions():
        """Collects, from every split, the questions carrying one of the tags"""
        print("-- Selecting questions")
        matches = []
        n_splits = len(list(paths.corpus_dir("stackoverflow").glob("question_texts*")))

        for i in range(n_splits):
            # a single split is stored without a numeric suffix
            split_idx = None if n_splits == 1 else i

            split = read(paths.question_texts("stackoverflow", split_idx))

            for t in tags:
                print(split_idx, t, end="\r")
                # keep the questions whose comma separated tag list contains 't'
                has_tag = split.tags.apply(
                    lambda ts: t.lower() in ts.lower().split(",")
                )
                matches.append(split[has_tag])
                print(" " * 50, end="\r")

        selected = pd.concat(matches)
        selected = selected.drop_duplicates("id").reset_index(drop=True)
        save(selected, paths.question_texts("gamedev_so"))

    def select_answers():
        """Keeps the answers whose question is in the game dev dataset"""
        print("-- Selecting answers")
        qids = read(paths.question_texts("gamedev_so")).id

        kept = []
        n_splits = len(list(paths.corpus_dir("stackoverflow").glob("answer_texts*")))

        for i in range(n_splits):
            print(i, end="\r")

            # a single split is stored without a numeric suffix
            split_idx = None if n_splits == 1 else i

            split = read(paths.answer_texts("stackoverflow", split_idx))
            kept.append(split[split.question_id.isin(qids)])

        answers = pd.concat(kept).reset_index(drop=True)
        save(answers, paths.answer_texts("gamedev_so"))

    def select_dup_pairs():
        """Keeps the dup pairs whose two questions are both in the game dev dataset"""
        print("-- Selecting dup pairs")
        qids = read(paths.question_texts("gamedev_so")).id

        pairs = read(paths.dup_pairs("stackoverflow"))
        both_selected = pairs.main_id.isin(qids) & pairs.dup_id.isin(qids)
        pairs = pairs[both_selected].drop_duplicates().reset_index(drop=True)

        save(pairs, paths.dup_pairs("gamedev_so"))

    select_questions()
    select_answers()
    select_dup_pairs()


def select_so_samples(seeds):
    """Creates one Stack Overflow sample per seed; samples are numbered by
    the position of their seed.
    """
    for sample_num, sample_seed in enumerate(seeds):
        print(f"- Selecting sample {sample_num}")
        sample_stackoverflow(sample_num, sample_seed)


def main(tags_gamedev, seeds):
    """Selects the game dev Stack Overflow posts, then draws the samples."""
    print("Selecting Game Dev. SO questions")
    select_gamedev(tags=tags_gamedev)

    print("Sampling Stack Overflow")
    select_so_samples(seeds=seeds)


if __name__ == "__main__":
    # Script entry point: use the tags and seeds configured in utils.consts.
    main(tags_gamedev=gamedev_tags, seeds=so_sample_seeds)


Selecting Game Dev. SO questions
-- Selecting questions
-- Selecting answers                              
-- Selecting dup pairs
Sampling Stack Overflow
- Selecting sample 0
-- Selecting dup pairs
-- Selecting questions
-- Selecting answers                              
- Selecting sample 1                              
-- Selecting dup pairs
-- Selecting questions
-- Selecting answers                              
- Selecting sample 2                              
-- Selecting dup pairs
-- Selecting questions
-- Selecting answers                              
- Selecting sample 3                              
-- Selecting dup pairs
-- Selecting questions
-- Selecting answers                              
- Selecting sample 4                              
-- Selecting dup pairs
-- Selecting questions
-- Selecting answers                              
                                                  

In [10]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from multiprocessing import Pool
from gensim.parsing.preprocessing import preprocess_string

from utils import paths, read, save
from utils.consts import datasets, n_procs


def process_html(t):
    """Lower-cases HTML text, replaces code, links and images with
    placeholder words and strips the remaining tags.
    """
    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (r"\n", " "),
        (r"<code>.*?</code>", " codesnippet "),  # code blocks
        (r"<a.*?https?:\/\/.*?[\b\s]?>", " url "),  # anchors pointing to urls
        (r"https?:\/\/.*?(?:[\b\s]|$)", " url "),  # bare urls
        (r"<img.*?>", " img "),  # images
    )

    text = t.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # whatever markup is left is removed, keeping only the text
    return BeautifulSoup(text, features="lxml").get_text()


def preprocess_texts(df):
    """Cleans body, title and answer with process_html and adds the combined
    text columns (title_body, title_body_tags, title_body_tags_answer).
    """
    for column in ("body", "title", "answer"):
        df[column] = df[column].apply(process_html)

    # combined columns are space separated strings
    df["title_body"] = df["title"] + " " + df["body"]
    df["title_body_tags"] = df["title_body"] + " " + df["tags"]
    df["title_body_tags_answer"] = df["title_body_tags"] + " " + df["answer"]
    return df


def tokenize_texts(df):
    """Tokenizes body, title, tags and answer with preprocess_string and
    rebuilds the combined columns by concatenating the token lists.
    """
    for column in ("body", "title", "tags", "answer"):
        df[column] = df[column].apply(preprocess_string)

    # "+" on these columns joins the per-row token lists
    df["title_body"] = df["title"] + df["body"]
    df["title_body_tags"] = df["title_body"] + df["tags"]
    df["title_body_tags_answer"] = df["title_body_tags"] + df["answer"]
    return df


def map_pool(df, f):
    """Splits the dataframe into ``n_procs`` chunks of consecutive rows and
    maps the function over them using multiprocessing.

    Args:
        df: DataFrame to process.
        f: function taking a DataFrame chunk and returning a DataFrame.

    Returns:
        The concatenation of the processed chunks, in the original row order.
    """
    # Row positions are split instead of the frame itself: np.array_split on
    # a DataFrame goes through DataFrame.swapaxes, which pandas deprecated.
    # The chunk sizes are the same as np.array_split(df, n_procs) produced.
    positions = np.array_split(np.arange(len(df)), n_procs)
    dfs = [df.iloc[idx] for idx in positions]

    with Pool(n_procs) as p:
        dfs = p.map(f, dfs)

    return pd.concat(dfs)


def get_questions(ds):
    """Loads the questions of a dataset, de-duplicates them by id and adds a
    ``corpus_index`` column with each question's position.
    """
    questions = read(paths.question_texts(ds))

    has_missing = questions.isna().any().any()
    assert not has_missing, "NaN columns in data!"

    questions = questions.dropna()
    questions = questions.drop_duplicates("id")

    # the fresh 0..n-1 index becomes a regular column named corpus_index
    questions = questions.reset_index(drop=True)
    questions = questions.reset_index().rename(columns={"index": "corpus_index"})

    return questions


def select_best_answers(answers):
    """Selects the best answer for each question based on the heuristic:
    accepted answer > highest score > posted first

    Args:
        answers: DataFrame with at least the columns question_id, score,
            post_date and accepted. The helper columns ``max_score`` and
            ``posted_first`` are added to it in place.

    Returns:
        DataFrame with one row per question_id (sorted by question_id,
        ``question_id`` as first column) holding that question's best answer.
        Every returned row is a real answer row: values are never combined
        from different answers.
    """
    by_question = answers.groupby("question_id")
    # transform() returns values aligned with the original rows, regardless
    # of how groupby.apply labels its result in the installed pandas version
    answers["max_score"] = answers.score == by_question.score.transform("max")
    answers["posted_first"] = answers.post_date == by_question.post_date.transform(
        "min"
    )

    # subset of answers that *can* be the best one
    best_answers = answers[answers.accepted | answers.max_score | answers.posted_first]

    # accepted first, then highest score, then earliest post date; sorting on
    # the actual score/date (not only the flags) breaks ties between several
    # top-scored answers by posting date, as the heuristic states
    best_answers = best_answers.sort_values(
        ["accepted", "score", "post_date"], ascending=[False, False, True]
    )

    # keep the first sorted row of each question as a whole row;
    # groupby().first() would instead pick the first non-null value of each
    # column and could mix fields of different answers
    best_answers = best_answers.drop_duplicates("question_id", keep="first")
    best_answers = best_answers.sort_values("question_id").reset_index(drop=True)

    # question_id goes first, matching the layout of a grouped result
    ordered = ["question_id"] + [c for c in best_answers.columns if c != "question_id"]
    best_answers = best_answers[ordered]

    assert best_answers.question_id.is_unique, "Some questions have more than one answer!"
    assert answers.question_id.isin(best_answers.question_id).all(), "Missing answers!"
    assert (
        answers[answers.accepted].question_id.isin(best_answers.question_id).all()
    ), "Missing accepted answers!"

    return best_answers


def get_answers(ds):
    """Loads the answers of a dataset and reduces them to the best answer per
    question, returned as a frame with the columns ``id`` (the question id)
    and ``answer`` (the answer body).
    """
    all_answers = read(paths.answer_texts(ds)).drop_duplicates()
    best = select_best_answers(all_answers)

    best = best[["question_id", "body"]]
    best = best.rename(columns={"question_id": "id", "body": "answer"})
    return best.reset_index(drop=True)


def questions_with_answers(ds):
    """Loads the questions of a dataset and attaches each one's best answer.

    Questions without an answer get an empty string. The result is indexed
    by question id.
    """
    questions = get_questions(ds)
    answers = get_answers(ds)

    # left join: every question is kept, answered or not
    merged = questions.merge(answers, on="id", how="left")
    merged["answer"] = merged["answer"].fillna("")

    rows_per_id = merged.groupby("id").apply(len)
    assert (rows_per_id == 1).all(), "Duplicate questions!"

    return merged.set_index("id")


def preprocess_questions(ds):
    """Builds and saves two corpora for a dataset: the cleaned texts, and the
    tokenized version of those cleaned texts.
    """
    print(f"- Preprocessing texts for {ds}")
    merged = questions_with_answers(ds)

    cleaned = map_pool(merged, preprocess_texts)
    save(cleaned, paths.corpus(ds, tokenized=False))

    # tokenization runs on the already cleaned texts
    tokenized = map_pool(cleaned, tokenize_texts)
    save(tokenized, paths.corpus(ds, tokenized=True))


def main(datasets):
    """Preprocesses the texts of every dataset in ``datasets``."""
    print("Preprocessing texts.")
    for dataset in datasets:
        preprocess_questions(dataset)
    print("Finished preprocessing texts.")


if __name__ == "__main__":
    # Script entry point: preprocess every dataset listed in utils.consts.
    main(datasets=datasets)


Preprocessing texts.
- Preprocessing texts for gamedev_se
- Preprocessing texts for gamedev_so
- Preprocessing texts for so_samples/sample_0
- Preprocessing texts for so_samples/sample_1
- Preprocessing texts for so_samples/sample_2
- Preprocessing texts for so_samples/sample_3
- Preprocessing texts for so_samples/sample_4
Finished preprocessing texts.


In [11]:
import pandas as pd

from utils import paths, read, save
from utils.consts import datasets, split_percentage, noise_percentage


def extract_dup_pair_ids(ds):
    """Saves the ids (with corpus index) of the questions that appear as
    duplicates and of those that appear as main questions in the dup pairs.
    """
    pairs = read(paths.dup_pairs(ds))
    corpus = read(paths.corpus(ds)).reset_index()

    print(f"- {len(pairs)} dup pairs")

    def ids_of(pair_ids):
        # corpus rows whose id is in pair_ids, reduced to the id columns
        selected = corpus.loc[corpus.id.isin(pair_ids), ["id", "corpus_index"]]
        return selected.reset_index(drop=True)

    dups = ids_of(pairs.dup_id)
    print(f"-- {len(dups)} dups")
    save(dups, paths.duplicate_question_ids(ds))

    main_qs = ids_of(pairs.main_id)
    print(f"-- {len(main_qs)} main questions")
    save(main_qs, paths.main_question_ids(ds))


def extract_question_ids(ds):
    """Saves three id lists for a dataset: answered questions, all questions,
    and the questions used for comparison (answered + main questions).
    """
    id_cols = ["id", "corpus_index"]
    corpus = read(paths.corpus(ds)).reset_index()

    print(f"- {len(corpus)} questions")

    has_answers = corpus.n_answers > 0
    answered_ids = corpus.loc[has_answers, id_cols].reset_index(drop=True)
    save(answered_ids, paths.answered_question_ids(ds))

    print(f"- {len(answered_ids)} answered questions")

    save(corpus[id_cols], paths.all_question_ids(ds))

    main_ids = read(paths.main_question_ids(ds))

    # comparison set = answered questions + main questions, without repeats
    comp_ids = (
        pd.concat([answered_ids, main_ids])
        .reset_index(drop=True)
        .drop_duplicates("id")
    )
    save(comp_ids, paths.comparison_question_ids(ds))


def sample_noise_questions(ds, perc, seed=42):
    """Samples a percentage of questions to serve as noise in supervised learning

    Args:
        ds: dataset name.
        perc: fraction of the number of duplicates to sample as noise.
        seed: random state of the sampling (defaults to the previously
            hard-coded 42).

    Questions that are duplicates or main questions of a pair are never
    sampled. The result is saved as the dataset's noise question ids.
    """
    df = read(paths.corpus(ds))
    dups = read(paths.duplicate_question_ids(ds))
    mains = read(paths.main_question_ids(ds))

    df = df.reset_index()[["id", "corpus_index"]]
    # exclude true duplicates and their pairs
    df = df[~df.id.isin(dups.id) & ~df.id.isin(mains.id)]

    # sample a percentage of the number of duplicates, but never more rows
    # than are available (DataFrame.sample would raise)
    noise_dups = min(round(len(dups) * perc), len(df))
    noise = df.sample(noise_dups, random_state=seed).reset_index(drop=True)

    save(noise, paths.noise_question_ids(ds))


def split_train_test_dups(ds, perc, seed=42):
    """Randomly splits the duplicates that will be used for train and test sets

    Args:
        ds: dataset name.
        perc: fraction of the duplicates that goes into the test set.
        seed: random state of the split (defaults to the previously
            hard-coded 42).
    """
    dups = read(paths.duplicate_question_ids(ds))

    test_dups = dups.sample(frac=perc, random_state=seed)
    test_dups = test_dups.reset_index(drop=True)

    save(test_dups, paths.test_dup_ids(ds))

    # remaining dups used for training
    train_dups = dups[~dups.id.isin(test_dups.id)]
    train_dups = train_dups.reset_index(drop=True)

    save(train_dups, paths.train_dup_ids(ds))

    print(f"- {len(train_dups)} dups in the train set")
    print(f"- {len(test_dups)} dups in the test set")


def extract_all_ids(ds, noise_p, split_p):
    """Runs every id extraction step for one dataset.

    The order matters: later steps read the id files saved by earlier ones.
    """
    print(f"Extracting IDs for {ds}.")
    extract_dup_pair_ids(ds)
    extract_question_ids(ds)
    sample_noise_questions(ds, perc=noise_p)
    split_train_test_dups(ds, perc=split_p)


def main(datasets, noise_p, split_p):
    """Extracts all id lists for every dataset in ``datasets``."""
    print("Extracting IDs.")
    for dataset in datasets:
        extract_all_ids(dataset, noise_p, split_p)


if __name__ == "__main__":
    # Script entry point: use the datasets and percentages from utils.consts.
    main(datasets=datasets, noise_p=noise_percentage, split_p=split_percentage)


Extracting IDs.
Extracting IDs for gamedev_se.
- 1222 dup pairs
-- 1170 dups
-- 856 main questions
- 54684 questions
- 47101 answered questions
- 936 dups in the train set
- 234 dups in the test set
Extracting IDs for gamedev_so.
- 7 dup pairs
-- 7 dups
-- 6 main questions
- 906 questions
- 744 answered questions
- 6 dups in the train set
- 1 dups in the test set
Extracting IDs for so_samples/sample_0.
- 614 dup pairs
-- 602 dups
-- 488 main questions
- 27795 questions
- 23851 answered questions
- 482 dups in the train set
- 120 dups in the test set
Extracting IDs for so_samples/sample_1.
- 614 dup pairs
-- 600 dups
-- 481 main questions
- 27795 questions
- 23871 answered questions
- 480 dups in the train set
- 120 dups in the test set
Extracting IDs for so_samples/sample_2.
- 614 dup pairs
-- 602 dups
-- 468 main questions
- 27795 questions
- 23884 answered questions
- 482 dups in the train set
- 120 dups in the test set
Extracting IDs for so_samples/sample_3.
- 614 dup pairs
-- 599 d

In [12]:
import joblib
from scipy.sparse import save_npz, csr_matrix
from gensim.matutils import sparse2full
from gensim.corpora.dictionary import Dictionary
from sklearn.feature_extraction.text import TfidfVectorizer

from utils import paths, read, save, make_dir
from utils.consts import datasets, text_columns, features
from utils.models import (
    doc2vec_model,
    bm25_model,
    lda_model,
    bertoverflow_model,
    mpnet_model,
)


def train_bm25(ds, cols):
    """Fits one BM25 model per text column of the dataset's corpus and saves
    each of them.
    """
    print("- Training BM25 models")
    feature_name = "bm25"

    corpus = read(paths.corpus(ds))
    make_dir(paths.feature_model_dir(ds, feature_name))

    for column in cols:
        target = paths.feature_model(ds, feature_name, column)
        bm25_model(corpus[column]).save(target)


def train_doc2vec(ds, cols):
    """Trains and saves a Doc2Vec model for each text column in the dataset
    Also saves document embeddings learned by the models.

    Args:
        ds: dataset name.
        cols: text columns of the corpus to train on; each column is expected
            to hold lists of tokens (they are joined with spaces).
    """
    print("- Training Doc2Vec models")
    feature_name = "doc2vec"

    corpus = read(paths.corpus(ds))

    make_dir(paths.feature_model_dir(ds, feature_name))
    make_dir(paths.embedding_dir(ds, feature_name))

    def train_doc2vec_from_file(c):
        """Trains a Doc2Vec model from a file containing the corpus (increased performance)"""
        training_corpus = paths.feature_model_dir(ds, feature_name) / (c + ".txt")

        try:
            # one document per line, tokens separated by spaces; the encoding
            # is fixed so the file does not depend on the platform default
            # NOTE(review): assumes doc2vec_model reads the file as UTF-8 - confirm
            with open(training_corpus, "w", encoding="utf-8") as f:
                for tokens in corpus[c]:
                    f.write(" ".join(tokens) + "\n")

            return doc2vec_model(str(training_corpus))
        finally:
            # the temporary corpus file is removed even when training fails
            if training_corpus.exists():
                training_corpus.unlink()

    for c in cols:
        model = train_doc2vec_from_file(c)
        emb = model.dv.vectors

        model_save_path = paths.feature_model(ds, feature_name, c)
        model.save(str(model_save_path))

        emb_save_path = paths.embedding(ds, feature_name, c)
        save_npz(emb_save_path, csr_matrix(emb))


def train_tfidf(ds, cols):
    """Fits one TF-IDF vectorizer per text column of the dataset's corpus.

    For every column both the fitted vectorizer and the TF-IDF matrix of the
    corpus are saved.
    """
    print("- Training TF-IDF models")
    feature_name = "tfidf"

    corpus = read(paths.corpus(ds))

    make_dir(paths.feature_model_dir(ds, feature_name))
    make_dir(paths.embedding_dir(ds, feature_name))

    for column in cols:
        # the vectorizer expects strings, so token lists are joined by spaces
        corpus[column] = corpus[column].apply(" ".join)
        documents = corpus[column]

        vectorizer = TfidfVectorizer().fit(documents)
        joblib.dump(vectorizer, paths.feature_model(ds, feature_name, column))

        tfidf_matrix = vectorizer.transform(documents)
        save_npz(paths.embedding(ds, feature_name, column), tfidf_matrix)


def train_lda(ds, cols):
    """Fits one LDA model per text column of the dataset's corpus.

    For every column both the model and the dense topic vectors of the corpus
    documents are saved.
    """
    print("- Training LDA models")

    feature_name = "topic"

    corpus = read(paths.corpus(ds))

    make_dir(paths.feature_model_dir(ds, feature_name))
    make_dir(paths.embedding_dir(ds, feature_name))

    for column in cols:
        # the model is trained from bag-of-words documents plus a vocabulary;
        # the corpus column is replaced by its bag-of-words form
        vocab = Dictionary(corpus[column])
        corpus[column] = corpus[column].apply(vocab.doc2bow)
        lda = lda_model(corpus[column], vocab)

        # NOTE(review): 100 is the length of the dense vector; presumably it
        # matches the number of topics configured in lda_model - confirm
        topic_vectors = corpus[column].apply(lambda bow: sparse2full(lda[bow], 100))

        lda.save(str(paths.feature_model(ds, feature_name, column)))

        target = paths.embedding(ds, feature_name, column)
        save_npz(target, csr_matrix(list(topic_vectors)))


def get_bertoverflow_embeddings(ds, cols, use_gpu=True):
    """Encodes every text column with the BERTOverflow model and saves the result.

    The untokenized corpus is used. With ``use_gpu=False`` the encoder is
    told to run on "cpu"; otherwise ``device=None`` is passed through.
    Embeddings are stored as a sparse matrix, one file per column.
    """
    print("- Computing BERTOverflow embeddings")

    feature_name = "bertoverflow"
    corpus = read(paths.corpus(ds, tokenized=False))
    model = bertoverflow_model()

    make_dir(paths.embedding_dir(ds, feature_name))

    device = None if use_gpu else "cpu"

    for col in cols:
        print(f"-- Computing {col} embeddings with BERTOverflow for {ds}.")
        vectors = model.encode(corpus[col], device=device, show_progress_bar=True)
        save_npz(paths.embedding(ds, feature_name, col), csr_matrix(vectors))


def get_mpnet_embeddings(ds, cols, use_gpu=True):
    """Encodes every text column with the MPNet model and saves the result.

    The untokenized corpus is used. With ``use_gpu=False`` the encoder is
    told to run on "cpu"; otherwise ``device=None`` is passed through.
    Embeddings are stored as a sparse matrix, one file per column.
    """
    print("- Computing MPNet embeddings")

    feature_name = "mpnet"
    corpus = read(paths.corpus(ds, tokenized=False))
    model = mpnet_model()

    make_dir(paths.embedding_dir(ds, feature_name))

    device = None if use_gpu else "cpu"

    for col in cols:
        print(f"-- Computing {col} embeddings with MPNet for {ds}.")
        vectors = model.encode(corpus[col], device=device, show_progress_bar=True)
        save_npz(paths.embedding(ds, feature_name, col), csr_matrix(vectors))


def train_all_models(ds, feats, cols):
    """Runs the trainer of every requested feature for one dataset.

    Trainers always run in the fixed order listed below, regardless of the
    order of ``feats``; features missing from ``feats`` are skipped.
    """
    print(f"Training feature models for {ds}.")

    # (feature name, deferred call) pairs; the lambdas keep the lookup of
    # each trainer lazy so only requested trainers are touched
    trainers = (
        ("tfidf", lambda: train_tfidf(ds, cols)),
        ("bm25", lambda: train_bm25(ds, cols)),
        ("topic", lambda: train_lda(ds, cols)),
        ("doc2vec", lambda: train_doc2vec(ds, cols)),
        ("bertoverflow", lambda: get_bertoverflow_embeddings(ds, cols)),
        ("mpnet", lambda: get_mpnet_embeddings(ds, cols)),
    )

    for feature, run in trainers:
        if feature in feats:
            run()


def main(datasets, feats, cols):
    """Trains the requested feature models for every dataset in turn."""
    print("Training feature models")
    for dataset in datasets:
        train_all_models(dataset, feats, cols)


if __name__ == "__main__":
    # entry point of the cell: train with the module-level datasets,
    # features and text_columns
    main(datasets, features, text_columns)

Training feature models
Training feature models for gamedev_se.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for gamedev_so.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for so_samples/sample_0.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for so_samples/sample_1.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for so_samples/sample_2.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for so_samples/sample_3.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Training Doc2Vec models
Training feature models for so_samples/sample_4.
- Training TF-IDF models
- Training BM25 models
- Training LDA models
- Tra

In [None]:
import numpy as np
import pandas as pd
from multiprocessing import Pool

from utils import paths, read, save, QuestionComp
from utils.consts import datasets, features, text_columns, n_procs


def score_dup_pairs(dups, ds, f, c):
    """Scores a chunk of duplicates against every comparison question.

    ``f`` and ``c`` pick the similarity (feature + question part) used by
    QuestionComp. Returns one row per (duplicate, comparison question) pair
    holding the duplicate's id, the comparison question's id and the score.
    """
    comp_qs = read(paths.comparison_question_ids(ds))[["id", "corpus_index"]]
    similarity = QuestionComp(ds, f, c).compare(
        dups["corpus_index"], comp_qs["corpus_index"]
    )

    # swap the corpus index for the row position inside this chunk: the
    # "index" column is what selects the matching row of `similarity`
    dup_rows = (
        dups.drop(columns="corpus_index")
        .reset_index(drop=True)
        .reset_index()
        .rename(columns={"id": "dup_id"})
    )

    def attach_scores(group):
        """Returns the comparison questions with the scores of one duplicate"""
        _, row = group.name
        scored = comp_qs.copy()
        scored["score"] = similarity[row]
        return scored.drop(columns="corpus_index")

    scored_pairs = dup_rows.groupby(["dup_id", "index"]).apply(attach_scores)
    return scored_pairs.reset_index().drop(columns=["index"])


def calculate_scores(ds, f, c):
    """Scores all duplicates against all other answered/main questions
    in the dataset for a given similarity (feature + question part)

    The duplicates are cut into n_procs consecutive chunks and every chunk
    is scored by score_dup_pairs in a multiprocessing Pool; the per-chunk
    results are concatenated and returned.
    """
    dups = read(paths.duplicate_question_ids(ds))

    # split the row positions instead of the frame itself: np.array_split on
    # a DataFrame depends on the deprecated DataFrame.swapaxes, while
    # splitting positions yields exactly the same consecutive chunks
    row_chunks = np.array_split(np.arange(len(dups)), n_procs)
    jobs = [(dups.iloc[rows], ds, f, c) for rows in row_chunks]

    with Pool(n_procs) as pool:
        scored_chunks = pool.starmap(score_dup_pairs, jobs)

    return pd.concat(scored_chunks)


def rank_dup_pairs(ds, f, c):
    """Ranks every true duplicate pair among all scored pairs of its duplicate.

    Scores come from calculate_scores for the given feature + column. Only
    the rows of true duplicate pairs are kept and saved, each with its rank
    (1 = highest score) among the pairs of the same duplicate.
    """
    scores = calculate_scores(ds, f, c).rename(columns={"id": "main_id"})

    # a question is never compared with itself
    scores = scores[scores.main_id != scores.dup_id]

    known_pairs = read(paths.dup_pairs(ds))
    known_pairs["is_dup"] = True

    # left join: pairs that are not known duplicates end up with is_dup False
    scores = scores.merge(known_pairs, on=["dup_id", "main_id"], how="left")
    scores["is_dup"] = scores.is_dup.fillna(False)

    # rank inside each duplicate, highest score first
    scores["rank"] = scores.groupby("dup_id").score.rank(ascending=False)

    # "level_2" is presumably the leftover index column created by the
    # reset_index in score_dup_pairs -- it is dropped together with the label
    ranked = (
        scores[scores.is_dup]
        .reset_index(drop=True)
        .drop(columns=["is_dup", "level_2"])
    )

    save(ranked, paths.pair_ranks(ds, f, c))


def calculate_recall_rates(ds, feats, cols):
    """Computes recall-rate@5/10/20 from the saved pair ranks of a dataset.

    One row per (feature, column) combination is collected and the resulting
    table is saved.
    """

    def recall_at(ranks, k):
        """Share of duplicates with at least one true pair ranked <= k"""

        def hit(rank_values):
            return (rank_values <= k).any()

        return ranks.groupby("dup_id")["rank"].apply(hit).mean()

    rows = []

    for feat in feats:
        for col in cols:
            ranks = read(paths.pair_ranks(ds, feat, col))

            row = {"feature": feat, "col": col}
            row.update({f"recall-rate@{k}": recall_at(ranks, k) for k in [5, 10, 20]})
            rows.append(row)

    save(pd.DataFrame(rows), paths.all_pair_ranks(ds))


def dup_pair_ranks(ds, feats, columns):
    """Calculates the ranks of dup pairs for all similarity scores in the dataset

    Calls rank_dup_pairs once for every (feature, column) combination.
    Nothing is collected or returned here.
    """
    print(f"Ranking pair ranks for {ds}")
    for f in feats:
        print(f"- Computing pair ranks for {f}")
        for c in columns:
            rank_dup_pairs(ds, f, c)


def main(datasets, feats, columns):
    """Ranks the dup pairs and then computes the recall rates for every dataset"""
    for ds in datasets:
        dup_pair_ranks(ds, feats, columns)
        calculate_recall_rates(ds, feats, columns)


if __name__ == "__main__":
    # entry point of the cell: uses the datasets, features and text columns
    # imported from utils.consts
    main(datasets, features, text_columns)


Ranking pair ranks for gamedev_se
- Computing pair ranks for jaccard
- Computing pair ranks for tfidf
- Computing pair ranks for bm25
- Computing pair ranks for topic
- Computing pair ranks for doc2vec
Ranking pair ranks for gamedev_so
- Computing pair ranks for jaccard
- Computing pair ranks for tfidf
- Computing pair ranks for bm25
- Computing pair ranks for topic
- Computing pair ranks for doc2vec
Ranking pair ranks for so_samples/sample_0
- Computing pair ranks for jaccard
- Computing pair ranks for tfidf
- Computing pair ranks for bm25
- Computing pair ranks for topic
- Computing pair ranks for doc2vec
Ranking pair ranks for so_samples/sample_1
- Computing pair ranks for jaccard
- Computing pair ranks for tfidf
- Computing pair ranks for bm25
- Computing pair ranks for topic


In [1]:
import numpy as np
import pandas as pd

from utils import paths, read, save, QuestionComp
from utils.consts import datasets, n_candidates, undersampling_percentages


def select_candidates(ds, dups, comp, n):
    """Picks, for every duplicate, the n highest scoring questions of ``comp``.

    Scores come from the "tfidf" similarity over "title_body_tags_answer".
    The index of ``dups`` is used to look up each duplicate's row of scores.
    Returns one row per (dup, candidate) pair with ids, corpus indexes and
    the score.
    """
    comparer = QuestionComp(ds, "tfidf", "title_body_tags_answer")
    similarity = comparer.compare(dups["corpus_index"], comp["corpus_index"])

    # reset_index exposes each duplicate's index as the "index" column
    dup_rows = dups.rename(
        columns={"id": "dup_id", "corpus_index": "dup_corpus_index"}
    ).reset_index()

    pool = comp.rename(
        columns={"id": "candidate_id", "corpus_index": "candidate_corpus_index"}
    )

    def top_n_for_dup(group):
        """Returns the n best scoring candidates of the group's duplicate"""
        dup_id, _, row = group.name
        ranked = pool.copy()
        ranked["score"] = similarity[row]
        # a question cannot be its own candidate
        ranked = ranked[ranked.candidate_id != dup_id].copy()
        return ranked.sort_values("score", ascending=False)[:n]

    candidates = dup_rows.groupby(["dup_id", "dup_corpus_index", "index"]).apply(
        top_n_for_dup
    )

    # drop the helper "index" level, turn the group keys into columns and
    # discard the leftover candidate index ("level_2")
    candidates = (
        candidates.reset_index(level="index", drop=True)
        .reset_index()
        .drop(columns="level_2")
    )

    self_pairs = candidates[candidates.dup_id == candidates.candidate_id]
    assert len(self_pairs) == 0, "Some dup IDs are the same as their related IDs!"
    repeated = candidates[["dup_id", "candidate_id"]].duplicated().any()
    assert not repeated, "Duplicated candidates!"

    return candidates


def add_dup_labels(cands, pairs):
    """Adds labels indicating if candidate pairs are duplicates or not

    cands needs the columns dup_id and candidate_id, pairs the columns
    dup_id and main_id. Returns a copy of cands with a boolean is_dup
    column that is True when (dup_id, candidate_id) is a known
    (dup_id, main_id) pair. Neither input is modified.
    """
    # build the label table on a copy so the caller's frame is left alone;
    # repeated pairs are collapsed so the merge cannot multiply candidates
    labels = pairs[["dup_id", "main_id"]].drop_duplicates().assign(is_dup=True)

    labelled = cands.merge(
        labels,
        left_on=["dup_id", "candidate_id"],
        right_on=["dup_id", "main_id"],
        how="left",
    ).drop(columns="main_id")

    # unmatched rows carry NaN after the left join; notna() gives a real
    # bool column without depending on fillna's object downcasting
    labelled["is_dup"] = labelled["is_dup"].notna()
    return labelled


def add_noise_labels(cands, noise):
    """Adds labels indicating if candidate pairs are noise or not

    cands needs a dup_id column, noise an id column. Returns a copy of
    cands with a boolean is_noise column that is True when the pair's
    dup_id is one of the noise ids. Neither input is modified.
    """
    # build the label table on a copy so the caller's frame is left alone;
    # repeated ids are collapsed so the merge cannot multiply candidates
    noise_ids = (
        noise[["id"]]
        .rename(columns={"id": "dup_id"})
        .drop_duplicates()
        .assign(is_noise=True)
    )

    labelled = cands.merge(noise_ids, on="dup_id", how="left")

    # unmatched rows carry NaN after the left join; notna() gives a real
    # bool column without depending on fillna's object downcasting
    labelled["is_noise"] = labelled["is_noise"].notna()
    return labelled


def undersample(candidates, perc):
    """Drops false candidate pairs until true pairs make up roughly ``perc``.

    All true pairs are kept. For every true pair round(1 / perc - 1) false
    pairs are kept in total, spread as evenly as possible over the
    duplicates that have false pairs. Sampling uses random_state=42.
    """
    is_positive = candidates.is_dup
    positives = candidates[is_positive]
    negatives = candidates[~is_positive]

    # duplicates that own at least one false pair, in order of appearance
    dup_ids = negatives["dup_id"].unique()

    # total number of false pairs to keep
    total_negatives = round(1 / perc - 1) * len(positives)

    # every duplicate gets `base` samples; the first `remainder` duplicates
    # get one more, so the quotas add up to exactly total_negatives
    base, remainder = divmod(total_negatives, len(dup_ids))
    quota = {dup: base + int(pos < remainder) for pos, dup in enumerate(dup_ids)}

    def draw(group):
        """Samples the quota of false pairs assigned to the group's duplicate"""
        return group.sample(quota[group.name], random_state=42)

    sampled = negatives.groupby("dup_id").apply(draw).reset_index(drop=True)

    return pd.concat([positives, sampled]).reset_index(drop=True)


def decrease_candidates(candidates, n):
    """Keeps only the n highest scoring candidate pairs of every duplicate.

    Lets callers select candidates once for a large N and derive the sets
    for smaller Ns from that result.
    """

    def keep_best(group):
        return group.sort_values("score", ascending=False)[:n]

    trimmed = candidates.groupby("dup_id").apply(keep_best)
    return trimmed.reset_index(drop=True)


def select_train_candidates_split(ds, dups, n, undersample_perc=None):
    """Selects n train candidates pairs for the given dups of the dataset
    in chunks to save memory space
    For datasets with large numbers of candidates it is easy to
    fill up memory space

    The dups are cut into 10 consecutive chunks. Every chunk is processed
    and saved on its own by select_train_candidates_single; afterwards the
    chunk files are read back, deleted, concatenated and saved as the
    final candidate set.
    """
    splits = 10

    # split the row positions instead of the frame itself: np.array_split on
    # a DataFrame depends on the deprecated DataFrame.swapaxes, while
    # splitting positions yields exactly the same consecutive chunks
    row_chunks = np.array_split(np.arange(len(dups)), splits)

    # the chunk must not reuse the name `ds`: the dataset name is needed
    # both for the per-chunk call and for the paths below
    for i, rows in enumerate(row_chunks):
        print(i, end="\r")
        chunk = dups.iloc[rows].reset_index(drop=True)
        select_train_candidates_single(ds, chunk, n, undersample_perc, i)

    # merge candidates sampled above
    candidates = []
    for i in range(splits):
        path = paths.train_candidate_pairs(ds, n, undersample_perc, i)
        candidates.append(read(path))
        # the chunk file is only an intermediate result
        path.unlink()

    candidates = pd.concat(candidates).reset_index(drop=True)

    save(candidates, paths.train_candidate_pairs(ds, n, undersample_perc))


def select_train_candidates_single(ds, dups, n, undersample_perc=None, i=None):
    """Selects, labels and saves n train candidate pairs for the given dups.

    Candidates get an is_dup and an is_noise label and are undersampled when
    ``undersample_perc`` is given. ``i`` is forwarded to the save path so
    that chunks of candidates can be stored separately.
    """
    test_dups = read(paths.test_dup_ids(ds))
    comparison = read(paths.comparison_question_ids(ds))

    # test dups must not be offered as candidates for train dups,
    # otherwise test data leaks into the train set
    train_pool = comparison[~comparison.id.isin(test_dups.id)]

    pairs = select_candidates(ds, dups, train_pool, n)
    pairs = add_dup_labels(pairs, read(paths.dup_pairs(ds)))
    pairs = add_noise_labels(pairs, read(paths.noise_question_ids(ds)))

    if undersample_perc is not None:
        pairs = undersample(pairs, undersample_perc)

    save(pairs, paths.train_candidate_pairs(ds, n, undersample_perc, i))


def select_train_candidates(ds, n, undersample_perc=None):
    """Selects n train candidates for all train dups and noise questions.

    Sets with fewer than 5000 questions are processed in one go, larger
    ones in chunks to save memory.
    """
    train_dups = read(paths.train_dup_ids(ds))
    # noise questions are added to avoid bias
    noise = read(paths.noise_question_ids(ds))

    dups = pd.concat([train_dups, noise]).reset_index(drop=True)

    print("- Selecting train candidates")

    selector = (
        select_train_candidates_single
        if len(dups) < 5000
        else select_train_candidates_split
    )
    selector(ds, dups, n, undersample_perc)


def select_train_candidates_multi(ds, n_candidates, percentages):
    """Builds one train candidate set per (n, undersampling percentage) pair.

    Candidates are selected only once, for the largest n; the sets for the
    smaller ns are cut out of that result with decrease_candidates.
    """
    largest = max(n_candidates)
    select_train_candidates(ds, largest)

    full_set = read(paths.train_candidate_pairs(ds, largest))

    print("-- Making train sets")
    for size in n_candidates:
        reduced = decrease_candidates(full_set, size)
        for perc in percentages:
            print(f"--- {size}, {perc}")
            # only undersample when duplicates are rarer than requested
            needs_sampling = reduced.is_dup.mean() < perc
            sampled = undersample(reduced, perc) if needs_sampling else reduced

            save(sampled, paths.train_candidate_pairs(ds, size, perc))


def select_test_candidates(ds, n):
    """Selects, labels and saves n test candidates for every test dup.

    Unlike the train sets, every pair is labelled is_noise=False.
    """
    test_dups = read(paths.test_dup_ids(ds))
    comparison = read(paths.comparison_question_ids(ds))
    known_pairs = read(paths.dup_pairs(ds))

    print("- Selecting test candidates")

    pairs = add_dup_labels(
        select_candidates(ds, test_dups, comparison, n), known_pairs
    )
    # the test set contains no noise questions
    pairs["is_noise"] = False

    save(pairs, paths.test_candidate_pairs(ds))


def select_test_candidates_multi(ds, n_candidates):
    """Selects the test candidate pairs of a dataset for several values of n.

    Only the largest n is actually sampled: smaller candidate sets can be
    obtained later by limiting the candidates when evaluating classifiers.
    """
    select_test_candidates(ds, max(n_candidates))


def merge_candidates(ds, candidates, percentages):
    """Combines the test set and all train candidate sets into one dataset.

    Pairs that occur in several sets are kept once, so their features only
    have to be computed and compared a single time.
    """
    print("- Merging all sets of candidates")

    # the test set first, then one train set per (percentage, n) combination
    frames = [read(paths.test_candidate_pairs(ds))]
    frames += [
        read(paths.train_candidate_pairs(ds, c, p))
        for p in percentages
        for c in candidates
    ]

    merged = pd.concat(frames).drop_duplicates().reset_index(drop=True)

    save(merged, paths.candidate_pairs(ds))


def select_candidates_multi(ds, ns, ps):
    """Selects candidate sets for multiple ns and undersampling percentages

    Runs the train selection, the test selection and the final merge of all
    candidate sets, in that order.
    """
    print(f"Selecting candidates for {ds}.")
    select_train_candidates_multi(ds, ns, ps)
    select_test_candidates_multi(ds, ns)
    # the merge reads the sets written by the two calls above
    merge_candidates(ds, ns, ps)


def select_candidates_single(ds, n, p):
    """Selects candidate sets for a single value of n and undersampling percentage

    Runs the train selection, the test selection and the final merge of the
    candidate sets, in that order.
    """
    print(f"Selecting candidates for {ds}.")
    select_train_candidates(ds, n, p)
    select_test_candidates(ds, n)
    # merge_candidates expects lists, hence the single-element wrappers
    merge_candidates(ds, [n], [p])


def main(datasets, ns, ps):
    """Selects the candidate sets of every dataset for all ns and percentages"""
    # we select multiple candidate sets for the datasets
    # for our analysis
    for ds in datasets:
        select_candidates_multi(ds, ns, ps)


if __name__ == "__main__":
    # entry point of the cell: uses the datasets, candidate counts and
    # undersampling percentages imported from utils.consts
    main(datasets, n_candidates, undersampling_percentages)


Selecting candidates for gamedev_se.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets of candidates
Selecting candidates for gamedev_so.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets of candidates
Selecting candidates for so_samples/sample_0.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets of candidates
Selecting candidates for so_samples/sample_1.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets of candidates
Selecting candidates for so_samples/sample_2.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets of candidates
Selecting candidates for so_samples/sample_3.
- Selecting train candidates
-- Making train sets
--- 1500, 0.01
- Selecting test candidates
- Merging all sets 

In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool

from utils import paths, read, save, QuestionComp
from utils.consts import datasets, n_procs, features, text_columns


def save_features(df, ds, feature_name, proc_num):
    """Saves the pair ids together with every similarity column of ``df``.

    A column counts as a similarity column when its name contains "_sim".
    """
    save_path = paths.feature(ds, feature_name, proc_num)

    similarity_cols = [col for col in df.columns if "_sim" in col]
    selected = df[["dup_id", "candidate_id"] + similarity_cols]

    save(selected, save_path)


def calc_feature(ds, feature, cols, proc_num=None):
    """Calculates the feature values (similarity scores) for candidates from a given dataset
    proc_num serves to select a chunk of candidates as opposed to all of them

    For every column in cols a "<column>_<feature>_sim" column is added to
    the candidate pairs; the result is then passed to save_features.
    """

    def compare_pairs(df, qc, f, c):
        """Adds the similarity column for the candidate pairs of one duplicate"""
        # df holds the pairs of a single duplicate; the group key carries the
        # duplicate's id and its corpus index
        dup_id, dup_index = df.name
        cand_indexes = df["candidate_corpus_index"]
        # the group is modified in place and returned
        df[f"{c}_{f}_sim"] = qc.compare(dup_index, cand_indexes)
        return df

    candidates = read(paths.candidate_pairs(ds, proc_num))

    for c in cols:
        # one comparer per (feature, column) combination
        qc = QuestionComp(ds, feature, c)
        f = lambda df: compare_pairs(df, qc, feature, c)
        candidates = candidates.groupby(["dup_id", "dup_corpus_index"]).apply(f)
        # flatten the index again so the next column starts from plain rows
        candidates = candidates.reset_index(drop=True)

    save_features(candidates, ds, feature, proc_num)


def split_candidates(ds, n_procs):
    """Writes the candidate pairs of a dataset as n_procs separate chunks.

    The duplicate ids are shuffled and divided into n_procs groups; all
    pairs of a duplicate land in the same chunk, so the chunks can be
    processed independently during feature calculation.
    """
    candidates = read(paths.candidate_pairs(ds))

    dup_ids = candidates["dup_id"].unique()
    np.random.shuffle(dup_ids)

    for chunk_no, chunk_ids in enumerate(np.array_split(dup_ids, n_procs)):
        chunk = candidates[candidates.dup_id.isin(chunk_ids)]
        save(chunk, paths.candidate_pairs(ds, chunk_no))


def merge_datasets(ds, n_procs, feats):
    """Joins the per-chunk feature files into one file per feature.

    After merging, the feature chunks and the candidate chunks are deleted
    together with the directories that held them.
    """
    chunk_ids = range(n_procs)

    for feat in feats:
        parts = [read(paths.feature(ds, feat, k)) for k in chunk_ids]
        save(pd.concat(parts).reset_index(drop=True), paths.feature(ds, feat))

        # the chunks are no longer needed once the merged file exists
        for k in chunk_ids:
            paths.feature(ds, feat, k).unlink()
        # remove the directory of the chunks, which is empty by now
        paths.feature(ds, feat, 0).parent.rmdir()

    # same clean-up for the candidate chunks
    for k in chunk_ids:
        paths.candidate_pairs(ds, k).unlink()
    paths.candidate_pairs(ds, 0).parent.rmdir()


def calc_features(ds, feats, cols):
    """Computes every feature of a dataset, one candidate chunk per process.

    The candidates are split into n_procs chunks, each feature is computed
    for all chunks in a Pool, and the chunk results are merged at the end.
    """
    print(f"Started computing the features for {ds}.")
    split_candidates(ds, n_procs)

    for feat in feats:
        print(f"- Computing {feat} features for {ds}.")
        jobs = [(ds, feat, cols, chunk_no) for chunk_no in range(n_procs)]

        with Pool(n_procs) as pool:
            pool.starmap(calc_feature, jobs)

    merge_datasets(ds, n_procs, feats)


def main(datasets, feats, cols):
    """Calculates the given features over the given columns for every dataset"""
    for ds in datasets:
        calc_features(ds, feats, cols)


if __name__ == "__main__":
    # entry point of the cell: uses the datasets, features and text columns
    # imported from utils.consts
    main(datasets, features, text_columns)


Started computing the features for gamedev_se.
- Computing jaccard features for gamedev_se.
- Computing tfidf features for gamedev_se.
- Computing bm25 features for gamedev_se.
- Computing topic features for gamedev_se.
- Computing doc2vec features for gamedev_se.
Started computing the features for gamedev_so.
- Computing jaccard features for gamedev_so.
- Computing tfidf features for gamedev_so.
- Computing bm25 features for gamedev_so.
- Computing topic features for gamedev_so.
- Computing doc2vec features for gamedev_so.
Started computing the features for so_samples/sample_0.
- Computing jaccard features for so_samples/sample_0.
- Computing tfidf features for so_samples/sample_0.
- Computing bm25 features for so_samples/sample_0.
- Computing topic features for so_samples/sample_0.
- Computing doc2vec features for so_samples/sample_0.
Started computing the features for so_samples/sample_1.
- Computing jaccard features for so_samples/sample_1.
- Computing tfidf features for so_samples

In [2]:
import pandas as pd
import numpy as np

from utils import paths, read, save
from utils.consts import features, datasets, n_candidates, undersampling_percentages


def check_features(feats, cands):
    """Tells whether the merged features still describe the same candidates.

    True when both frames have the same number of rows, the same set of
    dup ids and the same set of candidate ids (e.g. no repeated entries
    were introduced by merging).
    """
    checks = [
        len(cands) == len(feats),
        set(cands.dup_id) == set(feats.dup_id),
        set(cands.candidate_id) == set(feats.candidate_id),
    ]
    return all(checks)


def make_test(ds, feats):
    """Builds and saves the test set: test candidate pairs plus their features.

    Fails when merging changed the set of candidates or left NaNs behind.
    """
    candidates = read(paths.test_candidate_pairs(ds))
    test_set = merge_features(ds, candidates.copy(), feats)

    unchanged = check_features(test_set, candidates)
    assert unchanged, "The test dataset is different from its candidates!"
    has_nans = test_set.isna().any().any()
    assert not has_nans, "The test dataset has NaNs!"

    save(test_set, paths.test_set(ds))


def make_train(ds, n, p, feats):
    """Builds and saves the train set for n candidates and percentage p.

    The features are joined to the matching train candidate pairs. Fails
    when merging changed the set of candidates or left NaNs behind.
    """
    candidates = read(paths.train_candidate_pairs(ds, n, p))
    train_set = merge_features(ds, candidates.copy(), feats)

    unchanged = check_features(train_set, candidates)
    assert (
        unchanged
    ), f"The train dataset ({n}, {p}) is different from its candidates!"
    has_nans = train_set.isna().any().any()
    assert not has_nans, f"The train dataset ({n}, {p}) has NaNs!"

    save(train_set, paths.train_set(ds, n, p))


def merge_features(ds, cands, feats):
    """Joins the saved features of a dataset onto its candidate pairs.

    Only the ids, the score and the is_dup label of the candidates are
    kept; the score is used later to truncate the test set so classifiers
    can be evaluated on a different number of candidates.
    """
    merged = cands[["dup_id", "candidate_id", "score", "is_dup"]]
    for feat in feats:
        feature_table = read(paths.feature(ds, feat))
        merged = merged.merge(
            feature_table, on=["candidate_id", "dup_id"], how="left"
        )
    return merged


def make_train_multi(ds, ns, ps, feats):
    """Creates one train set per (number of candidates, percentage) pair.

    The percentages are the innermost loop, i.e. all ps are handled for the
    first n before moving on to the next n.
    """
    combinations = [(n, p) for n in ns for p in ps]
    for n, p in combinations:
        print(f"- Making train set ({n}, {p})")
        make_train(ds, n, p, feats)


def make_sets_multi(datasets, ns, ps, feats):
    """Creates train and test sets of features for all combinations of candidate pairs and
    undersampling percentages for the given datasets

    For each dataset all train sets are built first, then the single test set.
    """
    for ds in datasets:
        print(f"Creating train and test sets for {ds}")
        make_train_multi(ds, ns, ps, feats)
        make_test(ds, feats)


def make_sets(datasets, n, p, feats):
    """Creates train and test sets of features for one value of candidate pairs and
    undersampling percentages for the given datasets

    For each dataset the train set is built first, then the test set.
    """
    for ds in datasets:
        print(f"Creating train and test sets for {ds}")
        make_train(ds, n, p, feats)
        make_test(ds, feats)


if __name__ == "__main__":
    # entry point of the cell: builds the sets for every combination of the
    # candidate counts and undersampling percentages from utils.consts
    make_sets_multi(datasets, n_candidates, undersampling_percentages, features)


Creating train and test sets for gamedev_se
- Making train set (1500, 0.01)
Creating train and test sets for gamedev_so
- Making train set (1500, 0.01)
Creating train and test sets for so_samples/sample_0
- Making train set (1500, 0.01)
Creating train and test sets for so_samples/sample_1
- Making train set (1500, 0.01)
Creating train and test sets for so_samples/sample_2
- Making train set (1500, 0.01)
Creating train and test sets for so_samples/sample_3
- Making train set (1500, 0.01)
Creating train and test sets for so_samples/sample_4
- Making train set (1500, 0.01)


In [3]:
import numpy as np
import pandas as pd

from utils import paths, read, save
from utils.models.hp_tuning import tune_train_set
from utils.consts import datasets, n_candidates, undersampling_percentages


def random_forest_search(ds, c, p):
    """Tunes hyperparameters on the train set with c candidates and percentage p.

    The ``cv_results_`` of the search are saved as a dataframe; nothing is
    returned.
    """
    train = read(paths.train_set(ds, c, p))
    # NOTE(review): the 5 is presumably the number of CV folds -- confirm
    # against tune_train_set
    search = tune_train_set(train, 5)
    save(pd.DataFrame(search.cv_results_), paths.cv_results(ds, c, p))


def tune_multiple_sets(datasets, ns, ps):
    """Tunes hyperparameters for every train set of every dataset

    Runs random_forest_search for each combination of dataset,
    undersampling percentage (ps) and number of candidates (ns). The
    search saves its own results, so nothing is collected here.
    """
    for ds in datasets:
        print(f"Tuning hyperparameters for {ds}")
        for p in ps:
            for n in ns:
                print(
                    f"- Tuning HPs for the train set with {n} candidates, {p} percent dups"
                )
                random_forest_search(ds, n, p)


if __name__ == "__main__":
    # entry point of the cell: uses the datasets, candidate counts and
    # undersampling percentages imported from utils.consts
    tune_multiple_sets(datasets, n_candidates, undersampling_percentages)


Tuning hyperparameters for gamedev_se
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning hyperparameters for gamedev_so
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning hyperparameters for so_samples/sample_0
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning hyperparameters for so_samples/sample_1
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning hyperparameters for so_samples/sample_2
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Tuning hyperparameters for so_samples/sample_3
- Tuning HPs for the train set with 1500 candidates, 0.01 percent dups
Fitting 5 folds

In [4]:
import json
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from utils import paths, read, save, make_dir
from utils.models import get_X_y
from utils.models.scoring import multiple_k_scorer, predict_probabilities
from utils.consts import (
    datasets,
    n_procs,
    n_candidates,
    undersampling_percentages,
    best_candidates,
    best_undersampling,
)


def best_classifier(ds, c, p):
    """Reads the HP tuning results and returns the classifier
    that achieved the highest score

    The parameters are taken from the first row whose "rank_test_rr@5" is 1
    and used to build an unfitted RandomForestClassifier running on
    n_procs jobs.
    """
    df = read(paths.cv_results(ds, c, p))
    # manual analysis suggests that rr@5 was a good metric
    # for choosing the best classifier
    params = df[df["rank_test_rr@5"] == 1].iloc[0]["params"]

    # the dict objects in the dataframe are sometimes stored as JSON;
    # decode them whether the JSON arrives as bytes or as text
    if isinstance(params, (bytes, bytearray, str)):
        params = json.loads(params)

    return RandomForestClassifier(n_jobs=n_procs, **params)


def train_best_classifier(ds, c, p):
    """Fits the best classifier found by HP tuning and stores it with joblib.

    The classifier is trained on the train set with c candidates and
    undersampling percentage p of the given dataset.
    """
    classifier = best_classifier(ds, c, p)

    X, y = get_X_y(read(paths.train_set(ds, c, p)))
    fitted = classifier.fit(X, y)

    make_dir(paths.classifiers_dir(ds))
    joblib.dump(fitted, paths.classifier(ds, c, p))

    
def limit_candidates(df, c):
    """Keeps only the c highest scoring candidate pairs of every duplicate"""

    def keep_best(group):
        return group.sort_values("score", ascending=False)[:c]

    limited = df.groupby("dup_id").apply(keep_best)
    return limited.reset_index(drop=True)

def score(ds, c, p, test_on=None):
    """Evaluates the saved classifier of a dataset on a test set.

    The classifier trained on ``ds`` (c candidates, percentage p) is loaded
    and scored on the test set of ``test_on``, or on the test set of ``ds``
    itself when ``test_on`` is None. The test set is first limited to c
    candidates per duplicate. The returned scores also carry
    "dataset_train", "candidates" and, only when ``test_on`` is given,
    "dataset_test".
    """
    dataset_test = ds if test_on is None else test_on

    rf = joblib.load(paths.classifier(ds, c, p))

    test = limit_candidates(read(paths.test_set(dataset_test)), c)
    X, y = get_X_y(test)

    scores = multiple_k_scorer(rf, X, y)
    scores["dataset_train"] = ds
    scores["candidates"] = c

    if test_on is not None:
        scores["dataset_test"] = test_on

    return scores


def candidate_performance(datasets, ns, p):
    """Scores every dataset on its own test set for each number of candidates.

    One row per (dataset, n) is collected and the summary table is saved.
    """
    rows = []
    for ds in datasets:
        print(f"Evaluating classifiers on different candidates for {ds}")
        for n in ns:
            print(f"- {n} candidates")
            rows.append(score(ds, n, p))

    save(pd.DataFrame(rows), paths.candidates_evaluation())


def cross_dataset_performance(datasets, n, p):
    """Scores the classifier of every dataset on the test set of every dataset
    (itself included) and saves the summary sorted by test dataset"""
    results = []

    print("Evaluating classifiers cross-datasets")

    for train_ds in datasets:
        for test_ds in datasets:
            print(f"- Training on: {train_ds}; testing on: {test_ds}")
            results.append(score(train_ds, n, p, test_on=test_ds))

    # only the dataset names and the reciprocal rank metrics are kept
    columns = ["dataset_train", "dataset_test", "rr@5", "rr@10", "rr@20"]
    summary = pd.DataFrame(results)[columns].sort_values('dataset_test')

    save(summary, paths.cross_dataset_performance())
    
def misclassified_dups(ds, c, p, top_k=20):
    """Makes a dataset of the duplicate questions that do not have their main question
    in the ``top_k`` top ranked pairs for the dataset and saves it.

    Parameters
    ----------
    ds : str
        Dataset whose classifier and test set are used.
    c : int
        Number of candidates per duplicate; selects the classifier and limits the test set.
    p : float
        Undersampling percentage; selects the classifier.
    top_k : int
        Number of top ranked pairs in which the true pair has to appear for a
        duplicate to count as correctly classified. Defaults to 20.
    """
    def select_misclassified(df, dups):
        """Selects the true pair and the top ranked pair for each misclassified duplicate"""
        dup = df.name
        df = df.sort_values('pred', ascending=False)
        if df[:top_k]['is_dup'].any():
            # correctly classified = nothing to return
            return pd.DataFrame()

        # if the dup is in the list of candidates at all
        has_dup = df.is_dup.any()

        if has_dup:
            true_dup = df[df.is_dup].iloc[0]['candidate_id']
        else:
            # the true pair was never a candidate: look it up in the known pairs
            true_dup = dups[dups.dup_id == dup].iloc[0]['main_id']

        top_ranked = df[~df.is_dup].iloc[0]['candidate_id']

        return pd.DataFrame([{'main_id': true_dup, 'top_ranked': top_ranked, 'has_dup': has_dup}])

    print(f'- Selecting misclassified duplicates for {ds}')

    rf = joblib.load(paths.classifier(ds, c, p))

    test = read(paths.test_set(ds))
    test = limit_candidates(test, c)
    X, y = get_X_y(test)

    test = predict_probabilities(rf, X, y)

    dup_pairs = read(paths.dup_pairs(ds))

    missed = test.groupby('dup_id').apply(lambda df: select_misclassified(df, dup_pairs))

    if len(missed) > 0:
        missed = missed.reset_index()
        missed['has_dup'] = missed.has_dup.apply(bool)

        missed = missed.drop(columns=['level_1'])

        # adds URLs for easier analysis
        if ds == 'gamedev_se':
            to_url = lambda i: f'https://gamedev.stackexchange.com/questions/{i}/'
        else:
            to_url = lambda i: f'https://stackoverflow.com/questions/{i}/'

        # a dedicated loop variable keeps the ``c`` parameter intact; the list
        # is built up front because the loop adds columns to ``missed``
        for col in [col for col in missed.columns if col != 'has_dup']:
            missed[col + '_url'] = missed[col].apply(to_url)

    # The output name is resolved outside the ``if`` so that an empty result is
    # saved under the same file name as a non-empty one.
    output_name = ds if 'gamedev' in ds else 'so_sample'

    save(missed, paths.misclassified_duplicates(output_name))


def train_classifiers(ds, ns, ps):
    """Trains one classifier for ``ds`` per combination of number of
    candidates (``ns``) and undersampling percentage (``ps``)
    """
    print(f"Training classifiers for {ds}.")
    combinations = ((n, p) for n in ns for p in ps)
    for n, p in combinations:
        print(f"- Training classifiers for {n} candidates")
        train_best_classifier(ds, n, p)


def main(datasets, ns, ps, best_n, best_p):
    """Trains the classifiers of every dataset, then runs the misclassified
    duplicates selection and the two performance evaluations with the
    ``best_n`` / ``best_p`` settings"""
    for ds in datasets:
        train_classifiers(ds, ns, ps)

    # misclassified duplicates are only extracted for these three datasets
    inspected = ('gamedev_se', 'gamedev_so', 'so_samples/sample_0')
    for ds in inspected:
        misclassified_dups(ds, best_n, best_p)

    candidate_performance(datasets, ns, best_p)
    cross_dataset_performance(datasets, best_n, best_p)


if __name__ == "__main__":
    # runs the whole evaluation with the configured datasets and settings
    main(
        datasets=datasets,
        ns=n_candidates,
        ps=undersampling_percentages,
        best_n=best_candidates,
        best_p=best_undersampling,
    )


Training classifiers for gamedev_se.
- Training classifiers for 1500 candidates
Training classifiers for gamedev_so.
- Training classifiers for 1500 candidates
Training classifiers for so_samples/sample_0.
- Training classifiers for 1500 candidates
Training classifiers for so_samples/sample_1.
- Training classifiers for 1500 candidates
Training classifiers for so_samples/sample_2.
- Training classifiers for 1500 candidates
Training classifiers for so_samples/sample_3.
- Training classifiers for 1500 candidates
Training classifiers for so_samples/sample_4.
- Training classifiers for 1500 candidates
[CV] END bootstrap=False, class_weight=None, max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=205, random_state=42; total time=  46.2s
[CV] END bootstrap=False, class_weight=None, max_depth=8, min_samples_leaf=2, min_samples_split=10, n_estimators=205, random_state=42; total time=   0.2s
[CV] END bootstrap=False, class_weight=None, max_depth=8, min_samples_leaf=2, min_sampl

- Training on: so_samples/sample_1; testing on: so_samples/sample_1
- Training on: so_samples/sample_1; testing on: so_samples/sample_2
- Training on: so_samples/sample_1; testing on: so_samples/sample_3
- Training on: so_samples/sample_1; testing on: so_samples/sample_4
- Training on: so_samples/sample_2; testing on: gamedev_se
- Training on: so_samples/sample_2; testing on: gamedev_so
- Training on: so_samples/sample_2; testing on: so_samples/sample_0
- Training on: so_samples/sample_2; testing on: so_samples/sample_1
- Training on: so_samples/sample_2; testing on: so_samples/sample_2
- Training on: so_samples/sample_2; testing on: so_samples/sample_3
- Training on: so_samples/sample_2; testing on: so_samples/sample_4
- Training on: so_samples/sample_3; testing on: gamedev_se
- Training on: so_samples/sample_3; testing on: gamedev_so
- Training on: so_samples/sample_3; testing on: so_samples/sample_0
- Training on: so_samples/sample_3; testing on: so_samples/sample_1
- Training on: s

In [5]:
import os
from pathlib import Path


def root_dir():
    """Returns the part of the absolute working directory that precedes the
    last occurrence of "code" (the whole directory if "code" never occurs)."""
    cwd = os.path.abspath(os.getcwd())
    prefix, *_ = cwd.rsplit("code", 1)
    return Path(prefix)


def percent_to_string(p):
    """Formats a fraction as a percentage string usable inside file names.

    Positive values are multiplied by 100 and reduced to a whole number
    (``0.01`` -> ``"1"``). Other values are rendered with ``str`` and any
    decimal point is replaced by an underscore (``0.0`` -> ``"0_0"``).

    Parameters
    ----------
    p : int or float
        Fraction to format, e.g. ``0.29`` for 29 %.

    Returns
    -------
    str
        The percentage as text.
    """
    p = p * 100
    if p > 0:
        # Round away binary floating point noise before truncating:
        # 0.29 * 100 == 28.999999999999996, which int() alone would turn
        # into 28 and thereby collide with the name generated for 0.28.
        p = int(round(p, 9))
    return str(p).replace(".", "_")


####################
## Data dirs      ##
####################


def data_dir():
    """Path of the ``data`` directory under the project root."""
    return root_dir().joinpath("data")


def dataset_dir(ds):
    """Path of the directory of dataset ``ds`` inside the data directory."""
    return data_dir().joinpath(ds)


# Raw data


def raw_dir(ds):
    """Path of the ``raw`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("raw")


def posts_xml(ds, i=None):
    """Path of ``Posts.xml`` in the raw directory of ``ds``.

    ``i`` is accepted but not used.
    """
    return raw_dir(ds).joinpath("Posts.xml")


def post_links_xml(ds):
    """Path of ``PostLinks.xml`` in the raw directory of ``ds``."""
    return raw_dir(ds).joinpath("PostLinks.xml")


def questions_xml(ds, i=None):
    """Path of the questions XML file in the raw directory of ``ds``.

    Without ``i`` the file is ``questions.xml``, otherwise ``questions_{i}.xml``.
    """
    filename = "questions.xml" if i is None else f"questions_{i}.xml"
    return raw_dir(ds) / filename


def answers_xml(ds, i=None):
    """Path of the answers XML file in the raw directory of ``ds``.

    Without ``i`` the file is ``answers.xml``, otherwise ``answers_{i}.xml``.
    """
    filename = "answers.xml" if i is None else f"answers_{i}.xml"
    return raw_dir(ds) / filename


# Question IDs


def ids_dir(ds):
    """Path of the ``question_ids`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("question_ids")


def all_question_ids(ds):
    """Path of ``all_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("all_question_ids.parquet")


def accepted_answer_ids(ds):
    """Path of ``accepted_answers.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("accepted_answers.parquet")


def dup_pairs(ds):
    """Path of ``dup_pairs.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("dup_pairs.parquet")


def duplicate_question_ids(ds):
    """Path of ``duplicate_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("duplicate_question_ids.parquet")


def main_question_ids(ds):
    """Path of ``main_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("main_question_ids.parquet")


def noise_question_ids(ds):
    """Path of ``noise_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("noise_question_ids.parquet")


def comparison_question_ids(ds):
    """Path of ``comparison_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("comparison_question_ids.parquet")


def answered_question_ids(ds):
    """Path of ``answered_question_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("answered_question_ids.parquet")


def train_dup_ids(ds):
    """Path of ``train_dup_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("train_dup_ids.parquet")


def test_dup_ids(ds):
    """Path of ``test_dup_ids.parquet`` in the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("test_dup_ids.parquet")


# Corpus


def corpus_dir(ds):
    """Path of the ``corpus`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("corpus")


def question_texts(ds, i=None):
    """Path of the question texts file in the corpus directory of ``ds``.

    Without ``i`` the file is ``question_texts.parquet``, otherwise
    ``question_texts_{i}.parquet``.
    """
    filename = (
        "question_texts.parquet" if i is None else f"question_texts_{i}.parquet"
    )
    return corpus_dir(ds) / filename


def answer_texts(ds, i=None):
    """Path of the answer texts file in the corpus directory of ``ds``.

    Without ``i`` the file is ``answer_texts.parquet``, otherwise
    ``answer_texts_{i}.parquet``.
    """
    filename = "answer_texts.parquet" if i is None else f"answer_texts_{i}.parquet"
    return corpus_dir(ds) / filename


def corpus(ds, tokenized=True):
    """Path of the corpus file of ``ds``: ``corpus_tokenized.parquet`` when
    ``tokenized`` is truthy, ``corpus.parquet`` otherwise."""
    filename = "corpus_tokenized.parquet" if tokenized else "corpus.parquet"
    return corpus_dir(ds) / filename


# Embeddings


def embeddings_dir(ds):
    """Path of the ``embeddings`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("embeddings")


def embedding_dir(ds, m):
    """Path of the sub-directory ``m`` inside the embeddings directory of ``ds``."""
    return embeddings_dir(ds).joinpath(m)


def embedding(ds, m, c):
    """Path of the file ``{c}.{m}.npz`` inside the embedding directory of
    ``m`` for dataset ``ds``."""
    return embedding_dir(ds, m).joinpath(f"{c}.{m}.npz")


# Features


def features_dir(ds):
    """Path of the ``features`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("features")


def feature(ds, f, i=None):
    """Path of a feature file of dataset ``ds``.

    Without ``i`` the file is ``{f}.parquet`` in the features directory;
    with ``i`` it is ``{i}.parquet`` inside the sub-directory ``f``.
    """
    base = features_dir(ds)
    if i is not None:
        return base / f / f"{i}.parquet"
    return base / f"{f}.parquet"


# Candidates


def candidate_pairs_dir(ds):
    """Path of the ``candidate_pairs`` directory inside the ids directory of ``ds``."""
    return ids_dir(ds).joinpath("candidate_pairs")


def candidate_pairs(ds, i=None):
    """Path of a candidate pairs file of dataset ``ds``.

    Without ``i`` the file is ``candidate_pairs.parquet``; with ``i`` it is
    ``candidate_pairs_{i}.parquet`` inside the ``split`` sub-directory.
    """
    base = candidate_pairs_dir(ds)
    if i is not None:
        return base / "split" / f"candidate_pairs_{i}.parquet"
    return base / "candidate_pairs.parquet"


def train_candidate_pairs(ds, c, p=None, i=None):
    """Path of a train candidate pairs file of dataset ``ds``.

    The file name starts with ``train_candidate_pairs_{c}_candidates`` and
    gets a ``_{p}_perc_dups`` suffix when ``p`` is given (formatted with
    ``percent_to_string``) and a ``_{i}`` suffix when ``i`` is given.
    """
    parts = [f"train_candidate_pairs_{c}_candidates"]

    if p is not None:
        parts.append(f"_{percent_to_string(p)}_perc_dups")
    if i is not None:
        parts.append(f"_{i}")

    parts.append(".parquet")

    return candidate_pairs_dir(ds) / "".join(parts)


def test_candidate_pairs(ds):
    """Path of ``test_candidate_pairs.parquet`` in the candidate pairs directory of ``ds``."""
    return candidate_pairs_dir(ds).joinpath("test_candidate_pairs.parquet")


# Train/test sets


def train_sets_dir(ds):
    """Path of the ``train_sets`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("train_sets")


def test_sets_dir(ds):
    """Path of the ``test_sets`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("test_sets")


def train_set(ds, c, p, i=None):
    """Path of the train set of ``ds`` for ``c`` candidates and undersampling
    percentage ``p`` (formatted with ``percent_to_string``).

    ``i`` is accepted but not used.
    """
    perc = percent_to_string(p)
    filename = f"train_{c}_candidates_{perc}_perc_dups.parquet"
    return train_sets_dir(ds) / filename


def test_set(ds):
    """Path of ``test.parquet`` in the test sets directory of ``ds``."""
    return test_sets_dir(ds).joinpath("test.parquet")


# Cross-val results


def cv_results_dir(ds):
    """Path of the ``cv_results`` directory of dataset ``ds``."""
    return dataset_dir(ds).joinpath("cv_results")


def cv_results(ds, c, p):
    """Path of the cross-validation results of ``ds`` for ``c`` candidates and
    undersampling percentage ``p`` (formatted with ``percent_to_string``)."""
    perc = percent_to_string(p)
    filename = f"cv_results_{c}_candidates_{perc}_perc_dups.parquet"
    return cv_results_dir(ds) / filename


####################
## Model dirs     ##
####################


def models_dir():
    """Path of the ``models`` directory under the project root."""
    return root_dir().joinpath("models")


def dataset_models_dir(ds):
    """Path of the directory of dataset ``ds`` inside the models directory."""
    return models_dir().joinpath(ds)


# Feature models


def feature_model_dir(ds, m):
    """Path of the sub-directory ``m`` inside the models directory of ``ds``."""
    return dataset_models_dir(ds).joinpath(m)


def feature_model(ds, m, c):
    """Path of the file ``{c}.{m}`` inside the feature model directory of
    ``m`` for dataset ``ds``."""
    return feature_model_dir(ds, m).joinpath(f"{c}.{m}")


# Classifiers


def classifiers_dir(ds):
    """Path of the ``classifiers`` directory inside the models directory of ``ds``."""
    return dataset_models_dir(ds).joinpath("classifiers")


def classifier(ds, c, p):
    """Path of the joblib file of the classifier of ``ds`` for ``c`` candidates
    and undersampling percentage ``p`` (formatted with ``percent_to_string``)."""
    perc = percent_to_string(p)
    filename = f"classifier_{c}_candidates_{perc}_perc_dups.joblib"
    return classifiers_dir(ds) / filename


####################
## Analysis dirs  ##
####################


def analysis_dir():
    """Path of the ``analysis`` directory inside the data directory."""
    return data_dir().joinpath("analysis")


def analysis_file(f):
    """Path of the file ``f`` inside the analysis directory."""
    return analysis_dir().joinpath(f)


def candidates_evaluation():
    """Path of ``candidates_evaluation.parquet`` inside the analysis directory."""
    filename = "candidates_evaluation.parquet"
    return analysis_file(filename)


def cross_dataset_performance():
    """Path of ``cross_dataset_performance.parquet`` inside the analysis directory."""
    filename = "cross_dataset_performance.parquet"
    return analysis_file(filename)


# Misclassified duplicates


def misclassified_duplicates_dir():
    """Path of the ``misclassified_duplicates`` directory inside the analysis directory."""
    return analysis_dir().joinpath("misclassified_duplicates")


def misclassified_duplicates(ds):
    """Path of ``{ds}_misclassified.parquet`` inside the misclassified
    duplicates directory."""
    return misclassified_duplicates_dir().joinpath(f"{ds}_misclassified.parquet")


# Question pair ranks


def pair_ranks_dir():
    """Path of the ``duplicate_pair_ranks`` directory inside the analysis directory."""
    return analysis_dir().joinpath("duplicate_pair_ranks")


def pair_ranks_dataset_dir(ds):
    """Path of the directory of dataset ``ds`` inside the pair ranks directory."""
    return pair_ranks_dir().joinpath(ds)


def pair_ranks(ds, m, c):
    """Path of ``{m}_{c}.parquet`` inside the pair ranks directory of ``ds``."""
    return pair_ranks_dataset_dir(ds).joinpath(f"{m}_{c}.parquet")


def all_pair_ranks(ds):
    """Path of ``all_pair_ranks.parquet`` inside the pair ranks directory of ``ds``."""
    return pair_ranks_dataset_dir(ds).joinpath("all_pair_ranks.parquet")


In [6]:
import multiprocessing

# Stack Overflow tags used for selecting game development questions
gamedev_tags = [
    "game-engine",
    "game-physics",
    "game-development",
    "gameobject",
    "2d-games",
    "unreal-engine4",
    "unreal-blueprint",
    "unreal-development-kit",
    "unrealscript",
    "unity3d",
    "unity5",
    "unity5.3",
    "unity3d-mecanim",
    "unity3d-terrain",
    "unityscript",
    "unity3d-2dtools",
    "unity3d-unet",
    "unity-webgl",
    "unity2d",
    "unity-editor",
    "unity3d-editor",
    "unity-networking",
    "unity3d-gui",
    "unity-ui",
    "unity3d-5",
]

# Seeds used for sampling Stack Overflow questions
so_sample_seeds = [5129, 1011, 3692, 2420, 5815]

# Names of the Stack Overflow samples: one per seed, numbered by seed position
so_samples = [f"so_samples/sample_{i}" for i in range(len(so_sample_seeds))]

gamedev_datasets = ["gamedev_se", "gamedev_so"]

# Every dataset: the gamedev ones first, followed by the Stack Overflow samples
datasets = [*gamedev_datasets, *so_samples]

# Similarity measures / features used in the classifiers
features = [
    "jaccard",
    "tfidf",
    "bm25",
    "topic",
    "doc2vec",
    # currently disabled:
    # 'bertoverflow',
    # 'mpnet'
]

# Names of the text columns
# (presumably the corpus columns the features are computed on; TODO confirm)
text_columns = [
    "title",
    "body",
    "tags",
    "title_body",
    "title_body_tags",
    "title_body_tags_answer",
]

# number of CPU cores to use
n_procs = 1

# percentage of fake duplicate questions in the train set
noise_percentage = 0.2

# numbers of candidates used for training and evaluating classifiers
n_candidates = [1500]

# percentages used for undersampling the train set
undersampling_percentages = [0.01]

# percentage of the train/test split
split_percentage = 0.2

# best values we found for the undersampling / number of selected candidates
best_undersampling = 0.01
best_candidates = 1500

# number of iterations used during random HP tuning
search_n_iters = 1


In [7]:
import pandas as pd

read = pd.read_parquet


def save(df, path):
    """Writes ``df`` to ``path`` as parquet.

    When nothing exists at ``path`` yet, its parent directory is created
    first.
    """
    target_missing = not path.exists()
    if target_missing:
        make_dir(path.parent)
    df.to_parquet(path)


def make_dir(path):
    """Creates the directory ``path`` together with any missing parents.

    Does nothing when the directory already exists.
    """
    path.mkdir(exist_ok=True, parents=True)

In [8]:
import numpy as np
from scipy.sparse import load_npz
from scipy.special import rel_entr
from scipy.spatial.distance import cdist
from gensim.matutils import jaccard_distance
from sklearn.metrics.pairwise import cosine_similarity

from . import paths as paths
from . import read, save
from .models.bm25 import BM25


class QuestionComp:
    """Class for comparing sets of questions using different similarity measures

    The similarity function is chosen from ``model_name``: ``"jaccard"``,
    ``"bm25"`` and ``"lda"`` have dedicated implementations, every other model
    name falls back to cosine similarity over the loaded embedding matrix.
    """

    def __init__(self, dataset, model_name, column):
        """Stores the dataset/model/column triple and loads its representation."""
        self.dataset = dataset
        self.model_name = model_name
        self.column = column
        self.embeddings = None
        self.load_embedding()

    def load_embedding(self):
        """Loads the document representations for a dataset/model/column
        using an appropriate method and stores them in ``self.embeddings``:
        a ``BM25`` model for "bm25", a series of word sets indexed by
        ``corpus_index`` for "jaccard", and an ``.npz`` matrix otherwise
        """
        if self.model_name == "bm25":
            emb = BM25.load(
                paths.feature_model(self.dataset, self.model_name, self.column)
            )
        elif self.model_name == "jaccard":
            # creates sets of words
            emb = read(paths.corpus(self.dataset))[[self.column, "corpus_index"]]
            emb = emb.set_index("corpus_index")[self.column].apply(set)
        else:
            emb = load_npz(paths.embedding(self.dataset, self.model_name, self.column))

            # lda embeddings are not arrays
            if self.model_name == "lda":
                emb = emb.toarray()
        self.embeddings = emb

    def topic_sim(self, indexes, others):
        """Calculates topic similarity between sets of documents represented by indexes
        Each index indicates the position of the document in the corpus
        and serves to select its representation in the embedding matrix
        """
        embedding = self.embeddings[indexes]
        other_embeddings = self.embeddings[others]
        distances = cdist(embedding, other_embeddings, metric="jensenshannon")
        # converting distance to similarity
        return 1 - distances

    def bm25_sim(self, indexes, others):
        """Calculates BM25 scores between sets of documents represented by indexes
        Each index indicates the position of the document in the corpus
        and serves to select its representation in the corpus representation
        in the BM25 class
        """
        return [self.embeddings.compare_documents(i, others) for i in indexes]

    def jac_sim(self, indexes, others):
        """Calculates Jaccard similarities between sets of documents represented by indexes
        Each index indicates the position of the document in the corpus
        and serves to select the corresponding set of words
        """
        # the word sets of ``others`` do not depend on the compared index,
        # so they are selected once instead of once per index
        other_embeddings = self.embeddings.loc[others]

        def compare(i):
            embedding = self.embeddings.loc[i]
            return other_embeddings.apply(
                lambda t: 1.0 - jaccard_distance(t, embedding)
            ).to_list()

        return [compare(i) for i in indexes]

    def cosine_sim(self, indexes, others):
        """Calculates cosine similarity between sets of documents represented by indexes
        Each index indicates the position of the document in the corpus
        and serves to select its representation in the embedding matrix
        This function is used for TF-IDF, Doc2Vec, MPNet and BertOverflow similarities
        """
        embedding = self.embeddings[indexes]
        other_embeddings = self.embeddings[others]
        return cosine_similarity(embedding, other_embeddings)

    def compare(self, indexes, others):
        """Compares sets of documents represented by indexes using the
        appropriate similarity function

        ``indexes`` and ``others`` may each be a collection of indexes or a
        single integer (Python ``int`` or any NumPy integer type). The result
        is a matrix of scores with one row per entry of ``indexes``; a single
        ``indexes`` value yields just its row and, if ``others`` is a single
        value as well, just the one score.
        """
        # allows for passing a single index instead of a list of indexes;
        # np.integer covers every NumPy integer width, not only int64
        single_index = isinstance(indexes, (int, np.integer))
        single_comp = isinstance(others, (int, np.integer))
        if single_index:
            indexes = [indexes]
        if single_comp:
            others = [others]

        if self.model_name == "jaccard":
            scores = self.jac_sim(indexes, others)
        elif self.model_name == "bm25":
            scores = self.bm25_sim(indexes, others)
        elif self.model_name == "lda":
            scores = self.topic_sim(indexes, others)
        else:
            scores = self.cosine_sim(indexes, others)

        # return a single value/array instead of a matrix
        if single_index:
            scores = scores[0]
            if single_comp:
                scores = scores[0]

        return scores


ImportError: attempted relative import with no known parent package

In [9]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Modified implementation of the BM25 algorithm provided in Gensim 3.8.
https://github.com/RaRe-Technologies/gensim/blob/3.8.3/gensim/summarization/bm25.py

Modifications are mostly for performance increases and new functions for computing
scores for arbitrary indexes in the corpus

Also added functions to save and load the model
"""

"""This module contains function of computing rank scores for documents in
corpus and helper class `BM25` used in calculations. The original algorithm is
described in [1]_; you may also check the Wikipedia page [2]_.
.. [1] Robertson, Stephen; Zaragoza, Hugo (2009).  The Probabilistic Relevance Framework: BM25 and Beyond,
       http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf
.. [2] Okapi BM25 on Wikipedia, https://en.wikipedia.org/wiki/Okapi_BM25
Examples
--------
.. sourcecode:: pycon
    >>> from gensim.summarization.bm25 import get_bm25_weights
    >>> corpus = [
    ...     ["black", "cat", "white", "cat"],
    ...     ["cat", "outer", "space"],
    ...     ["wag", "dog"]
    ... ]
    >>> result = get_bm25_weights(corpus, n_jobs=-1)
Data:
-----
.. data:: PARAM_K1 - Free smoothing parameter for BM25.
.. data:: PARAM_B - Free smoothing parameter for BM25.
.. data:: EPSILON - Constant used for negative idf of document in corpus.
"""


import logging
import math
import joblib
from six import iteritems
from six.moves import range
from functools import partial
from multiprocessing import Pool

PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25

logger = logging.getLogger(__name__)


def effective_n_jobs(n_jobs):
    """Determines the number of jobs can run in parallel.
    Just like in sklearn, passing n_jobs=-1 means using all available
    CPU cores.
    Parameters
    ----------
    n_jobs : int
        Number of workers requested by caller.
    Returns
    -------
    int
        Number of effective jobs.
    Raises
    ------
    ValueError
        If `n_jobs` is 0.
    """
    # imported here because the module only imports ``Pool`` from
    # multiprocessing; without it every negative n_jobs raised a NameError
    from multiprocessing import cpu_count

    if n_jobs == 0:
        raise ValueError("n_jobs == 0 in Parallel has no meaning")
    elif n_jobs is None:
        return 1
    elif n_jobs < 0:
        # -1 -> all cores, -2 -> all but one, ... but never less than one job
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    return n_jobs


class BM25(object):
    """Implementation of Best Matching 25 ranking function.
    Attributes
    ----------
    corpus_size : int
        Size of corpus (number of documents).
    avgdl : float
        Average length of document in `corpus`.
    doc_freqs : list of dicts of int
        Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values.
    idf : dict
        Dictionary with inversed documents frequencies for whole `corpus`. Words used as keys and frequencies as values.
    doc_len : list of int
        List of document lengths.
    doc_scores : list of dicts of float
        Precomputed BM25 weight of every term of every document in `corpus`. Words used as keys and weights as values.
    average_idf : float
        Average of the inverse document frequencies of all words in `corpus`.
    """

    def __init__(self, corpus=None, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
        """
        Parameters
        ----------
        corpus : list of list of str
            Given corpus. When omitted, an empty model is created (used by `load`).
        k1 : float
            Constant used for influencing the term frequency saturation. After saturation is reached, additional
            presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
            that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
            the type of documents or queries.
        b : float
            Constant used for influencing the effects of different document lengths relative to average document length.
            When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
            [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
            depends on factors such as the type of documents or queries.
        epsilon : float
            Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
            negative idf values. Negative idf implies that adding a very common term to a document penalize the overall
            score (with 'very common' meaning that it is present in more than half of the documents). That can be
            undesirable as it means that an identical document would score less than an almost identical one (by
            removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
            different documents) to receive an extra score.
        """

        self.k1 = k1
        self.b = b
        self.epsilon = epsilon

        self.corpus_size = 0
        self.avgdl = 0
        self.doc_freqs = []
        self.idf = {}
        self.doc_len = []
        # defined up front so that a model created without a corpus exposes
        # the same attributes as an initialized one
        self.doc_scores = []
        self.average_idf = 0.0

        if corpus is not None:
            self._initialize(corpus)

    @staticmethod
    def load(path):
        """Creates a model from the attribute dictionary stored at `path` by `save`.
        Parameters
        ----------
        path : str or path-like
            Location of the file written by `save`.
        Returns
        -------
        BM25
            Model whose attributes are the loaded ones.
        """
        bm25 = BM25()
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files from trusted sources.
        bm25.__dict__ = joblib.load(path)
        return bm25

    def save(self, path):
        """Dumps the attribute dictionary of the model to `path` with joblib."""
        joblib.dump(self.__dict__, path)

    def _initialize(self, corpus):
        """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
        nd = {}  # word -> number of documents with word
        num_doc = 0  # total number of words over all documents
        for document in corpus:
            self.corpus_size += 1
            self.doc_len.append(len(document))
            num_doc += len(document)

            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_freqs.append(frequencies)

            for word in frequencies:
                nd[word] = nd.get(word, 0) + 1

        self.avgdl = float(num_doc) / self.corpus_size
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0
        # collect words with negative idf to set them a special epsilon value.
        # idf can be negative if word is contained in more than half of documents
        negative_idfs = []
        for word, freq in nd.items():
            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs.append(word)
        self.average_idf = float(idf_sum) / len(self.idf)

        if self.average_idf < 0:
            logger.warning(
                "Average inverse document frequency is less than zero. Your corpus of {} documents"
                " is either too small or it does not originate from natural text. BM25 may produce"
                " unintuitive results.".format(self.corpus_size)
            )

        eps = self.epsilon * self.average_idf
        for word in negative_idfs:
            self.idf[word] = eps

        # per-document length normalisation term of the BM25 denominator
        doc_den = [
            self.k1 * (1 - self.b + self.b * dl / self.avgdl) for dl in self.doc_len
        ]
        num_constant = self.k1 + 1

        # precomputes the weight of every term of every document so that
        # comparing two corpus documents only needs dictionary lookups
        self.doc_scores = []

        for i, doc in enumerate(self.doc_freqs):
            self.doc_scores.append(
                {
                    w: df * self.idf[w] * num_constant / (df + doc_den[i])
                    for w, df in doc.items()
                }
            )

    def get_score(self, document, index):
        """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
        Parameters
        ----------
        document : list of str
            Document to be scored.
        index : int
            Index of document in corpus selected to score with `document`.
        Returns
        -------
        float
            BM25 score.
        """
        score = 0.0
        doc_freqs = self.doc_freqs[index]
        numerator_constant = self.k1 + 1
        denominator_constant = self.k1 * (
            1 - self.b + self.b * self.doc_len[index] / self.avgdl
        )
        for word in document:
            if word in doc_freqs:
                df = doc_freqs[word]
                idf = self.idf[word]
                score += (idf * df * numerator_constant) / (df + denominator_constant)
        return score

    def most_similar(self, index, n, include_self=False):
        """Computes and returns the indexes of the documents with the highest
        BM25 scores for the given index in the corpus in relation to all other
        documents in the corpus
        Parameters
        ----------
        index: int
            Index of the document to be scored.
        n: int
            Number of most similar documents to return
        include_self: bool
            Whether `index` itself may be part of the result.
        Returns
        -------
        list of int
            Indexes of the `n` highest scoring documents, best first.
        """
        indexes = range(self.corpus_size)

        def score(i):
            return self._compare_documents(index, i)

        if not include_self:
            indexes = filter(lambda i: i != index, indexes)

        return sorted(indexes, key=score, reverse=True)[:n]

    def _compare_documents(self, index1, index2):
        """Computes and returns the BM25 score for a pair of documents
        with the given indexes
        Parameters
        ----------
        index1: int
            Index of the document to be scored.
        index2: int
            Index of the other document to be scored against.
        Returns
        -------
        float
            BM25 score.
        """
        doc1_freqs = self.doc_freqs[index1]
        doc2_scores = self.doc_scores[index2]

        # only words present in both documents contribute to the score
        in_common = doc1_freqs.keys() & doc2_scores.keys()

        score = 0.0
        for word in in_common:
            score += doc2_scores[word] * doc1_freqs[word]
        return score

    def compare_documents(self, index, indexes):
        """Computes and returns BM25 scores of given index in the corpus
        in relation to the given indexes
        Parameters
        ----------
        index: int
            Index of the document to be scored.
        indexes: list of int
            Indexes of the other documents used in scoring.
        Returns
        -------
        list of float
            BM25 scores.
        """
        return [self._compare_documents(index, i) for i in indexes]

    def compare_all_documents(self, index):
        """Computes and returns BM25 scores of given index in the corpus
        in relation to every item in corpus (itself included).
        Parameters
        ----------
        index: int
            Index of the document to be scored.
        Returns
        -------
        list of float
            BM25 scores.
        """
        return self.compare_documents(index, range(self.corpus_size))

    def get_scores(self, document):
        """Computes and returns BM25 scores of given `document` in relation to
        every item in corpus.
        Parameters
        ----------
        document : list of str
            Document to be scored.
        Returns
        -------
        list of float
            BM25 scores.
        """
        scores = [self.get_score(document, index) for index in range(self.corpus_size)]
        return scores

    def get_scores_bow(self, document):
        """Computes and returns BM25 scores of given `document` in relation to
        every item in corpus, keeping only the positive ones.
        Parameters
        ----------
        document : list of str
            Document to be scored.
        Returns
        -------
        list of (int, float)
            Pairs of corpus index and BM25 score for every positive score.
        """
        scores = []
        for index in range(self.corpus_size):
            score = self.get_score(document, index)
            if score > 0:
                scores.append((index, score))
        return scores


def _get_scores_bow(bm25, document):
    """Module-level wrapper around `BM25.get_scores_bow`, used as a helper
    for retrieving the scores of `document` in parallel.
    Parameters
    ----------
    bm25 : BM25 object
        BM25 object fitted on the corpus where documents are retrieved.
    document : list of str
        Document to be scored.
    Returns
    -------
    list of (index, float)
        BM25 scores in a bag of weights format.
    """
    weights = bm25.get_scores_bow(document)
    return weights


def _get_scores(bm25, document):
    """Module-level wrapper around `BM25.get_scores`, used as a helper
    for retrieving the scores of `document` in parallel.
    Parameters
    ----------
    bm25 : BM25 object
        BM25 object fitted on the corpus where documents are retrieved.
    document : list of str
        Document to be scored.
    Returns
    -------
    list of float
        BM25 scores.
    """
    scores = bm25.get_scores(document)
    return scores


def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
    """Yield BM25 scores (weights) of documents in corpus.

    Each document has to be weighted with every document in given corpus.

    Parameters
    ----------
    corpus : list of list of str
        Corpus of documents.
    n_jobs : int
        The number of processes to use for computing bm25.
    k1 : float
        Constant used for influencing the term frequency saturation. After saturation is reached, additional
        presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
        the type of documents or queries.
    b : float
        Constant used for influencing the effects of different document lengths relative to average document length.
        When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
        depends on factors such as the type of documents or queries.
    epsilon : float
        Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
        score (with 'very common' meaning that it is present in more than half of the documents). That can be
        undesirable as it means that an identical document would score less than an almost identical one (by
        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
        different documents) to receive an extra score.

    Yields
    ------
    list of (index, float)
        BM25 scores in bag of weights format.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.summarization.bm25 import iter_bm25_bow
        >>> corpus = [
        ...     ["black", "cat", "white", "cat"],
        ...     ["cat", "outer", "space"],
        ...     ["wag", "dog"]
        ... ]
        >>> result = iter_bm25_bow(corpus, n_jobs=-1)

    """
    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
        # single process: score in the current process, no pool needed
        for doc in corpus:
            yield bm25.get_scores_bow(doc)
        return

    get_score = partial(_get_scores_bow, bm25)
    pool = Pool(n_processes)
    try:
        # imap preserves corpus order while letting workers run ahead
        for bow in pool.imap(get_score, corpus):
            yield bow
        pool.close()
    except BaseException:
        # also reached when the consumer abandons the generator
        # (GeneratorExit): stop the workers instead of leaking them
        pool.terminate()
        raise
    finally:
        pool.join()


def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
    """Returns BM25 scores (weights) of documents in corpus.

    Each document has to be weighted with every document in given corpus.

    Parameters
    ----------
    corpus : list of list of str
        Corpus of documents.
    n_jobs : int
        The number of processes to use for computing bm25.
    k1 : float
        Constant used for influencing the term frequency saturation. After saturation is reached, additional
        presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
        that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
        the type of documents or queries.
    b : float
        Constant used for influencing the effects of different document lengths relative to average document length.
        When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
        [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
        depends on factors such as the type of documents or queries.
    epsilon : float
        Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
        negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall
        score (with 'very common' meaning that it is present in more than half of the documents). That can be
        undesirable as it means that an identical document would score less than an almost identical one (by
        removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
        different documents) to receive an extra score.

    Returns
    -------
    list of list of float
        BM25 scores.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.summarization.bm25 import get_bm25_weights
        >>> corpus = [
        ...     ["black", "cat", "white", "cat"],
        ...     ["cat", "outer", "space"],
        ...     ["wag", "dog"]
        ... ]
        >>> result = get_bm25_weights(corpus, n_jobs=-1)

    """
    bm25 = BM25(corpus, k1, b, epsilon)

    n_processes = effective_n_jobs(n_jobs)
    if n_processes == 1:
        # single process: score in the current process, no pool needed
        return [bm25.get_scores(doc) for doc in corpus]

    get_score = partial(_get_scores, bm25)
    pool = Pool(n_processes)
    try:
        weights = pool.map(get_score, corpus)
        pool.close()
    except BaseException:
        # a failing worker (or an interrupt) must not leave the pool running
        pool.terminate()
        raise
    finally:
        pool.join()
    return weights


In [10]:
import numpy as np
from . import get_X_y
from ..consts import search_n_iters
from .scoring import multiple_k_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


def get_folds(df, n):
    """Creates custom folds to be used in hyperparameter tuning
    These folds guarantee that no test dups are contained in the train
    folds

    If we use sklearn's default algorithm, it would randomly select pairs
    and the test dups have pairs in the train set, which could lead to
    leakage

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs with a `dup_id` column and a boolean `is_dup` column.
    n : int
        Number of folds.

    Returns
    -------
    list of (pandas.Index, pandas.Index)
        One `(train_ids, test_ids)` tuple per fold, holding index labels of
        `df`. All rows sharing a `dup_id` land on the same side of a fold.
    """
    dups = df[df.is_dup][["dup_id"]]
    dups = dups.drop_duplicates()

    # shuffle dups
    dups = dups.sample(frac=1, random_state=42)
    # split a plain array: np.array_split on a Series relies on the
    # deprecated Series.swapaxes
    dup_ids = dups.dup_id.to_numpy()

    folds = []
    for test_dups in np.array_split(dup_ids, n):
        # one membership mask per fold, reused for both sides of the split
        in_test = df.dup_id.isin(test_dups).to_numpy()
        folds.append((df.index[~in_test], df.index[in_test]))
    return folds


def random_forest_tuner(folds, n_jobs=7):
    """Returns an instance of a RandomizedSearchCV with pre-defined
    parameters for a random forest

    Parameters
    ----------
    folds : list of (train_ids, test_ids)
        Custom folds, passed to the search as `cv`.
    n_jobs : int
        Number of parallel jobs used by the search. Defaults to 7, the value
        that used to be hard-coded.

    Returns
    -------
    RandomizedSearchCV
        Unfitted search over the random forest hyperparameters, scored with
        `multiple_k_scorer`. Refitting on the whole data is disabled.
    """
    # parameters used for tuning random forests
    randomforest_hp_grid = {
        "n_estimators": [int(x) for x in np.linspace(50, 1000, num=50)],
        "max_depth": [int(x) for x in np.linspace(5, 100, num=50)],
        "min_samples_split": [2, 3, 5, 10],
        "min_samples_leaf": [1, 2, 5, 10],
        "bootstrap": [True, False],
        "class_weight": [None, "balanced", "balanced_subsample"],
        "random_state": [42],
    }

    est = RandomForestClassifier()
    return RandomizedSearchCV(
        est,
        randomforest_hp_grid,
        n_iter=search_n_iters,
        cv=folds,
        scoring=multiple_k_scorer,
        verbose=2,
        random_state=42,
        n_jobs=n_jobs,
        refit=False,
    )


def tune_train_set(train, n):
    """Runs the random forest hyperparameter search on the train set,
    using `n` custom folds, and returns the fitted search
    """
    features, target = get_X_y(train)
    search = random_forest_tuner(get_folds(train, n))
    return search.fit(features, target)


ImportError: attempted relative import with no known parent package

In [11]:
def get_X_y(df):
    """Splits a dataframe of pairs into the features (X) and the
    target (y) consumed by the classifier

    Both are indexed by (candidate_id, dup_id). The target is the `is_dup`
    column; the features are all remaining columns except `score`.
    """
    indexed = df.set_index(["candidate_id", "dup_id"])
    target = indexed["is_dup"]
    features = indexed.drop(columns=["is_dup", "score"])
    return features, target

In [12]:
import numpy as np
from .bm25 import BM25
from gensim.models import LdaMulticore
from gensim.models.doc2vec import Doc2Vec
from ..consts import n_procs


def doc2vec_model(corpus_path):
    """Builds a Doc2Vec model over the corpus file at `corpus_path`
    using a fixed set of hyperparameters
    """
    # fixed hyperparameters of the model
    hyperparameters = dict(
        vector_size=30,
        window=15,
        min_count=5,
        workers=n_procs,
        seed=42,
        sample=1e-5,
        negative=1,
        epochs=25,
    )
    return Doc2Vec(corpus_file=str(corpus_path), **hyperparameters)


def bm25_model(corpus):
    """Builds a BM25 model over `corpus` using fixed k1 and b values"""
    hyperparameters = {"k1": 0.05, "b": 0.03}
    return BM25(corpus, **hyperparameters)


def lda_model(corpus, vocab):
    """Builds an LdaMulticore model over `corpus`, with `vocab` as the
    id-to-word mapping, using a fixed set of hyperparameters
    """
    # fixed hyperparameters of the model
    hyperparameters = dict(
        random_state=42,
        alpha="symmetric",
        eta="auto",
        eval_every=5,
        num_topics=30,
        workers=n_procs,
        minimum_probability=0.0,
    )
    return LdaMulticore(corpus, id2word=vocab, **hyperparameters)


def bertoverflow_model():
    """Loads BERTOverflow and wraps it, followed by a pooling layer,
    in a sentence transformer
    """
    from sentence_transformers import SentenceTransformer, models

    word_embeddings = models.Transformer("jeniya/BERTOverflow")
    embedding_dim = word_embeddings.get_word_embedding_dimension()
    pooling = models.Pooling(embedding_dim)
    return SentenceTransformer(modules=[word_embeddings, pooling])


def mpnet_model():
    """Loads the paraphrase MPNet sentence transformer"""
    from sentence_transformers import SentenceTransformer

    model_name = "paraphrase-mpnet-base-v2"
    return SentenceTransformer(model_name)

ImportError: attempted relative import with no known parent package

In [13]:
def recall_rate_k_scorer(df, col, k):
    """Computes the recall-rate@k of a dataframe ranked by column `col`

    For every `dup_id`, the rows are ranked by `col` in descending order;
    the group counts as a hit when any of its top `k` rows has `is_dup` set.
    The result is the fraction of groups that are hits.
    """

    def hit_in_top_k(group):
        ranked = group.sort_values(col, ascending=False)
        return ranked["is_dup"].iloc[:k].any()

    hits = df.groupby("dup_id").apply(hit_in_top_k)
    return hits.mean()


def predict_probabilities(estimator, X, y):
    """Builds a dataframe with one row per pair of X, holding the second
    column of the estimator's `predict_proba` output (`pred`) and the
    target (`is_dup`)
    """
    pair_keys = X.reset_index()[["dup_id", "candidate_id"]]
    indexed = pair_keys.set_index(["candidate_id", "dup_id"])

    # keep only the second entry of every predicted row
    second_column = [row[1] for row in estimator.predict_proba(X)]
    scored = indexed.assign(pred=second_column, is_dup=y)

    return scored.reset_index()


def multiple_k_scorer(estimator, X, y):
    """Scores the estimator's predictions for X with the recall-rate@k
    measure at k = 5, 10 and 20

    Returns a dict mapping "rr@5", "rr@10" and "rr@20" to their values.
    """
    predictions = predict_probabilities(estimator, X, y)

    return {
        f"rr@{k}": recall_rate_k_scorer(predictions, "pred", k)
        for k in (5, 10, 20)
    }

In [14]:
# Importing scripts from the other dir: the scripts directory is put on
# sys.path (presumably so the `utils` imports below resolve -- confirm)
import os
import sys

# absolute path of the current working directory
path = os.path.abspath(os.getcwd())
# keep everything before the last occurrence of 'code' in that path and append
# 'code/scripts'
# NOTE(review): if 'code' does not occur in the path, rsplit returns the whole
# path and 'code/scripts' is appended without a separator -- confirm the
# notebook is always run from inside the 'code' directory
scripts = path.rsplit('code', 1)[0] + 'code/scripts'
# inserted at position 0, so it is searched before the other sys.path entries
sys.path.insert(0, scripts)

import numpy as np
import pandas as pd
from utils import read, paths
from utils.consts import datasets, gamedev_datasets

def dataset_name(ds):
    """Maps the string that identifies a dataset to its pretty name

    Anything other than 'gamedev_se' and 'gamedev_so' is named as the
    general development Stack Overflow dataset.
    """
    if ds == 'gamedev_se':
        return 'Game Dev. Stack Exchange'
    if ds == 'gamedev_so':
        return 'Stack Overflow (Game dev.)'
    return 'Stack Overflow (General dev.)'

def dup_pair_similarity_summary():
    """Creates a summary table with all the recall-rates for all similarity scores
    and formats it

    For every dataset, the pair ranks are pivoted into a (score, similarity)
    by document-type table of percentages. The per-dataset tables are then
    grouped by pretty dataset name, score and similarity, and every cell is
    formatted as a "mean (std)" string (the std part is left out when it is
    NaN).
    """
    # pretty names of the similarity features
    sim_names = {
        "jaccard": "Jaccard",
        "bm25": "BM25",
        "tfidf": "TF-IDF",
        "doc2vec": "Doc2Vec",
        "topic": "Topic",
        "bertoverflow": "BERTOverflow",
        "mpnet": "MPNet",
    }

    # order in which to show results
    score_order = ["recall-rate@5", "recall-rate@10", "recall-rate@20"]
    doc_order = [
        "title",
        "body",
        "tags",
        "title_body",
        "title_body_tags",
        "title_body_tags_answer",
    ]
    sim_order = [
        "Jaccard",
        "TF-IDF",
        "BM25",
        "Topic",
        "Doc2Vec",
        "BERTOverflow",
        "MPNet",
    ]

    summary = []
    for ds in datasets:
        df = read(paths.all_pair_ranks(ds))

        # rename features
        df["feature"] = df["feature"].apply(lambda f: sim_names[f])

        df = df.pivot_table(values=score_order, columns="feature", index="col")
        df = df.transpose()
        df.columns = list(df.columns)

        df.index = df.index.set_names(["Score", "Similarity"])
        df = df[doc_order]
        # change column names to fit page
        df.columns = ["(1)", "(2)", "(3)", "(4)", "(5)", "(6)"]
        df = df.reindex(sim_order, level=1)
        df = df.reindex(score_order, level=0)
        # turn recall-rates into percentages with 2 decimals
        df = (df * 100).round(2)

        df["Dataset"] = dataset_name(ds)

        summary.append(df)

    summary = pd.concat(summary)
    summary = summary.reset_index()

    # calculate std and mean
    # string names instead of np.mean/np.std: passing the NumPy callables is
    # deprecated, and calling np.std directly would compute the population
    # std (ddof=0) instead of the sample std used so far
    summary = summary.groupby(['Dataset', 'Score', 'Similarity']).agg(["mean", "std"])

    # format values
    for c in set(summary.columns.get_level_values(0)):
        summary[(c, 'mean')] = summary[(c, 'mean')].apply(lambda f: "{:.2f}".format(f))
        summary[(c, 'std')] = summary[(c, 'std')].apply(lambda f: "" if pd.isna(f) else "({:.2f})".format(round(f, 2)))
        summary[(c, 'mean')] = summary[(c, 'mean')] + " " + summary[(c, 'std')]

    # drop columns + fix order
    summary = summary.iloc[:,summary.columns.get_level_values(1) == 'mean']
    summary.columns = summary.columns.droplevel(1)
    summary = summary.reindex(sim_order, level=2)
    summary = summary.reindex(score_order, level=1)

    return summary

def candidate_evaluation_summary():
    """Formats the dataframe containing the evaluation results for different numbers of candidates

    The recall-rate columns (those with 'rr' in their name) are turned into
    percentages, grouped by pretty dataset name and number of candidates, and
    every cell is formatted as a "mean (std)" string (the std part is left
    out when it is NaN).
    """
    df = read(paths.candidates_evaluation())
    df['dataset'] = df['dataset_train'].apply(dataset_name)

    df = df.drop(columns='dataset_train')

    for c in [c for c in df.columns if 'rr' in c]:
        df[c] = (df[c]*100).round(2)

    # string names instead of np.mean/np.std: passing the NumPy callables is
    # deprecated, and calling np.std directly would compute the population
    # std (ddof=0) instead of the sample std used so far
    df = df.groupby(['dataset', 'candidates']).agg(["mean", "std"])

    # format values
    for c in set(df.columns.get_level_values(0)):
        df[(c, 'mean')] = df[(c, 'mean')].apply(lambda f: "{:.2f}".format(f))
        df[(c, 'std')] = df[(c, 'std')].apply(lambda f: "" if pd.isna(f) else "({:.2f})".format(round(f, 2)))
        df[(c, 'mean')] = df[(c, 'mean')] + " " + df[(c, 'std')]

    # drop columns + fix order and naming
    df = df.iloc[:,df.columns.get_level_values(1) == 'mean']
    df.columns = df.columns.droplevel(1)
    # c[3:] drops the first three characters of each column name
    # (presumably the 'rr@' prefix -- confirm against the stored column names)
    df.columns = pd.MultiIndex.from_product([['Recall-rate@'], [c[3:] for c in df.columns]])
    return df

def dataset_sizes():
    """Summarizes the sizes of the datasets used in the study

    One record is built per dataset (questions, non-duplicates, duplicates
    and pairs). Records are averaged per (Source, Topic) and the averages are
    formatted with thousands separators; the duplicates column also carries
    the percentage of duplicates among the questions.
    """
    df = []
    for ds in datasets:
        n_qs = len(read(paths.all_question_ids(ds)))
        pairs = read(paths.dup_pairs(ds))
        n_dups = len(read(paths.duplicate_question_ids(ds)))

        if ds == 'gamedev_se':
            source = 'Stack Exchange'
        else:
            source = 'Stack Overflow'

        if 'so_samples' not in ds:
            topic = 'Game development'
        else:
            topic = 'General development'

        df.append({
            'Source': source,
            'Topic': topic,
            'Questions': n_qs,
            'Non-duplicates': n_qs-n_dups,
            'Duplicates': n_dups,
            'Pairs': len(pairs)
        })

    df = pd.DataFrame(df)
    # string names instead of np.mean/np.std: passing the NumPy callables is
    # deprecated, and calling np.std directly would compute the population
    # std (ddof=0) instead of the sample std used so far
    df = df.groupby(['Source', 'Topic']).agg(["mean", "std"])
    # the std of a single-dataset group is NaN; treat it as 0
    df = df.fillna(0)

    if (df.iloc[:,df.columns.get_level_values(1) == 'std'] == 0).all().all():
        print('All datasets have std == 0')

    df = df.iloc[:,df.columns.get_level_values(1) == 'mean']
    df.columns = df.columns.droplevel(1)
    df['dup_perc'] = df['Duplicates']/df['Questions']*100
    df['dup_perc'] = df['dup_perc'].apply(lambda x: round(x, 1))

    # NOTE(review): the means are floats, so the formatted strings carry a
    # decimal part (e.g. '1,234.0') -- confirm this is the intended format
    for c in df.columns:
        if c != 'dup_perc':
            df[c] = df[c].apply(lambda i: "{:,}".format(i))

    df['Duplicates'] = df['Duplicates'] + ' (' + df['dup_perc'].apply(str) + '%)'
    df = df.drop(columns='dup_perc')

    return df


def cross_dataset_summary():
    """Creates a table summarizing the performance of the classifiers
    when testing on other datasets

    The recall-rate columns (those with 'rr' in their name) are turned into
    percentages, grouped by pretty train and test dataset names, and every
    cell is formatted as a "mean (std)" string (the std part is left out when
    it is NaN).
    """
    df = read(paths.cross_dataset_performance())
    df['dataset_train'] = df['dataset_train'].apply(dataset_name)
    df['dataset_test'] = df['dataset_test'].apply(dataset_name)

    for c in [c for c in df.columns if 'rr' in c]:
        df[c] = (df[c]*100).round(2)

    # string names instead of np.mean/np.std: passing the NumPy callables is
    # deprecated, and calling np.std directly would compute the population
    # std (ddof=0) instead of the sample std used so far
    df = df.groupby(['dataset_train', 'dataset_test']).agg(["mean", "std"])

    # format values
    for c in set(df.columns.get_level_values(0)):
        df[(c, 'mean')] = df[(c, 'mean')].apply(lambda f: "{:.2f}".format(f))
        df[(c, 'std')] = df[(c, 'std')].apply(lambda f: "" if pd.isna(f) else "({:.2f})".format(round(f, 2)))
        df[(c, 'mean')] = df[(c, 'mean')] + " " + df[(c, 'std')]

    # drop columns + fix order and naming
    df = df.iloc[:,df.columns.get_level_values(1) == 'mean']
    df.columns = df.columns.droplevel(1)
    # sort the rows by the test dataset
    df = df.sort_index(level=1)
    # c[3:] drops the first three characters of each column name
    # (presumably the 'rr@' prefix -- confirm against the stored column names)
    df.columns = pd.MultiIndex.from_product([['Recall-rate@'], [c[3:] for c in df.columns]])

    return df

def misclassified_summary():
    """Creates a table summarizing the duplicates that were misclassified by our classifiers

    The table has a 'Description' column plus one column per dataset (the
    game development datasets and the first Stack Overflow sample). Apart
    from the first row, every value is a count followed by its percentage
    relative to the number of misclassified duplicates.
    """
    def to_percentage(a, b):
        # a dataset without misclassified duplicates has nothing to divide
        # by; report 0% instead of raising ZeroDivisionError
        if b == 0:
            return '(0%)'
        p = round(a/b*100)
        return f'({p}%)'

    df = {}

    df['Description'] = [
        'Duplicate pairs in test set',
        'Misclassified duplicate pairs',
        'Main question not in the list of candidates',
        'Top ranked question is an unlabelled duplicate',
        'Main question discusses a more general topic'
    ]

    for ds in gamedev_datasets + ['so_samples/sample_0']:
        test = read(paths.test_set(ds))
        dups_in_test = len(test.dup_id.unique())

        if 'so_samples' in ds:
            missed = read(paths.misclassified_duplicates('so_sample'))
        else:
            missed = read(paths.misclassified_duplicates(ds))

        n_misclassified = len(missed)
        no_candidate = len(missed[~missed.has_dup])
        top_ranked_dup = len(missed[missed.top_ranked_is_dup])
        main_more_general = len(missed[missed.main_more_generic])

        # NOTE(review): the misclassified count is compared against itself,
        # so this is 100% whenever there is at least one misclassified
        # duplicate -- confirm it is meant as the reference row rather than
        # a share of `dups_in_test`
        misc_perc = to_percentage(n_misclassified, n_misclassified)
        no_candidate_perc = to_percentage(no_candidate, n_misclassified)
        top_ranked_perc = to_percentage(top_ranked_dup, n_misclassified)
        more_general_perc = to_percentage(main_more_general, n_misclassified)

        n_misclassified = str(n_misclassified) + ' ' + misc_perc
        no_candidate = str(no_candidate) + ' ' + no_candidate_perc
        top_ranked_dup = str(top_ranked_dup) + ' ' + top_ranked_perc
        main_more_general = str(main_more_general) + ' ' + more_general_perc

        # the column is labelled with the pretty dataset name
        label = dataset_name(ds)

        df[label] = [dups_in_test, n_misclassified, no_candidate, top_ranked_dup, main_more_general]

    df = pd.DataFrame(df)
    return df

In [15]:
# NOTE(review): wildcard import -- every public name of `create_tables` lands
# in the notebook namespace; `pd` on the next line presumably comes from it or
# from an earlier cell -- confirm, and prefer importing the names explicitly
from create_tables import *
# show up to 500 rows when displaying dataframes
pd.set_option('display.max_rows', 500)

ModuleNotFoundError: No module named 'create_tables'