In [1]:
import pandas as pd

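This notebook reads the Parquet tables in `data/{website}/{topics,unprocessed}/`, derives per-post features, and writes the results as CSV files (`questions.csv`, `answers.csv`, `comments.csv`) to `data/{website}/`.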

In [2]:
def read_table(w, p):
    """Load one table for a website and attach its topics table.

    Parameters
    ----------
    w : str
        Website directory name under `../data/`.
    p : str
        Table name, i.e. the parquet file stem (the notebook passes
        'questions', 'answers' and 'comments').

    Returns
    -------
    pandas.DataFrame
        The unprocessed table without its `text` column, left-joined on `id`
        with the topics table, so every unprocessed row is kept.
    """
    topic_table = pd.read_parquet(f'../data/{w}/topics/{p}.parquet')
    raw_table = pd.read_parquet(f'../data/{w}/unprocessed/{p}.parquet')

    # The raw `text` column is discarded before the join.
    # NOTE(review): downstream code reads a `text` column, so it presumably
    # comes from the topics table - confirm against the data.
    without_text = raw_table.drop(columns='text')
    return pd.merge(without_text, topic_table, on='id', how='left')

def proc_questions(qsts, answ, cmts):
    """Derive per-question features from the questions, answers and comments.

    Parameters
    ----------
    qsts : pandas.DataFrame
        Questions table. The code reads `id` and `text`; the incoming
        `is_resolved` column is dropped and recomputed.
    answ : pandas.DataFrame
        Answers table. The code reads `id`, `question_id` and `is_accepted`.
    cmts : pandas.DataFrame
        Comments table. The code reads `id` and `question_id`.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input `qsts` is not modified) with these columns
        appended: `is_resolved`, `n_answers`, `n_comments`, `n_responses`,
        `is_answered`, `len_text`, `has_code`.
    """
    def _count_per_question(posts, column):
        # Number of non-null ids per question, as an ('id', column) frame.
        return (
            posts.groupby('question_id')['id']
            .count()
            .rename(column)
            .rename_axis('id')
            .reset_index()
        )

    def _length_or_missing(text):
        # Missing text stays missing instead of raising in len().
        return text if pd.isna(text) else len(text)

    def _flag_or_missing(found):
        # Missing match results stay missing; booleans become 0/1.
        return found if pd.isna(found) else int(found)

    # A question is resolved when the is_accepted values of its answers sum
    # to more than zero.
    accepted_total = (
        answ.groupby('question_id')['is_accepted']
        .sum()
        .rename_axis('id')
        .reset_index()
    )
    out = (
        qsts.drop(columns='is_resolved')
        .merge(accepted_total, on='id', how='left')
        .rename(columns={'is_accepted': 'is_resolved'})
    )
    # Questions without any answer have no sum after the left join -> 0.
    out['is_resolved'] = out['is_resolved'].fillna(0).apply(
        lambda total: 1 if total > 0 else 0
    )

    answer_counts = _count_per_question(answ, 'n_answers')
    comment_counts = _count_per_question(cmts, 'n_comments')
    out = out.merge(answer_counts, on='id', how='left')
    out = out.merge(comment_counts, on='id', how='left')

    # Questions that never appear in answ / cmts get a count of zero.
    out['n_answers'] = out['n_answers'].fillna(0)
    out['n_comments'] = out['n_comments'].fillna(0)
    out['n_responses'] = out['n_answers'].add(out['n_comments']).fillna(0)
    out['is_answered'] = out['n_answers'].apply(lambda count: 0 if count == 0 else 1)

    out['len_text'] = out['text'].apply(_length_or_missing)
    out['has_code'] = out['text'].str.contains('codesnippet').apply(_flag_or_missing)

    return out

def proc_answers(answ):
    """Add text features to the answers table and normalise `is_accepted` to 0/1.

    Parameters
    ----------
    answ : pandas.DataFrame
        Answers table. The code reads its `text` and `is_accepted` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the columns `len_text`,
        `has_code` and `is_accepted` are assigned on `answ` directly, so the
        caller's frame is modified in place.
    """
    def _length_or_missing(text):
        # Missing text stays missing instead of raising in len().
        return text if pd.isna(text) else len(text)

    def _flag_or_missing(found):
        # Missing match results stay missing; booleans become 0/1.
        return found if pd.isna(found) else int(found)

    def _accepted_flag(value):
        # Missing values must be checked first: NaN is truthy in Python (it
        # would be exported as "accepted") and bool(pd.NA) raises TypeError.
        if pd.isna(value):
            return 0
        return 1 if value else 0

    answ['len_text'] = answ['text'].apply(_length_or_missing)
    answ['has_code'] = answ['text'].str.contains('codesnippet').apply(_flag_or_missing)
    answ['is_accepted'] = answ['is_accepted'].apply(_accepted_flag)

    return answ

def proc_comments(cmts):
    """Add `len_text` and `has_code` columns to the comments table.

    Parameters
    ----------
    cmts : pandas.DataFrame
        Comments table. The code reads its `text` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: both columns are assigned on
        `cmts` directly, so the caller's frame is modified in place.
    """
    def _length_or_missing(text):
        # Missing text stays missing instead of raising in len().
        return text if pd.isna(text) else len(text)

    def _flag_or_missing(found):
        # Missing match results stay missing; booleans become 0/1.
        return found if pd.isna(found) else int(found)

    cmts['len_text'] = cmts['text'].apply(_length_or_missing)

    mentions_snippet = cmts['text'].str.contains('codesnippet')
    cmts['has_code'] = mentions_snippet.apply(_flag_or_missing)

    return cmts

In [3]:
# Build the three CSV exports for every website, one site at a time.
for w in ('unity', 'ue4', 'stackoverflow', 'gamedev_se'):
    # Load the three tables in the order questions, answers, comments.
    qsts, answ, cmts = (
        read_table(w, table) for table in ('questions', 'answers', 'comments')
    )

    # Questions are processed first: proc_questions reads answ.is_accepted
    # and the id columns of answ/cmts before proc_answers and proc_comments
    # add their derived columns.
    qsts = proc_questions(qsts, answ, cmts)
    answ = proc_answers(answ)
    cmts = proc_comments(cmts)

    # Write each table next to its inputs, without the pandas index.
    qsts.to_csv(f'../data/{w}/questions.csv', index=False)
    answ.to_csv(f'../data/{w}/answers.csv', index=False)
    cmts.to_csv(f'../data/{w}/comments.csv', index=False)