In [1]:
import re
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import preprocess_string

This notebook reads the raw question, answer and comment tables from `data/{website}/unprocessed/`, cleans and tokenizes their text, and writes the results to `data/{website}/texts/`.

In [2]:
# Root of the data directory, relative to this notebook's working directory.
data_path = Path('..') / '..' / 'data'

In [3]:
def process_html(t):
    """Turn an HTML post body into lower-cased plain text with placeholder tokens.

    Code snippets, links, images and bare URLs are replaced by the tokens
    ``codesnippet``, ``url`` and ``img`` before the remaining markup is
    stripped with BeautifulSoup.

    Parameters
    ----------
    t : str
        HTML text of a single post.

    Returns
    -------
    str
        Lower-cased text with the placeholders inserted and HTML tags removed.
    """
    t = t.lower()
    # Turn line breaks into spaces rather than deleting them: deleting glued the
    # last word of one line to the first word of the next. Removing the newlines
    # also lets `.` in the <code> pattern span multi-line snippets.
    t = re.sub(r'\n', ' ', t)
    t = re.sub(r'<code\b[^>]*>.*?</code>', ' codesnippet ', t)  # replace code snippets
    # Opening <a> tags that carry an absolute URL. `[^>]` keeps the match inside
    # a single tag; `.*?` could run on into later tags until it found "http".
    t = re.sub(r'<a\b[^>]*?https?://[^>]*>', ' url ', t)
    # Images go before bare URLs so the URL pattern never sees the tag's src.
    t = re.sub(r'<img\b[^>]*>', ' img ', t)
    # Bare URLs stop at whitespace, a quote or a tag bracket, so adjacent markup
    # and text are left alone. (The previous `[\b\s]` did not express a word
    # boundary: inside a character class `\b` is the backspace character.)
    t = re.sub(r'https?://[^\s<>"\']+', ' url ', t)
    t = BeautifulSoup(t, features="lxml").get_text()    # remove html tags (better than gensim)

    return t

def preprocess_table(df):
    """Clean and tokenize the `text` column of `df`, modifying `df` in place.

    Rows whose `text` is NaN are dropped, `text` is overwritten with the
    output of `process_html`, and a new `words` column receives the result of
    `preprocess_string` applied to the cleaned text. Progress messages are
    printed; nothing is returned.
    """
    print('\t\t- Removing NaNs')
    df.dropna(axis='index', subset=['text'], inplace=True)

    print('\t\t- Removing HTML tags, URLs')
    cleaned_text = df['text'].apply(process_html)
    df['text'] = cleaned_text

    print('\t\t- Tokenizing, stemming')
    tokens = cleaned_text.apply(preprocess_string)
    df['words'] = tokens
    
def remove_foreign_text(ts):
    """Drop questions from the Japanese, Chinese and Korean sections and their replies.

    `ts` is a dict of DataFrames keyed 'questions', 'answers' and 'comments'.
    Each entry is replaced, in place, by a filtered copy that no longer
    contains the affected questions or the answers/comments pointing at them.
    Nothing is returned.
    """
    questions = ts['questions']
    in_foreign_section = questions['section'].isin(['Japanese', 'Chinese', 'Korean'])
    foreign_qids = questions.loc[in_foreign_section, 'id']

    # (table name, column in that table that holds the question id)
    question_key_by_table = (
        ('questions', 'id'),
        ('answers', 'question_id'),
        ('comments', 'question_id'),
    )
    for table_name, qid_column in question_key_by_table:
        frame = ts[table_name]
        is_foreign = frame[qid_column].isin(foreign_qids)
        ts[table_name] = frame[~is_foreign].copy()

def preprocess_texts(db_name):
    """Preprocess the questions, answers and comments of one website and save them.

    Reads `{data_path}/{db_name}/unprocessed/{table}.parquet` for each of the
    three tables, reduces them to the `id` and `text` columns (question titles
    are prepended to the question text), runs `preprocess_table` on each and
    writes the result to `{data_path}/{db_name}/texts/{table}.parquet`.
    Afterwards every table gets a `post_type` column and all three are
    concatenated into `{data_path}/{db_name}/texts/corpus.parquet`.

    Parameters
    ----------
    db_name : str
        Name of the website's sub-directory under `data_path`. For 'ue4' the
        tables are first filtered with `remove_foreign_text`.
    """
    print(f'Preprocessing texts for {db_name}')

    # Table name -> label stored in the corpus' `post_type` column.
    post_types = {'questions': 'question', 'answers': 'answer', 'comments': 'comment'}
    site_dir = data_path / f'{db_name}'
    tables = {
        name: pd.read_parquet(site_dir / 'unprocessed' / f'{name}.parquet')
        for name in post_types
    }

    if db_name == 'ue4':
        remove_foreign_text(tables)

    # NaN in either title or text makes the combined text NaN; such rows are
    # dropped later by preprocess_table.
    tables['questions']['text'] = tables['questions']['title'] + ' ' + tables['questions']['text']

    # preprocess_table modifies its argument in place, so hand it independent
    # frames rather than selections of the loaded tables (avoids
    # SettingWithCopyWarning and any doubt about what gets modified).
    tables = {name: table[['id', 'text']].copy() for name, table in tables.items()}

    # Make sure the output directory exists before the first write.
    texts_dir = site_dir / 'texts'
    texts_dir.mkdir(parents=True, exist_ok=True)

    for name, table in tables.items():
        print(f'\t- Processing text for table {name}')
        preprocess_table(table)
        table.to_parquet(texts_dir / f'{name}.parquet')

    # post_type is added only after the per-table files are written, so it
    # appears in corpus.parquet but not in the individual table files.
    for name, post_type in post_types.items():
        tables[name]['post_type'] = post_type
    pd.concat(list(tables.values())).to_parquet(texts_dir / 'corpus.parquet')

In [4]:
# Sub-directories of `data_path` to preprocess, in processing order.
websites = [
    'unity',
    'ue4',
    'stackoverflow',
    'gamedev_se',
]

In [6]:
# Preprocess every configured website in turn; each call writes its own output files.
for website in websites:
    preprocess_texts(website)

Preprocessing texts for unity
	- Processing text for table questions
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for table answers
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for table comments
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
Preprocessing texts for ue4
	- Processing text for table questions
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for table answers
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for table comments
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
Preprocessing texts for stackoverflow
	- Processing text for table questions
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for table answers
		- Removing NaNs
		- Removing HTML tags, URLs
		- Tokenizing, stemming
	- Processing text for t