In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel

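This notebook reads the post corpus of each website from `data/{website}/texts/corpus.parquet`, saves the trained LDA models under `lda_models/`, and writes the per-post topic assignments (one parquet file per post type) to `data/{website}/topics/`.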

In [2]:
def vocab_bow(corpus):
    """Build a gensim dictionary and bag-of-words vectors for a corpus.

    Parameters
    ----------
    corpus
        Object exposing a ``words`` attribute that supports ``.apply``; in
        this notebook, a DataFrame whose ``words`` column holds the tokens
        of each document.

    Returns
    -------
    tuple
        ``(vocab, bow)``: the ``Dictionary`` built from ``corpus.words`` and
        the result of applying its ``doc2bow`` to every entry of
        ``corpus.words``.
    """
    token_lists = corpus.words
    dictionary = Dictionary(token_lists)
    bags = token_lists.apply(dictionary.doc2bow)
    return dictionary, bags

def make_corpus(websites):
    """Load and stack the text corpora of several websites.

    Each website's ``../../data/{website}/texts/corpus.parquet`` is read and
    tagged with a ``source`` column holding the website name.

    Parameters
    ----------
    websites
        Iterable of website folder names.

    Returns
    -------
    pandas.DataFrame
        All rows of all websites, in the order given, with a fresh 0..n-1 index.
    """
    frames = [
        pd.read_parquet(f'../../data/{site}/texts/corpus.parquet').assign(source=site)
        for site in websites
    ]
    return pd.concat(frames).reset_index(drop=True)

def train_lda(websites=[], save_path='', n_topics=0, **kwargs):
    """Train an LDA model on the posts of the given websites and save it.

    The arguments are validated, and the folder of ``save_path`` is created,
    *before* training starts: training is slow (the notebook reports up to
    ~56 minutes), so a bad path must not be discovered only at save time.

    Parameters
    ----------
    websites : list of str
        Website folder names passed to ``make_corpus``. Must not be empty.
    save_path : str
        File the trained model is saved to. Must not be empty.
    n_topics : int
        Number of topics to fit. Must be at least 1.
    **kwargs
        Ignored. Accepted so that a whole config dict (which also carries
        keys such as ``excluded_topics``) can be splatted into the call.

    Raises
    ------
    ValueError
        If ``websites`` is empty, ``save_path`` is empty, or ``n_topics < 1``.
    """
    # The signature defaults are placeholders, not usable values: reject them early.
    if len(websites) == 0:
        raise ValueError('train_lda: `websites` must name at least one website')
    if not save_path:
        raise ValueError('train_lda: `save_path` must be a non-empty path')
    if n_topics < 1:
        raise ValueError(f'train_lda: `n_topics` must be >= 1, got {n_topics!r}')

    # Make sure the model can be written once training is done.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    corpus = make_corpus(websites)
    vocab, bow = vocab_bow(corpus)
    lda = LdaModel(
        bow,
        random_state=42,  # fixed seed so retraining reproduces the same topics
        id2word=vocab,
        alpha='auto',
        eta='auto',
        passes=1,
        eval_every=5,
        num_topics=n_topics)

    lda.save(save_path)
    
def main_topic(ts, excluded={}):
    """Return the id of the highest-weighted topic that is not excluded.

    Parameters
    ----------
    ts
        Iterable of pairs whose first item is a topic id and whose second
        item is the weight used for ranking.
    excluded
        Container of topic ids to ignore. It is only read, never modified.

    Returns
    -------
    The first item of the best remaining pair (the earliest one on ties),
    or ``np.nan`` when no pair remains after filtering.
    """
    candidates = [pair for pair in ts if pair[0] not in excluded]
    if not candidates:
        return np.nan

    best = max(candidates, key=lambda pair: pair[1])
    return best[0]

def assign_topics(corpus, lda_path):
    """Add a ``topics`` column holding the model's topic output for each post.

    Documents are converted to bag-of-words with the dictionary stored in the
    model (``lda.id2word``), so token ids always match the ids the model was
    trained with. Rebuilding a dictionary from ``corpus`` only yields matching
    ids when ``corpus`` is exactly the training corpus.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must have a ``words`` column with the tokens of each document.
        Modified in place: the ``topics`` column is added or overwritten.
    lda_path : str
        Path of a model previously written with ``LdaModel.save``.

    Returns
    -------
    None
    """
    lda = LdaModel.load(lda_path)

    vocab = lda.id2word
    if hasattr(vocab, 'doc2bow'):
        bow = corpus.words.apply(vocab.doc2bow)
    else:
        # The model carries no usable dictionary (e.g. its id2word file is
        # missing): fall back to a dictionary rebuilt from this corpus, which
        # is only correct if this is the corpus the model was trained on.
        _, bow = vocab_bow(corpus)

    # Indexing the model with a bag-of-words yields that document's topics;
    # main_topic later reads the entries as (topic id, weight) pairs.
    corpus['topics'] = bow.apply(lambda doc: lda[doc])

def topic_words(lda_path, n_words):
    """List the top words of every topic of a saved LDA model.

    The words are taken from the unformatted ``show_topics`` output rather
    than parsed out of the formatted ``weight*"word" + ...`` string, so
    tokens that contain ``+``, ``*`` or quotes (e.g. ``c++``) come back intact.

    Parameters
    ----------
    lda_path : str
        Path of a model previously written with ``LdaModel.save``.
    n_words : int
        Number of top words to return per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic with columns ``topic`` (topic id) and ``words``
        (list of that topic's top words, in the order the model reports them).
    """
    lda = LdaModel.load(lda_path)
    topics = lda.show_topics(
        num_topics=lda.num_topics,  # ask for every topic, not the default subset
        num_words=n_words,
        formatted=False)            # (word, weight) pairs instead of one string
    rows = [(topic_id, [word for word, _ in terms]) for topic_id, terms in topics]
    return pd.DataFrame(rows, columns=['topic', 'words'])

In [3]:
# One settings dict per model. `websites`, `save_path` and `n_topics` are the
# arguments of train_lda; `excluded_topics` holds the topic ids that are
# skipped when a post's main topic is chosen.

# Unity + UE4 model
lda_uu = dict(
    websites=['unity', 'ue4'],
    save_path='../lda_models/lda_unity_ue4.model',
    n_topics=30,
    excluded_topics={10, 18, 24},
)

# Stack Overflow + GameDev Stack Exchange model
lda_ss = dict(
    websites=['stackoverflow', 'gamedev_se'],
    save_path='../lda_models/lda_so_se.model',
    n_topics=30,
    excluded_topics={24, 19, 29, 3, 4},
)

# Active configuration used by every cell below.
lda = lda_uu # <- change this to run for different websites

In [4]:
# Uncomment to (re)train the selected model; training takes about 56 min for Unity + UE4 and 11 min for SO + SE.
#train_lda(**lda)

In [5]:
# Load the posts of the selected websites, then let the saved model attach
# its per-post topic output as the `topics` column (added in place).
corpus = make_corpus(websites=lda['websites'])
assign_topics(corpus, lda_path=lda['save_path'])

In [6]:
# Keep, for every post, the highest-weighted topic that is not excluded
# for the active configuration.
corpus['topic'] = corpus['topics'].apply(main_topic, excluded=lda['excluded_topics'])

# Posts without any words get no topic at all.
has_no_words = corpus.words.apply(len) == 0
corpus.loc[has_no_words, 'topic'] = None

In [7]:
# Top 15 words of every topic of the selected model
# (presumably inspected to choose the topic labels below; confirm).
words = topic_words(lda_path=lda['save_path'], n_words=15)

In [9]:
# Drop everything except the post identifiers and the chosen topic.
corpus = corpus.loc[:, ['id', 'source', 'post_type', 'topic']].copy()

In [10]:
# Human-readable label for every kept topic id of each model. The ids that
# are absent from a map are exactly that model's `excluded_topics`.

# Stack Overflow + GameDev Stack Exchange model (ids 3, 4, 19, 24, 29 excluded)
lda_ss['topic_map'] = {
    0:  'Event handling',
    1:  'File management',
    2:  'Game environments',
    5:  'Collisions',
    6:  'Object Oriented Programming',
    7:  'Positioning',
    8:  'GUI',
    9:  'Game engines',
    10: 'Viewport',
    11: 'Publishing apps',
    12: 'Movement',
    13: 'Android development',
    14: 'Camera',
    15: 'Geometry',
    16: 'General programming',
    17: 'Sound/audio',
    18: 'User accounts',
    20: 'Game objects',
    21: '3D modeling',
    22: 'Rendering',
    23: 'Character animation',
    25: 'Networking',
    26: 'Display',
    27: 'Game mechanics',
    28: 'Runtime',
}

# Unity + UE4 model (ids 10, 18, 24 excluded)
lda_uu['topic_map'] = {
    0:  'General programming',
    1:  'Game objects',
    2:  'Meshes',
    3:  'Bug reports',
    4:  'Programming errors',
    5:  'Event handling',
    6:  'Geometry',
    7:  'Game loop',
    8:  'Networking',
    9:  'GUI',
    11: 'Collisions',
    12: 'Object Oriented Programming',
    13: 'Character animation',
    14: 'Materials',
    15: 'Game engines',
    16: 'Runtime',
    17: 'Rendering',
    19: 'General errors',
    20: 'Positioning',
    21: '3D modeling',
    22: 'Camera and display',
    23: 'FX',
    25: 'Lighting',
    26: 'Game mechanics',
    27: 'Movement',
    28: 'External tools',
    29: 'File management',
}

# Translate topic ids into labels with the active model's map; posts
# without a topic (NaN) stay NaN.
corpus['topic_label'] = corpus['topic'].apply(
    lambda n: lda['topic_map'][n] if pd.notna(n) else n
)

In [11]:
# The three coarse categories every topic label is grouped into.
gen_dev = 'General software development'
bugs = 'Bugs, crashes, and errors'
games = 'Game development'

# Topic label -> category, covering the labels of both models' topic maps.
categories = {
    'Event handling': gen_dev,
    'File management': gen_dev,
    'Game environments': games,
    'Collisions': games,
    'Object Oriented Programming': gen_dev,
    'Positioning': games,
    'Player controls': games,
    'Game engines': games,
    'Viewport': games,
    'Publishing apps': gen_dev,
    'Movement': games,
    'Android development': gen_dev,
    'Camera and display': games,
    'Camera': games,
    'Geometry': games,
    'General programming': gen_dev,
    'Sound/audio': games,
    'User accounts': gen_dev,
    'Game objects': games,
    '3D modeling': games,
    'Rendering': games,
    'Character animation': games,
    'Networking': gen_dev,
    'Display': games,
    'Game mechanics': games,
    'Runtime': gen_dev,
    'Meshes': games,
    'Bug reports': bugs,
    'Programming errors': bugs,
    'Game loop': games,
    'GUI': games,
    'Materials': games,
    'General errors': bugs,
    'FX': games,
    'Lighting': games,
    'External tools': games,
}

# Look up the category of every labelled post; unlabelled posts (NaN) stay NaN.
corpus['category'] = corpus['topic_label'].apply(
    lambda l: categories[l] if pd.notna(l) else l
)

In [12]:
# Write one parquet file per (website, post type) pair:
# ../../data/{website}/topics/{post_type}s.parquet
for s in corpus.source.unique():
    # Create the output folder first: to_parquet does not create missing
    # directories, so a first run would otherwise fail here.
    out_dir = Path(f'../../data/{s}/topics')
    out_dir.mkdir(parents=True, exist_ok=True)

    for p in corpus.post_type.unique():
        df = corpus[(corpus.source == s) & (corpus.post_type == p)]
        # Both columns are constant within a file and already encoded in its path.
        df = df.drop(columns=['source', 'post_type'])
        df.to_parquet(out_dir / f'{p}s.parquet')