In [1]:
import pandas as pd
from scipy.stats import mannwhitneyu

This notebook reads `questions.csv`, `answers.csv`, and `comments.csv` from `data/{website}/` and writes the comparison tables to `data/topic_comparisons/`.

In [2]:
# Code for cliffsDelta is from https://github.com/neilernst/cliffsDelta/blob/master/cliffsDelta.py

def cliffsDelta(lst1, lst2):
    """Return Cliff's delta for two samples.

    The result is (#pairs with x > y  -  #pairs with x < y) / (len(lst1) * len(lst2)),
    where x ranges over `lst1` and y over `lst2`. Both samples are sorted so the
    pair counts can be accumulated in a single merge-like sweep.
    """
    size1 = len(lst1)
    size2 = len(lst2)
    ordered2 = sorted(lst2)

    pos = 0            # index of the first element of ordered2 not yet passed
    greater_pairs = 0  # pairs where the lst1 value is larger
    smaller_pairs = 0  # pairs where the lst1 value is smaller

    for count, value in runs(sorted(lst1)):
        # Skip everything in lst2 strictly below the current value.
        while pos < size2 and ordered2[pos] < value:
            pos += 1
        greater_pairs += pos * count

        # Skip ties; whatever remains in lst2 is strictly above the value.
        while pos < size2 and ordered2[pos] == value:
            pos += 1
        smaller_pairs += (size2 - pos) * count

    return (greater_pairs - smaller_pairs) / (size1 * size2)

def runs(lst):
    """Yield ``(count, value)`` for each run of consecutive equal values.

    Parameters
    ----------
    lst : iterable
        Values to chunk; only adjacent equal items are merged, so callers that
        want one chunk per distinct value must pass sorted input.

    Yields
    ------
    tuple
        ``(run_length, value)`` in input order. An empty iterable yields
        nothing (previously this raised UnboundLocalError).
    """
    count = 0
    prev = None
    for item in lst:
        # A change of value closes the current run (never true before the first item).
        if count and prev != item:
            yield count, prev
            count = 0
        prev = item
        count += 1
    # Emit the trailing run, if there was any input at all.
    if count:
        yield count, prev
    
def _delta_magnitude(d):
    """Label the magnitude of a Cliff's delta value, based on its absolute value."""
    size = abs(d)
    if size <= 0.147:
        return 'negligible'
    if size <= 0.33:
        return 'small'
    if size <= 0.474:
        return 'medium'
    return 'large'


def compare(topic, others, res, threshold=None):
    """Test whether `topic` differs from `others` and record the outcome in `res`.

    One-sided Mann-Whitney U tests are run ('greater' first, then 'less'). If
    one is significant, the effect size is measured with Cliff's delta; unless
    that effect is negligible, the means and medians of both samples are stored.

    Parameters
    ----------
    topic, others : pandas.Series
        The two samples (`.mean()` and `.median()` are called on them).
    res : dict
        Mutated in place. May gain the keys 'difference', 'delta', 'mean',
        'mean_others', 'diff_mean', 'med', 'med_others' and 'diff_med'.
        The 'diff_*' values are `others` minus `topic`.
    threshold : float, optional
        Significance level for the p-values. When omitted, the notebook-level
        `thr` is used, which keeps existing three-argument calls working.

    Returns
    -------
    None
    """
    alpha = thr if threshold is None else threshold

    try:
        if mannwhitneyu(topic, others, alternative='greater').pvalue < alpha:
            res['difference'] = 'greater'
        elif mannwhitneyu(topic, others, alternative='less').pvalue < alpha:
            res['difference'] = 'less'
    except ValueError:
        # Deliberate best effort: samples that mannwhitneyu refuses to test
        # are simply reported without a 'difference'.
        pass

    if 'difference' in res:
        # The magnitude must be judged on |d|: a 'less' result has a negative
        # delta and would otherwise always be classified as negligible.
        res['delta'] = _delta_magnitude(cliffsDelta(topic, others))

        if res['delta'] != 'negligible':
            res['mean'] = topic.mean()
            res['mean_others'] = others.mean()
            res['diff_mean'] = res['mean_others']-res['mean']

            res['med'] = topic.median()
            res['med_others'] = others.median()
            res['diff_med']  = res['med_others']-res['med']


def _one_vs_rest(df, aspects, thr, filter_col, label_col, res_key):
    """Compare each group in `label_col` against all remaining rows of `df`.

    Rows whose `filter_col` is missing are dropped first. One result dict is
    produced per (group, aspect) pair, keyed by `res_key` and 'aspect'.
    """
    df = df[~df[filter_col].isna()]
    records = []

    for label in df[label_col].unique():
        inside = df[df[label_col] == label]
        outside = df[df[label_col] != label]
        for aspect in aspects:
            res = {res_key: label, 'aspect': aspect}
            compare(inside[aspect], outside[aspect], res, thr)
            records.append(res)

    return pd.DataFrame(records)


def _site_vs_site(df1, df2, aspects, thr, filter_col, label_col, res_key):
    """Compare each group of `df1` with the same-named group of `df2`.

    Rows whose `filter_col` is missing are dropped from both frames. Only the
    groups present in `df1` are iterated.
    """
    df1 = df1[~df1[filter_col].isna()]
    df2 = df2[~df2[filter_col].isna()]
    records = []

    for label in df1[label_col].unique():
        rows1 = df1[df1[label_col] == label]
        rows2 = df2[df2[label_col] == label]
        for aspect in aspects:
            res = {res_key: label, 'aspect': aspect}
            compare(rows1[aspect], rows2[aspect], res, thr)
            records.append(res)

    return pd.DataFrame(records)


def one_vs_all(df, aspects, thr):
    """Compare every topic against all other topics of the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'topic', 'topic_label' and every column in `aspects`.
    aspects : list of str
        Columns to compare.
    thr : float
        Significance level forwarded to `compare`.

    Returns
    -------
    pandas.DataFrame
        One row per (topic, aspect) with the keys written by `compare`.
    """
    return _one_vs_rest(df, aspects, thr, 'topic', 'topic_label', 'topic')


def compare_websites(df1, df2, aspects, thr):
    """Compare each topic of `df1` with the same topic in `df2`.

    Both frames must contain 'topic', 'topic_label' and every column in
    `aspects`; `thr` is the significance level forwarded to `compare`.
    Returns one row per (topic, aspect).
    """
    return _site_vs_site(df1, df2, aspects, thr, 'topic', 'topic_label', 'topic')


def one_vs_all_categories(df, aspects, thr):
    """Compare every category against all other categories of the same frame.

    `df` must contain 'category' and every column in `aspects`; `thr` is the
    significance level forwarded to `compare`. Returns one row per
    (category, aspect).
    """
    return _one_vs_rest(df, aspects, thr, 'category', 'category', 'category')


def compare_websites_categories(df1, df2, aspects, thr):
    """Compare each category of `df1` with the same category in `df2`.

    Both frames must contain 'category' and every column in `aspects`; `thr`
    is the significance level forwarded to `compare`. Returns one row per
    (category, aspect).
    """
    return _site_vs_site(df1, df2, aspects, thr, 'category', 'category', 'category')

In [3]:
# Columns ("aspects") to compare, keyed by post type. The keys are also used
# as the CSV base names when the per-website data is loaded.
aspects = {
    'questions': ["n_responses", "len_text", "is_answered", "is_resolved", "has_code"],
    'answers': ["is_accepted", "len_text", "has_code"],
    'comments': ["len_text", "has_code"]
}

In [4]:
# Significance threshold: 0.05 divided by the total number of comparisons
# (group count times the number of aspects over all post types).
# NOTE(review): 27, 27, 25 and 25 are presumably the topic counts of the four
# websites — confirm against the data.
comparisons = (27+27+25+25) * sum(len(l) for l in aspects.values())
thr = 0.05/comparisons

# One result frame per (website, post type); they are concatenated and saved below.
dfs = []
for w in ['unity', 'ue4', 'stackoverflow', 'gamedev_se']:
    print(w)
    for t in ['questions', 'answers', 'comments']:
        print(f'\t{t}')
        df = pd.read_csv(f'../data/{w}/{t}.csv')
        df = one_vs_all(df, aspects[t], thr)
        
        # Tag the rows so their origin survives the concatenation.
        df['website'] = w
        df['post_type'] = t
        
        dfs.append(df)

# `to_csv` returns None when given a path, so `df` is None after this line.
df = pd.concat(dfs).reset_index(drop=True).to_csv('../data/topic_comparisons/topic_comparisons.csv', index=False)

unity
	questions
	answers
	comments
ue4
	questions
	answers
	comments
stackoverflow
	questions
	answers
	comments
gamedev_se
	questions
	answers
	comments


In [5]:
# Significance threshold: 0.05 divided by the total number of comparisons.
# NOTE(review): 3, 3, 2 and 2 are presumably the category counts of the four
# websites — confirm against the data.
comparisons = (3+3+2+2) * sum(len(l) for l in aspects.values())
thr = 0.05/comparisons

# One result frame per (website, post type); they are concatenated and saved below.
dfs = []
for w in ['unity', 'ue4', 'stackoverflow', 'gamedev_se']:
    print(w)
    for t in ['questions', 'answers', 'comments']:
        print(f'\t{t}')
        df = pd.read_csv(f'../data/{w}/{t}.csv')
        df = one_vs_all_categories(df, aspects[t], thr)
        
        # Tag the rows so their origin survives the concatenation.
        df['website'] = w
        df['post_type'] = t
        
        dfs.append(df)

# `to_csv` returns None when given a path, so `df` is None after this line.
df = pd.concat(dfs).reset_index(drop=True).to_csv('../data/topic_comparisons/category_comparisons.csv', index=False)

unity
	questions
	answers
	comments
ue4
	questions
	answers
	comments
stackoverflow
	questions
	answers
	comments
gamedev_se
	questions
	answers
	comments


In [6]:
dfs = []
# Significance threshold: 0.05 divided by the total number of comparisons.
# NOTE(review): 25 and 27 are presumably the topic counts of the two website
# pairs — confirm against the data.
comparisons = (25+27) * sum(len(l) for l in aspects.values())
thr = 0.05/comparisons

# Each pair is compared topic by topic; topics are taken from the first site.
for w1, w2 in [('unity', 'ue4'), ('stackoverflow', 'gamedev_se')]:
    print(w1, w2)
    for t in ['questions', 'answers', 'comments']:
        print(f'\t{t}')
        df1 = pd.read_csv(f'../data/{w1}/{t}.csv')
        df2 = pd.read_csv(f'../data/{w2}/{t}.csv')
        
        # NOTE(review): the reason for doubling the threshold here is not
        # evident from this notebook — confirm it is intended.
        df = compare_websites(df1, df2, aspects[t], thr*2)
        
        # Tag the rows so their origin survives the concatenation.
        df['websites'] = w1 + '/' + w2
        df['post_type'] = t
        
        dfs.append(df)

# `to_csv` returns None when given a path, so `df` is None after this line.
df = pd.concat(dfs).reset_index(drop=True).to_csv('../data/topic_comparisons/websites_topic_comparisons.csv', index=False)

unity ue4
	questions
	answers
	comments
stackoverflow gamedev_se
	questions
	answers
	comments


In [7]:
dfs = []
# Significance threshold: 0.05 divided by the total number of comparisons.
# NOTE(review): 3 and 2 are presumably the category counts of the two website
# pairs — confirm against the data.
comparisons = (3+2) * sum(len(l) for l in aspects.values())
thr = 0.05/comparisons
# Each pair is compared category by category; categories are taken from the first site.
for w1, w2 in [('unity', 'ue4'), ('stackoverflow', 'gamedev_se')]:
    print(w1, w2)
    for t in ['questions', 'answers', 'comments']:
        print(f'\t{t}')
        df1 = pd.read_csv(f'../data/{w1}/{t}.csv')
        df2 = pd.read_csv(f'../data/{w2}/{t}.csv')
        
        # NOTE(review): the reason for doubling the threshold here is not
        # evident from this notebook — confirm it is intended.
        df = compare_websites_categories(df1, df2, aspects[t], thr*2)
        
        # Tag the rows so their origin survives the concatenation.
        df['websites'] = w1 + '/' + w2
        df['post_type'] = t
        
        dfs.append(df)

# `to_csv` returns None when given a path, so `df` is None after this line.
df = pd.concat(dfs).reset_index(drop=True).to_csv('../data/topic_comparisons/websites_categories_comparisons.csv', index=False)

unity ue4
	questions
	answers
	comments
stackoverflow gamedev_se
	questions
	answers
	comments
