In [1]:
import re
from pathlib import Path

import altair as alt
import pandas as pd
import polars as pl

# Directory of the combined per-user summaries. The trailing slash is kept so
# file names can be appended to it directly.
path = "../../../data/users/summaries/combined/"

# Files located in that directory.
adj_matrix_path = f"{path}adj_matrix-indirects-min-100.npz"
user_stats_path = f"{path}user_stats.csv"
results_path = f"{path}indirect_user_results.csv"

In [2]:
# Get selected user characteristics: load the results table, replace NaN with 0
# and drop the columns that are not used further on.
# NOTE(review): fill_nan only replaces floating-point NaN values; empty CSV cells
# are presumably read as nulls and would be left as-is -- confirm fill_null is not needed.
users = (
    pl.read_csv(results_path)
    .fill_nan(0)
    .drop(["pg_rank", "first_date", "last_date"])
)

In [3]:
# Preview the first rows of the selected-user table.
users.head()

user_name,no_posts,no_comments,post_karma,comment_karma,total_activity,avg_post_karma,avg_comment_karma,activity_window,longevity,indirect_pg_rank,cluster
str,i64,i64,i64,i64,i64,f64,f64,f64,f64,f64,i64
"""WanderingGreybush""",0,9451,0,132739,9451,0.0,14.044969,155.0,139.208333,0.000432,8
"""Kumquat_conniption""",0,5112,0,23942,5112,0.0,4.6834,184.0,150.208333,0.000672,8
"""PrincessToadTool""",2,4062,27,51113,4064,13.5,12.5832,369.083333,348.291667,0.000295,10
"""gregsw2000""",20,3379,5236,44325,3399,261.8,13.117786,196.0,155.208333,0.000223,8
"""starryvash""",8,3154,58,83632,3162,7.25,26.5161,94.0,77.208333,0.000207,8


In [4]:
# Get all submissions.
# (`Path` and `re` are imported in the import cell at the top of the notebook.)

# File-name pattern, e.g. it matches "2021-1.csv". Written as a raw string so
# that "\d" is a regex digit class instead of an invalid string escape, and with
# the dot escaped so it only matches a literal ".".
pattern = re.compile(r"\d+-\d\.csv")

# Recursively collect the CSV files whose name starts with four characters and a dash.
# Sorting fixes the order in which files are read (glob order depends on the
# filesystem); iter() keeps `all_files` a one-shot iterator, because the loading
# cell pulls the first file from it with next().
all_files = iter(sorted(Path("../../../data/submissions/raw").glob("**/????-*.csv")))

# `pattern` is not applied above; to keep only the names it matches, filter with:
# sub_files = [f.absolute() for f in all_files if pattern.match(f.name)]

In [5]:
# Read every submissions file and stack the rows into a single pandas frame.
# The per-file frames are collected first and concatenated once, which avoids
# re-copying the accumulated frame for each additional file.
frames = [pd.read_csv(csv_file) for csv_file in all_files]

# `all_files` is a one-shot iterator: it yields nothing when no file matched the
# glob or when this cell is re-run without re-creating it. Fail with an
# actionable message instead of a bare StopIteration.
if not frames:
    raise FileNotFoundError(
        "No submission CSV files to read: re-run the cell that defines `all_files` "
        "and check that the raw submissions directory contains matching files."
    )

# Row indices of the individual files are kept as-is (no ignore_index).
df = pd.concat(frames)

In [6]:
# Submission id -> author lookup as a polars frame, used to attach authors to topic labels.
authors = pl.DataFrame(df.loc[:, ["id", "author"]])

In [7]:
# Topic label per submission id; only these two columns are kept.
topics = (
    pl.read_csv("../../../data/submissions/raw/50th_labels.csv")
    .select(["topic", "id"])
)

In [8]:
# Attach the author to every labelled submission ...
labelled_posts = topics.join(authors, on="id")
# ... and keep only the submissions written by one of the selected users.
selected_user_names = users.select(pl.col("user_name")).unique()
user_topics = selected_user_names.join(labelled_posts, left_on="user_name", right_on="author")

In [9]:
# NOTE(review): `groupby` is the spelling used by older polars releases; later
# releases rename it to `group_by` -- confirm against the pinned polars version.

# Number of labelled posts per (user, topic) pair.
user_by_topic = (
    user_topics
    .groupby(["user_name", "topic"])
    .agg(pl.col("id").count().alias("count"))
)

# Number of labelled posts per user, over all topics.
total_per_user = (
    user_topics
    .groupby(["user_name"])
    .agg(pl.col("id").count().alias("total"))
)

In [10]:
# Per-(user, topic) post counts weighted by the user's indirect PageRank.
# The constant factor 100 only rescales the weights; it cancels out wherever
# `weighted_count` is normalised into a density.
# `with_columns([...])` is used instead of `with_column`, which was deprecated
# and later removed from polars; the list form is accepted by old and new releases.
counts_by_pg = (
    users.select(["user_name", "indirect_pg_rank"])
    .join(user_by_topic, on="user_name")
    .with_columns(
        [(pl.col("count") * 100 * pl.col("indirect_pg_rank")).alias("weighted_count")]
    )
)

In [11]:
# Topic distribution ("density" = share of posts per topic) for three populations,
# each tagged with a `type` label that becomes the line colour in the chart.
# `with_columns([...])` replaces `with_column`, which was deprecated and later
# removed from polars; the list form is accepted by old and new releases.

# 1) All labelled submissions.
all_posts = (
    topics.groupby("topic")
    .agg((pl.col("id").count() / len(topics)).alias("density"))
    .with_columns([pl.lit("All users").alias("type")])
    .to_pandas()
)

# 2) Submissions of the selected users, unweighted.
selected_user_posts = (
    user_by_topic.groupby("topic")
    .agg(pl.col("count").sum())
    .with_columns(
        [
            (pl.col("count") / pl.col("count").sum()).alias("density"),
            pl.lit("Most active users").alias("type"),
        ]
    )
    .to_pandas()
)

# 3) Submissions of the selected users, weighted by indirect PageRank.
pg_weighted_posts = (
    counts_by_pg.groupby("topic")
    .agg(pl.col("weighted_count").sum().alias("count"))
    .with_columns(
        [
            (pl.col("count") / pl.col("count").sum()).alias("density"),
            pl.lit("Weighted by connectedness").alias("type"),
        ]
    )
    .to_pandas()
)

# One line per population; the chart is the last expression so the cell displays it.
chart_data = pd.concat([all_posts, selected_user_posts, pg_weighted_posts])
alt.Chart(chart_data).mark_line().encode(
    x='topic:O',
    y='density',
    color=alt.Color("type", legend=alt.Legend(orient='bottom', title="")),
).properties(height=300, width=600)

In [None]:
# Save the three stacked topic-density tables next to the other user summaries.
# NOTE(review): the pandas index is written as the first CSV column here, unlike
# the for_multinomial.csv export, which passes index=False -- confirm this is intended.
chart_frames = [all_posts, selected_user_posts, pg_weighted_posts]
pd.concat(chart_frames).to_csv(path + "user_topic_chart.csv")

## Setup for a Bayesian model

Let's check whether the probability of "success" (a user choosing to post an entry on a specific topic) is related to that user's attributes.

This will be the setup:

There are $n$ users that have $m$ covariates and post across $k$ topics. Then:

$Y$ (n x k) where $Y_i^j$ denotes observed count of topic $i$ posted by user $j$

$c$ (n x 1) where $c^j$ denotes total number of posts made by user $j$

$p$ (n x k) where $p_i^j$ denotes the probability that a post made by user $j$ is of topic $i$

$X$ (n x m) where $X_l^j$ denotes attribute $l$ of user $j$

$Y \sim  Multinomial(p, c)$

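$p = softmax(Z)$, applied row-wise so that each user's topic probabilities sum to 1

$Z = \mathbf{1}_n \alpha^\top + X \beta^\top$ (n x k), i.e. $Z_i^j = \alpha_i + \sum_l X_l^j \beta_{il}$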

$\beta$ (k x m) $\sim Normal(0, scale=0.001)$

$\alpha$ (k x 1) $\sim Normal(0, scale=0.001)$

 



In [169]:
# Total number of labelled posts per user (the vector c of the model setup),
# ordered by user name.
c = total_per_user.sort("user_name")

In [172]:
# User attributes (the matrix X of the model setup). Joining with total_per_user
# restricts the rows to users that appear in it; the joined `total` column and
# the `cluster` column are then dropped again.
X = (
    users
    .join(total_per_user, on="user_name")
    .drop(["cluster", "total"])
)

In [201]:
# Observed counts (the matrix Y of the model setup): one row per user, one column
# per topic, 0 where a user has no post on a topic. The pivot is done in pandas
# and the result is converted back to a polars frame.
topic_counts_wide = (
    user_by_topic.to_pandas()
    .pivot_table(values="count", index="user_name", columns="topic", aggfunc='sum', sort=True, fill_value=0)
    .reset_index()
)
Y = pl.DataFrame(topic_counts_wide)

In [206]:
# Assemble attributes, per-user totals and per-topic counts into one table keyed
# by user_name, and export it without the pandas index.
model_input = X.join(c, on="user_name").join(Y, on="user_name")
model_input.to_pandas().to_csv(path + "for_multinomial.csv", index=False)