## Visualization of topic distributions across user groups

In [1]:
import polars as pl
import pandas as pd
import altair as alt
import sqlite3
from pathlib import Path
import re

# Root of the data directory, relative to this notebook. Kept as a plain
# string with a trailing slash because later cells build paths by
# concatenating onto it.
path = "../../data/"
# Location of the SQLite database file that the cells below connect to.
DB_PATH = f"{path}users/users.sqlite.db"

In [2]:
# Open the SQLite database and (re)load the `authors` and `topics` tables
# from the raw submission CSV files, then index the columns used by the
# joins and GROUP BYs in the cells below.
conn = sqlite3.connect(Path(DB_PATH).absolute())

# Push (submission id, author) pairs into SQLite.
# Raw string: "\d" inside a plain string literal is an invalid escape
# sequence and triggers a warning; the compiled pattern is unchanged.
# NOTE(review): `pattern` is not referenced in the visible cells -- confirm
# whether it is still needed.
pattern = re.compile(r"\d+-\d.csv")
all_files = Path(path + "submissions/raw").glob("**/????-*.csv")
df = pd.concat([pd.read_csv(f) for f in all_files])
df[['id', 'author']].to_sql("authors", conn, index=False, if_exists='replace')

# Push topic labels into SQLite.
df = pd.read_csv(path + "submissions/raw/50th_labels.csv")
df[["topic", "id"]].to_sql("topics", conn, index=False, if_exists="replace")

cur = conn.cursor()
cur.execute("CREATE INDEX IF NOT EXISTS sub_id ON topics(id)")
# Index names are unique per SQLite database, not per table. The original
# reused the name "sub_id" here, so IF NOT EXISTS silently skipped this
# statement and authors(id) was never indexed. Use a distinct name.
cur.execute("CREATE INDEX IF NOT EXISTS author_sub_id ON authors(id)")
cur.execute("CREATE INDEX IF NOT EXISTS user_name_index ON authors(author)")
cur.execute("CREATE INDEX IF NOT EXISTS topic_index ON topics(topic)")

<sqlite3.Cursor at 0x7f21882def10>

In [3]:
# Baseline distribution: number of submissions per topic over the whole
# `topics` table, with no filtering by user. `density` is each topic's
# share of the total count.
simple_counts = pd.DataFrame(
    cur.execute(
        "SELECT topic, COUNT(id) FROM topics GROUP BY topic ORDER BY topic ASC"
    ).fetchall(),
    columns=["topic", "count"],
).assign(
    type="All users",
    density=lambda frame: frame["count"] / frame["count"].sum(),
)

In [4]:
# Per-topic submission counts restricted to submissions whose author has
# `is_selected` set in the `users` table.
# NOTE(review): `users` is not created in the visible cells; it is assumed
# to already exist in the database.
sql = """
SELECT topic, COUNT(topics.id) FROM topics 
JOIN authors ON topics.id = authors.id 
JOIN users ON users.user_name = authors.author
WHERE users.is_selected
GROUP BY topic ORDER BY topic ASC
"""

selected_user_counts = pd.DataFrame(
    cur.execute(sql).fetchall(),
    columns=["topic", "count"],
).assign(
    type="Most active users",
    # Share of the selected users' submissions that falls into each topic.
    density=lambda frame: frame["count"] / frame["count"].sum(),
)

In [15]:
# Per-topic counts weighted by `direct_pg`: for every selected user the
# number of submissions in a topic is multiplied by that user's `direct_pg`
# value (CTE `user_level`), and the weighted values are summed per topic.
sql = """
WITH user_level AS (
    SELECT topic, user_name, COUNT(topics.id) * direct_pg as c FROM topics 
    JOIN authors ON topics.id = authors.id 
    JOIN users ON users.user_name = authors.author
    WHERE users.is_selected
    GROUP BY topic, user_name
)

SELECT topic, SUM(c) FROM user_level
    GROUP BY topic 
    ORDER BY topic ASC
"""

direct_pg_counts = pd.DataFrame(
    cur.execute(sql).fetchall(),
    columns=["topic", "count"],
).assign(
    type="Weighted by direct connectedness",
    # Normalise the weighted sums so they add up to 1 across topics.
    density=lambda frame: frame["count"] / frame["count"].sum(),
)

In [16]:
# Per-topic counts weighted by `indirect_pg`: for every selected user the
# number of submissions in a topic is multiplied by that user's
# `indirect_pg` value (CTE `user_level`), then summed per topic.
sql = """
WITH user_level AS (
    SELECT topic, user_name, COUNT(topics.id) * indirect_pg as c FROM topics 
    JOIN authors ON topics.id = authors.id 
    JOIN users ON users.user_name = authors.author
    WHERE users.is_selected
    GROUP BY topic, user_name
)

SELECT topic, SUM(c) FROM user_level
    GROUP BY topic 
    ORDER BY topic ASC
"""

indirect_pg_counts = pd.DataFrame(
    cur.execute(sql).fetchall(),
    columns=["topic", "count"],
).assign(
    type="Weighted by indirect connectedness",
    # Normalise the weighted sums so they add up to 1 across topics.
    density=lambda frame: frame["count"] / frame["count"].sum(),
)

In [18]:
# Unweighted per-topic submission counts for selected users whose
# `indirect_cluster` equals 10. Counts are taken per (topic, user) in the
# CTE and then summed per topic.
sql = """
WITH user_level AS (
    SELECT topic, user_name, COUNT(topics.id) as c FROM topics 
    JOIN authors ON topics.id = authors.id 
    JOIN users ON users.user_name = authors.author
    WHERE users.is_selected AND indirect_cluster = 10
    GROUP BY topic, user_name
)

SELECT topic, SUM(c) FROM user_level
    GROUP BY topic 
    ORDER BY topic ASC
"""

cluster10 = pd.DataFrame(
    cur.execute(sql).fetchall(),
    columns=["topic", "count"],
).assign(
    type="User cluster #10",
    # Share of this cluster's submissions that falls into each topic.
    density=lambda frame: frame["count"] / frame["count"].sum(),
)

In [19]:
# Stack the five per-group tables into one long-format frame; the `type`
# column set in the earlier cells identifies which group a row came from.
all_data = pd.concat(
    [
        simple_counts,
        selected_user_counts,
        direct_pg_counts,
        indirect_pg_counts,
        cluster10,
    ]
)

# One small bar panel per topic, one bar per user group inside each panel.
# NOTE(review): the empty labelExpr presumably blanks the x tick labels
# because the legend already names the groups -- confirm.
group_axis = alt.X("type", title="", axis=alt.Axis(labelExpr=""))
density_axis = alt.Y("density", axis=alt.Axis(grid=False))
group_colour = alt.Color("type", legend=alt.Legend(orient="bottom", title=""))
topic_facet = alt.Column("topic:O", spacing=4)

# Left as a bare expression so the notebook renders the chart.
alt.Chart(all_data).mark_bar().encode(
    x=group_axis,
    y=density_axis,
    color=group_colour,
    column=topic_facet,
).properties(height=300, width=30)

In [41]:
# Hand-picked subset of topics to show, as (topic id, short label) pairs.
topic_descriptions = [
    (0, "Complaints"),
    (1, "Bosses and COVID"),
    (2, "Minimum wages"),
    (4, "Bosses"),
    (7, "Schedules"),
    (8, "Retweets"),
    (10, "Hatred"),
    (16, "Labour unions"),
    (18, "Social movement"),
]

selected_topics = pd.DataFrame(
    topic_descriptions, columns=["topic", "Description"]
)

# pd.merge defaults to an inner join, so only rows of `all_data` whose
# topic id appears in `selected_topics` are kept, each with its label.
for_chart = pd.merge(selected_topics, all_data, on="topic")

# Same layout as the full chart, but faceted by the readable description
# instead of the numeric topic id, with wider panels.
group_axis = alt.X("type", title="", axis=alt.Axis(labelExpr=""))
density_axis = alt.Y("density", axis=alt.Axis(grid=False))
group_colour = alt.Color("type", legend=alt.Legend(orient="bottom", title=""))
description_facet = alt.Column("Description", spacing=10, title="Topics")

# Left as a bare expression so the notebook renders the chart.
alt.Chart(for_chart).mark_bar().encode(
    x=group_axis,
    y=density_axis,
    color=group_colour,
    column=description_facet,
).properties(height=300, width=60)