In [1]:
import pandas as pd

In [2]:
# Load the FORUM scores for every sorting policy and discussion and reshape
# them into one long table (`q_df`) holding a single score per row.

# Comment-window label (stored in column `n`) -> numeric suffix used in the
# score file names: df_<feature>_list_<suffix>.csv
file_suffix_by_n = {'10': 1, 'N': 4}
features = ['lexdiv', 'sentcomp', 'sim', 'smog']

score_frames = []
for ncomments, c in file_suffix_by_n.items():
    for feature in features:
        # The first CSV column becomes the index; the remaining columns are
        # melted below into (variable, value) pairs.
        df = pd.read_csv(f'model_output/forum-scores/df_{feature}_list_{c}.csv', index_col=0)
        # Wide -> long: one row per (index value, original column).
        df = df.reset_index().melt(id_vars='index')
        df['n'] = ncomments
        df['feature'] = feature
        score_frames.append(df)

# ignore_index=True gives every row a unique label. Without it each file's
# 0..N-1 melt index is repeated once per file, and the column-wise concat
# further down would have to align on duplicated labels.
q_df = pd.concat(score_frames, ignore_index=True)

# Column names are '_'-separated: the first token is the policy, the last
# token is the replies setting, and an optional 'pinned' token may sit in
# between (e.g. 'chron_rh', 'random_pinned_rl').
name_parts = q_df['variable'].str.split('_')
q_df['policy'] = name_parts.str[0]
q_df['replies'] = name_parts.str[-1]
q_df['pinned'] = q_df['variable'].str.contains('pinned').astype(int)

# One-hot encode policy and replies. dtype=int yields 0/1 integer columns
# directly, independent of the pandas version's default dummy dtype.
policy_dummies = pd.get_dummies(q_df['policy'], prefix='policy', dtype=int)
replies_dummies = pd.get_dummies(q_df['replies'], prefix='replies', dtype=int)

# Append the dummy columns to the long table
q_df = pd.concat([q_df, policy_dummies, replies_dummies], axis=1)

# Give the melt-generated columns descriptive names
# (presumably the CSV index holds the discussion id; verify against the score files)
q_df = q_df.rename(columns={'index': 'discussion', 'variable': 'sorting policy'})

q_df

Unnamed: 0,discussion,sorting policy,value,n,feature,policy,replies,pinned,policy_chron,policy_least-neg-abs,...,policy_pin-pred-xgb,policy_pos-abs,policy_pos-rel,policy_pred-nb,policy_pred-xgb,policy_random,policy_rev-chron,replies_rh,replies_rl,replies_rt
0,1,chron_rh,0.337720,10,lexdiv,chron,rh,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2,chron_rh,-0.125603,10,lexdiv,chron,rh,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,3,chron_rh,0.133171,10,lexdiv,chron,rh,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,4,chron_rh,0.412392,10,lexdiv,chron,rh,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,5,chron_rh,0.039650,10,lexdiv,chron,rh,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172585,2611,random_pinned_rl,0.075738,N,smog,random,rl,1,0,0,...,0,0,0,0,0,1,0,0,1,0
172586,2612,random_pinned_rl,-0.133516,N,smog,random,rl,1,0,0,...,0,0,0,0,0,1,0,0,1,0
172587,2613,random_pinned_rl,-0.029820,N,smog,random,rl,1,0,0,...,0,0,0,0,0,1,0,0,1,0
172588,2614,random_pinned_rl,-0.058353,N,smog,random,rl,1,0,0,...,0,0,0,0,0,1,0,0,1,0


In [3]:
# Write the combined score table to disk, leaving the DataFrame index out of the file
q_df.to_csv(
    path_or_buf='data/q_df.csv',
    index=False,
)