In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import h5py
import os

In [2]:
# Root directory of all input and output data. The code below builds paths by
# plain string concatenation (e.g. data_path+'reputations/'), so the trailing
# slash is required.
data_path = "../data/"

In [3]:
def gini_coefficient(x):
    """Compute the Gini coefficient of the values in x.

    The coefficient is ``sum_{i<j} |x_i - x_j| / (n**2 * mean(x))``, i.e. the
    mean absolute difference between all pairs divided by twice the mean.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of numbers (list, numpy array or pandas
        Series). NaN entries are treated as missing: they are removed before
        the computation, so they count neither towards ``n`` nor the mean.

    Returns
    -------
    float
        The Gini coefficient, or NaN when no values remain or they sum to zero.
    """
    values = np.asarray(x, dtype=float).ravel()
    values = np.sort(values[~np.isnan(values)])
    n = values.size
    total = values.sum()
    if n == 0 or total == 0:
        return np.nan
    # For ascending values the pairwise sum collapses to
    # sum_k (2k - n - 1) * x_k with k = 1..n, which replaces the O(n^2) loop.
    weights = 2 * np.arange(1, n + 1) - n - 1
    # n**2 * mean(x) == n * sum(x)
    return np.dot(weights, values) / (n * total)

In [4]:
def dyn_rep_aggregates(community,save_path,threshold=1):
    '''
    Aggregation of dynamical reputation within community.

    Inputs:
    - community: SE website name, used to locate
      '<data_path>reputations/<community>_first_180_days_eng_reputation.csv'
    - save_path: directory where the resulting dataframe is written as
      '<community>_dynamic_reputation_aggregates.csv' (created if missing)
    - threshold: a user counts as active on a day when their dynamical
      reputation is strictly above this value, default 1

    Output dataframe (also returned) has one row per column of the reputation
    file and the following columns:
    - Day - integer days, corresponding to columns of dynamical reputation input file
    - Number of active users - users with reputation above threshold
    - Mean user reputation - among active users
    - Gini coefficient - among all users
    - Gini coeficient active users - among active users only (the misspelt
      column name is kept because it is part of the written CSV header)
    '''

    # reputation data load; the first CSV column becomes the index, every
    # remaining column is one day
    reputation_file = os.path.join(data_path, 'reputations', '%s_first_180_days_eng_reputation.csv'%(community))
    rep_data = pd.read_csv(reputation_file,index_col=0)

    is_active = rep_data>threshold

    def _gini_active(day):
        # Keep only the active users of this day. Passing the NaN-masked
        # column instead would let inactive users inflate n in the Gini
        # denominator.
        return gini_coefficient(day[day>threshold].values)

    dr_df = pd.DataFrame({
        'Day': rep_data.columns.astype(int).values,
        # number of users with reputation above the threshold
        'Number of active users': is_active.sum(axis=0),
        # mean user reputation only among active users (inactive entries are NaN and skipped)
        'Mean user reputation': rep_data[is_active].mean(),
        # gini coefficient among all users' reputations
        'Gini coefficient': rep_data.apply(gini_coefficient,axis=0),
        # gini coefficient based only on active users' reputation
        'Gini coeficient active users': rep_data.apply(_gini_active,axis=0),
    }, index=rep_data.columns)

    filename = '%s_dynamic_reputation_aggregates.csv'%(community)
    os.makedirs(save_path, exist_ok=True)
    dr_df.to_csv(os.path.join(save_path, filename),index=False)
    return dr_df

We run this function for all four pairs of communities:

In [5]:
# Four launched sites followed by the four digit-prefixed counterparts.
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    dyn_rep_aggregates(comm, data_path + 'processed data/')

In [6]:
def active_questions_within_window(community,save_path,twin=7):
    """
    Count active questions within sliding time windows.

    A question is active in window [t, t+twin) when its QId appears in at least
    one question-answer, comment or accepted-answer interaction whose 'days'
    value lies in that window. Rows containing any NaN are ignored.

    Input:
    - community: name of SE site, used to locate the three interaction files
      under '<data_path>interactions/<community>/'
    - save_path: path to directory where output dataframe should be saved;
      pass a falsy value ('' or None) to get the dataframe returned instead
    - twin: width of the sliding window in days, default is 7 (weekly counts)
    Output:
    - if save_path is falsy the dataframe is returned; otherwise it is written
      to '<community>_weekly_active_questions.csv' inside save_path (the file
      name says "weekly" regardless of twin) and None is returned
    DataFrame contains columns
    - FirstDay - first day of the sliding window, 0 .. 180-twin-1
    - Number_of_active_questions
    """

    # import interactions via questions, answers and comments
    interactions_dir = os.path.join(data_path, 'interactions', community)
    qa = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_questions_answers.csv'%(community)))
    comm = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_comments.csv'%(community)))
    acc = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_acc_answers.csv'%(community)))

    data = []
    for t in range(180-twin):
        # union of question ids seen in any interaction type within [t, t+twin)
        active_ids = set()
        for frame in (qa, comm, acc):
            in_window = (frame['days']>=t)&(frame['days']<t+twin)
            active_ids.update(frame[in_window].dropna()['QId'])
        data.append([t, len(active_ids)])

    data_df = pd.DataFrame(data, columns = ['FirstDay','Number_of_active_questions'])
    if save_path:
        filename = '%s_weekly_active_questions.csv'%(community)
        os.makedirs(save_path, exist_ok = True)
        data_df.to_csv(os.path.join(save_path, filename),index=False)
    else:
        return(data_df)

In [7]:
# Weekly (default twin=7) active-question counts for every community.
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    active_questions_within_window(comm, data_path + 'processed data/active questions/')

In [8]:
def extract_network_and_dyn_reputation_features(net,community,save_path,twin=30):
    '''
    Build the undirected, unweighted interaction network (questions, answers,
    comments and accepted answers) for every sliding window [t, t+twin) of the
    first 180 days and relate its structure to users' dynamical reputation.

    Inputs:
    - net: "launched" or "area51"; not used by the computation, kept so that
      existing callers keep working
    - community: name of SE site, used to locate the reputation and interaction files
    - save_path: path to directory where output dataframe should be saved
    - twin: time window within which interactions are aggregated, default is 30 days
    Outputs:
    - if save_path is falsy ('') the dataframe is returned, otherwise it is
      saved as '<community>_dyn_rep_and_networks_features.csv' and None is returned
    DataFrame contains one row per window (t = 0 .. 180-twin-1) with columns
    - First day - first day of the sliding window
    - Clustering_coef - average clustering coefficient of the interaction network
    - DynRep_Degree_corr - correlation between a user's degree in the network
      and their dynamical reputation on the last day of the window
    - DynRep_BC_corr - as above, with betweenness centrality instead of degree
    - DynRep_Assortativity - correlation, taken over all interaction rows of
      the window, between the reputation of the posting user and of the
      responding user; high positive values mean users interact with users of
      similar reputation, high negative values mean high-reputation users
      mainly interact with low-reputation users
    A window without any interaction yields NaN for all four features.
    '''

    # import reputation data
    reputation_file = os.path.join(data_path, 'reputations', '%s_first_180_days_eng_reputation.csv'%(community))
    rep_data = pd.read_csv(reputation_file,index_col=0)

    # import interactions via questions, answers and comments for network:
    interactions_dir = os.path.join(data_path, 'interactions', community)
    qa = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_questions_answers.csv'%(community)))
    comm = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_comments.csv'%(community)))
    acc = pd.read_csv(os.path.join(interactions_dir, '%s_interactions_acc_answers.csv'%(community)))

    edge_columns = ['PostUserId','RespondUserId']
    nan = float('nan')

    data = [] # one row of features per sliding window
    for t in range(180-twin):

        # interaction slices within [t,t+twin); rows with NaN are dropped
        slices = [frame[(frame['days']>=t)&(frame['days']<t+twin)].dropna()[edge_columns]
                  for frame in (qa, comm, acc)]

        # full network of interactions, without self links
        fullnet = pd.concat(slices)
        fullnet = fullnet[fullnet['PostUserId']!=fullnet['RespondUserId']]

        if fullnet.empty:
            # average_clustering divides by the number of nodes, so an empty
            # window would raise; record the features as undefined instead
            data.append([t, nan, nan, nan, nan])
            continue

        network = nx.from_pandas_edgelist(fullnet, source='RespondUserId', target='PostUserId')

        # network's mean clustering coef.
        clustering_coef = nx.average_clustering(network)

        # users' reputations on the last day of the window, as columns
        # 'index' (user id) and 'dynrep'
        last_day = str(t+twin-1)
        drdf = rep_data[last_day].rename('dynrep').rename_axis('index').reset_index()

        # keep only users that are part of the interaction network; copy so the
        # column assignments below act on an independent frame
        drdf = drdf[drdf['index'].isin(list(network.nodes()))].copy()

        # add user's degree and betweenness centrality to the reputations
        drdf['degree'] = drdf['index'].map(dict(nx.degree(network)))
        drdf['betweenness_centrality'] = drdf['index'].map(nx.betweenness_centrality(network))

        # correlations between node's dynamical reputation and node's degree / betweenness centrality
        dr_degree_corr = drdf['dynrep'].corr(drdf['degree'])
        dr_bc_corr = drdf['dynrep'].corr(drdf['betweenness_centrality'])

        # assortativity: attach the reputation of the posting and of the
        # responding user to every interaction row (inner joins drop rows whose
        # users have no reputation entry) and correlate the two columns
        rep_lookup = drdf[['index','dynrep']]
        reputations_on_network = pd.merge(left=fullnet, right=rep_lookup, left_on='PostUserId', right_on='index')
        reputations_on_network = reputations_on_network.drop(columns='index').rename(columns={'dynrep':'PostUserRep'})
        reputations_on_network = pd.merge(left=reputations_on_network, right=rep_lookup, left_on='RespondUserId', right_on='index')
        reputations_on_network = reputations_on_network.rename(columns={'dynrep':'RespondUserRep'})

        dr_assortativity = reputations_on_network['PostUserRep'].corr(reputations_on_network['RespondUserRep'])

        data.append([t,clustering_coef,dr_degree_corr,dr_bc_corr,dr_assortativity])

    data_df = pd.DataFrame(data,columns=['First day','Clustering_coef','DynRep_Degree_corr','DynRep_BC_corr','DynRep_Assortativity'])

    if save_path:
        filename = '%s_dyn_rep_and_networks_features.csv'%(community)
        os.makedirs(save_path, exist_ok = True)
        data_df.to_csv(os.path.join(save_path, filename),index=False)
    else:
        return(data_df)

We run this function for all four pairs of communities:

In [9]:
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    # names starting with a digit are labelled 'area51', all others 'launched'
    net = 'area51' if comm[0].isdigit() else 'launched'
    extract_network_and_dyn_reputation_features(net, comm, data_path + 'processed data/', twin=30)

In [10]:
def core_periphery_with_dyn_rep(community,save_path,twin=30):
    '''
    Combine the core/periphery labels of every sliding-window network with the
    users' dynamical reputation.

    Inputs:
    - community: name of SE site, used to locate the reputation file and
      '<data_path>core periphery/<community>_window<twin>_core_per.hdf5'
    - save_path: path to directory where output dataframe should be saved
    - twin: time window within which interactions are aggregated, default is 30 days
    Outputs:
    - if save_path is falsy ('') the dataframe is returned, otherwise it is
      saved as '<community>_dyn_rep_and_core_periphery_features.csv' and None is returned
    DataFrame contains one row per window (t = 0 .. 180-twin-1) with columns
    - First day - first day of the sliding window
    - N_core - number of users in the network's core (label 0)
    - Frac_core - fraction of labelled users (in %) that belong to the core
    - LN_core - links per node within core
    - LN_core_periphery - number of links between core and periphery divided by total number of nodes
    - LN_periphery - links per node within periphery (label 1)
    - Mean_dr_core - mean dynamical reputation within core on the last day of the window
    - DynRep_core_per_ratio - ratio between the total reputation within core and periphery
    '''

    # import reputation data
    reputation_file = os.path.join(data_path, 'reputations', '%s_first_180_days_eng_reputation.csv'%(community))
    rep_data = pd.read_csv(reputation_file,index_col=0)

    # contains data about nodes within core and periphery as well as number of links within and between each group
    coreper_file = os.path.join(data_path, 'core periphery', '%s_window%s_core_per.hdf5'%(community,twin))

    data = [] # one row of features per sliding window
    # the context manager closes the HDF5 file even when a window fails
    with h5py.File(coreper_file, 'r') as f:
        for t in range(180-twin):
            window = '%s-%sdays'%(t,t+twin)

            # core and periphery node data
            corepernodes = pd.DataFrame(f['labels_'+window],columns=['nodes','cat'])

            # dynamical reputation of every user on the last day of the window
            rep = rep_data[str(t+twin-1)]

            # lists of nodes that belong to core and periphery
            corelist = list(corepernodes[corepernodes['cat']==0]['nodes'])
            perlist = list(corepernodes[corepernodes['cat']==1]['nodes'])

            Ncore = len(corelist) # size of core
            Nper = len(perlist) # size of periphery
            Ntotal = Ncore+Nper
            # plain Python ints: guard the division so a window without any
            # labelled node yields NaN instead of raising
            fcore = 100*Ncore/Ntotal if Ntotal else float('nan')

            # link counts; indexed [0,0] core-core, [0,1] core-periphery, [1,1] periphery-periphery
            ms = f['ms_'+window]
            ln_ratio_core = ms[0,0]/Ncore
            ln_ratio_core_periphery = ms[0,1]/Ntotal
            ln_ratio_periphery = ms[1,1]/Nper

            core_rep = rep[rep.index.isin(corelist)]
            periphery_rep = rep[rep.index.isin(perlist)]

            # mean dyn reputation within core
            mean_dr_core = core_rep.mean()

            # ratio between total dynamical reputation within core and periphery
            dr_core_periphery = core_rep.sum()/periphery_rep.sum()

            data.append([t,
                         Ncore, # core size measured as total number of nodes
                         fcore, # core size measured as % of nodes that belong to core
                         ln_ratio_core, # ratio between number of links and nodes within core
                         ln_ratio_core_periphery, # ratio between number of links and nodes between core and periphery
                         ln_ratio_periphery, # ratio between number of links and nodes within periphery
                         mean_dr_core, # mean dyn rep within core
                         dr_core_periphery # ratio of dyn rep within core and periphery
                        ])

    data_df = pd.DataFrame(data,columns=['First day','N_core','Frac_core','LN_core','LN_core_periphery','LN_periphery','Mean_dr_core','DynRep_core_per_ratio'])

    if save_path:
        filename = '%s_dyn_rep_and_core_periphery_features.csv'%(community)
        os.makedirs(save_path, exist_ok=True)
        data_df.to_csv(os.path.join(save_path, filename), index=False)
    else:
        return(data_df)

We run this function for all four pairs of communities:

In [11]:
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    core_periphery_with_dyn_rep(comm, data_path + 'processed data/', twin=30)

# data extraction for popular/casual users

In [12]:
def gather_data_per_user(qna,comm, acc):
    """Summarise per-user activity from the three interaction tables.

    Each input needs the columns 'QId', 'PostUserId' and 'RespondUserId'.
    The result has one row per user that occurs in any of the counts:
    - UserId
    - Q_numb   - questions attributed to the user as PostUserId (one per QId)
    - A_numb   - distinct questions in `qna` where the user is RespondUserId
    - C_numb   - distinct (QId, PostUserId) pairs in `comm` where the user is RespondUserId
    - ACC_numb - the same count taken on `acc`
    - total    - sum of the four counts, with missing counts taken as 0
    Counts a user does not have are NaN.
    """
    def _count_per_responder(frame, keys, label):
        # collapse repeated rows of the same key combination, then count what
        # is left for every responding user
        distinct = frame.groupby(keys).size().reset_index()
        counts = distinct.groupby('RespondUserId').size().to_frame(label).reset_index()
        return counts.rename(columns={'RespondUserId': 'UserId'})

    # one asker per question, then number of questions per asker
    askers = qna[['QId', 'PostUserId']].groupby('QId').max().reset_index()
    user_qs = askers.groupby('PostUserId').size().to_frame('Q_numb').reset_index()
    user_qs = user_qs.rename(columns={'PostUserId': 'UserId'})

    user_as = _count_per_responder(qna, ['QId', 'RespondUserId'], 'A_numb')
    user_cs = _count_per_responder(comm, ['QId', 'PostUserId', 'RespondUserId'], 'C_numb')
    user_acc = _count_per_responder(acc, ['QId', 'PostUserId', 'RespondUserId'], 'ACC_numb')

    # outer joins keep users that appear in only some of the tables
    users = user_qs
    for counts in (user_as, user_cs, user_acc):
        users = pd.merge(users, counts, on='UserId', how='outer')

    users['total'] = sum(users[label].fillna(0) for label in ('Q_numb', 'A_numb', 'C_numb', 'ACC_numb'))

    return users

In [13]:
def slice_time_df(data, column_name, t, twin = 30):
    """Return a copy of the rows of `data` whose `column_name` value lies in
    the half-open window [t, t+twin), without rows that contain any NaN."""
    values = data[column_name]
    in_window = values.ge(t) & values.lt(t + twin)
    return data.loc[in_window].dropna().copy()

In [14]:
def active_inactive_user_inetraction(qaT,commT,accT,q):
    """Split the interaction network of one time window into popular and
    casual users and measure how densely each group is connected.

    Popular users are those whose total activity (as counted by
    gather_data_per_user on all three tables) is strictly above the q-quantile
    of all users' totals; every other node of the network is casual. The
    network itself is built from question-answer and comment interactions
    only, with self interactions removed.

    Inputs:
    - qaT, commT, accT: question-answer, comment and accepted-answer
      interactions of the window, with columns 'PostUserId' and 'RespondUserId'
    - q: quantile in [0, 1] that separates popular from casual users

    Returns the list
    [popular_nodes, popular_edges, popular_lpn, casual_nodes, casual_edges, casual_lpn, mix_lpn]
    where *_lpn are links per node within the group and mix_lpn is the number
    of popular-casual links divided by the number of all nodes. A ratio whose
    node count is zero is returned as NaN.
    """
    def _links_per_node(edges, nodes):
        # empty groups (e.g. a window without interactions) have no defined density
        return edges/nodes if nodes else float('nan')

    # extract per user activity from interaction data
    users = gather_data_per_user(qaT,commT,accT)

    # users above the q-quantile of total activity
    topus = list(users[users['total']>users['total'].quantile(q)]['UserId'])

    # join qna & comm interactions and omit self interactions
    net_selection = pd.concat([qaT[['PostUserId','RespondUserId']].dropna(),commT[['PostUserId','RespondUserId']].dropna()])
    net_selection = net_selection[net_selection['PostUserId']!=net_selection['RespondUserId']]

    # network construction
    if net_selection.empty:
        G_full = nx.Graph()
    else:
        G_full = nx.from_pandas_edgelist(net_selection, source = 'RespondUserId', target = 'PostUserId')

    # subgraph() ignores ids that are not nodes of G_full
    G_popular = G_full.subgraph(topus)
    popular_nodes = len(G_popular.nodes())
    popular_edges = len(G_popular.edges())
    popular_lpn = _links_per_node(popular_edges, popular_nodes)

    G_casual = G_full.subgraph(set(G_full.nodes())-set(topus))
    casual_nodes = len(G_casual.nodes())
    casual_edges = len(G_casual.edges())
    casual_lpn = _links_per_node(casual_edges, casual_nodes)

    # links with one popular and one casual endpoint
    mix_edges = len(G_full.edges()) - casual_edges - popular_edges
    mix_lpn = _links_per_node(mix_edges, len(G_full.nodes()))

    return [popular_nodes,popular_edges,popular_lpn,casual_nodes,casual_edges,casual_lpn,mix_lpn]

In [15]:
def popular_casual_users(community,save_path,q=0.8,twin=30):
    """
    Popular/casual user statistics for every sliding window of a community.

    Inputs:
    - community: name of SE site, used to locate the three interaction files
    - save_path: directory for the output file; a falsy value ('') returns the
      dataframe instead of saving it
    - q: quantile passed to active_inactive_user_inetraction, default 0.8
    - twin: window width in days, default 30
    Output:
    - one row per window start t0 = 0 .. 180-twin-1 with columns t0,
      popular_nodes, popular_edges, popular_lpn, casual_nodes, casual_edges,
      casual_lpn, mix_lpn; saved as '<community>_popular_casual_users.csv'
      when save_path is given (None is returned in that case)
    """
    data_address = data_path+'interactions/'
    qa = pd.read_csv(data_address+'%s/%s_interactions_questions_answers.csv'%(community,community))
    comm = pd.read_csv(data_address+'%s/%s_interactions_comments.csv'%(community,community))
    acc = pd.read_csv(data_address + '%s/%s_interactions_acc_answers.csv'%(community,community))

    columns = ['t0','popular_nodes','popular_edges','popular_lpn','casual_nodes','casual_edges','casual_lpn','mix_lpn']

    # window start followed by the seven statistics of that window
    temp_data = [
        [t] + active_inactive_user_inetraction(
            slice_time_df(qa, 'days', t, twin),
            slice_time_df(comm, 'days', t, twin),
            slice_time_df(acc, 'days', t, twin),
            q,
        )
        for t in range(180-twin)
    ]
    data_df = pd.DataFrame(temp_data, columns=columns)

    if not save_path:
        return(data_df)

    filename = '%s_popular_casual_users.csv'%(community)
    os.makedirs(save_path, exist_ok = True)
    data_df.to_csv(save_path+filename,index=False)

In [16]:
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    popular_casual_users(comm, data_path + 'processed data/propopular_users/')

## RMSE between number of users in sliding window of 30 days and number of users with reputation higher than 1 

In [17]:
from data_processing_functions import Nusers_sw, prepare_data
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os

def get_dataframe(community):
    """RMSE between two activity measures for a grid of beta values.

    For every beta in 0.940, 0.941, ..., 0.970 the file
    '<data_path>reputations/<community>_first_180_days_eng_reputation_beta<beta>.csv'
    is read and the per-column number of entries above 1 is compared, from
    position `window` onwards, with the second element of the result of
    Nusers_sw (the number of users in a 30-day sliding window, per the
    notebook heading).

    Returns a DataFrame with columns "Beta" and "RMSE", one row per beta.
    """
    window = 30
    # k/1000 is the same float as the three-decimal literal, so '%s' renders
    # e.g. '0.94' and '0.941' in the file names; each beta appears once
    betas = [k / 1000 for k in range(940, 971)]

    file_path = data_path+"reputations/"

    # number of users in the sliding window; independent of beta, so computed once
    interactions_path = data_path+"interactions/%s/"%community
    U = Nusers_sw(prepare_data( community, 0, 180, 'eng', interactions_path), 180, window) # data, end time, window
    y_predicted = np.array(U[1])

    error = []
    for b in betas:

        # get number of users with reputation higher than 1
        file_name = file_path+"%s_first_180_days_eng_reputation_beta%s.csv"%(community, b)
        # NOTE(review): index_col=1 here, while the other reputation files in
        # this notebook are read with index_col=0 - confirm the beta files
        # really have a different layout
        dr = pd.read_csv(file_name, index_col=1)

        y = (dr>1).sum()
        y_actual = np.array(y[window:])

        RMSE = math.sqrt(mean_squared_error(y_actual, y_predicted))

        error.append((b, RMSE))

    dataframe = pd.DataFrame(np.array(error), columns=["Beta", "RMSE"])

    return dataframe
    

# One RMSE-vs-beta table per community.
for community in [
    'physics', 'astronomy', 'economics', 'literature',
    '052012-theoretical-physics', '052012astronomy', '052012economics', '052012-literature',
]:
    dataframe = get_dataframe(community)

    path = data_path + "processed data/RMSE/"
    os.makedirs(path, exist_ok=True)
    dataframe.to_csv(path + "rmse_%s.csv"%community)

## Get the time interval between each user's first and last activity

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
from data_processing_functions import get_first_last_activity
import os

def cal_user_time( name, ltlim, htlim):
    """Write the result of get_first_last_activity for one community to
    '<data_path>processed data/users_activity/<name>_<ltlim>_<htlim>.csv'.

    name is the community; ltlim and htlim are forwarded unchanged to
    get_first_last_activity (presumably the lower and upper day limits -
    confirm in data_processing_functions).
    """
    out_dir = data_path+"processed data/users_activity/"
    activity = get_first_last_activity(name, ltlim, htlim, data_path+'interactions/%s/'%name)

    os.makedirs(out_dir, exist_ok=True)
    activity.to_csv(out_dir+'%s_%s_%s.csv'%(name, ltlim, htlim), index=None)
    
communities = [
    'astronomy', 'economics', 'literature', 'physics',
    '052012astronomy', '052012economics', '052012-literature', '052012-theoretical-physics',
]
for comm in communities:
    # first 180 days of every community
    cal_user_time(comm, 0, 180)

## Minimum description length, number of nodes in core, normalized mutual information, adjusted rand index, F1 measure and Jaccard index, among 50 samples for 30-days sub-networks

In [19]:
import pandas as pd
import numpy as np
import h5py

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

def get_labels(ltlim, htlim, data):
    """Collect the 50 samples stored in `data`.

    `data` is any mapping (e.g. an open h5py file) with entries 'labels_<j>'
    and 'mdl_<j>' for j = 0..49. ltlim and htlim are accepted but not used.

    Returns two dicts keyed by j: the labels and the mdl values, each
    converted to a numpy array.
    """
    sample_ids = range(0, 50)
    total_core = {j: np.array(data['labels_%s'%j]) for j in sample_ids}
    total_mdl = {j: np.array(data['mdl_%s'%j]) for j in sample_ids}
    return total_core, total_mdl

def select_core(ls):
    """Return the first element of every (node, label) row whose label equals 0."""
    return [row[0] for row in ls if row[1]==0.]

import os

name = '052012astronomy'
twin=30

# window start -> list of pairwise agreement scores between the samples
adj_time = {}
nmi_time = {}
f1_time = {}
jacc_time = {}

path = data_path+'core periphery/core-periphery_ens/%s/'%name

for i in range(180-twin):
    ltlim = i
    htlim = i + twin
    # get_labels copies everything into numpy arrays, so the file can be
    # closed as soon as it returns
    with h5py.File(path+'%s_%s-%sdays.hdf5'%(name, ltlim, htlim), 'r') as data:
        labels_sample, mdl_sample = get_labels(ltlim,htlim, data)

    # integer label vector of every sample, built once per window
    label_vectors = [[int(row[1]) for row in labels_sample[s]] for s in range(len(labels_sample))]
    n_samples = len(label_vectors)

    adj = []
    nmi = []
    f1 = []
    jacc = []
    for itt1 in range(n_samples):
        # every unordered pair exactly once, the last sample included
        for itt2 in range(itt1+1, n_samples):
            sample1 = label_vectors[itt1]
            sample2 = label_vectors[itt2]

            adj.append(adjusted_rand_score(sample1, sample2))
            nmi.append(normalized_mutual_info_score(sample1, sample2))
            f1.append(f1_score(sample1, sample2, average='macro'))
            jacc.append(jaccard_score(sample1, sample2, average='macro'))

    adj_time[i] = adj
    nmi_time[i] = nmi
    f1_time[i] = f1
    jacc_time[i] = jacc


# one column per window start, one row per sample pair
res_path = data_path+"processed data/core-periphery_ens_statistics/"
os.makedirs(res_path, exist_ok=True)
pd.DataFrame(adj_time).to_csv(res_path+'%s_ari.csv'%( name))
pd.DataFrame(nmi_time).to_csv(res_path+'%s_nmi.csv'%( name))
pd.DataFrame(f1_time).to_csv(res_path+'%s_f1.csv'%( name))
pd.DataFrame(jacc_time).to_csv(res_path+'%s_jacc.csv'%( name))



# window start -> per-sample description length / number of core nodes
mdl_time = {}
node_time = {}

path = data_path+'core periphery/core-periphery_ens/%s/'%name

for i in range(180-twin):
    ltlim = i
    htlim = i + twin
    # get_labels copies everything into numpy arrays, so the file can be
    # closed as soon as it returns
    with h5py.File(path+'%s_%s-%sdays.hdf5'%(name, ltlim, htlim), 'r') as data:
        labels_sample, mdl_sample = get_labels(ltlim,htlim, data)

    mdl = [float(val) for val in mdl_sample.values()]
    mdl_time[i] = mdl

    # size of the core (label 0) in every sample
    nodes = [len(select_core(lab)) for lab in labels_sample.values()]
    node_time[i]= nodes

pd.DataFrame(node_time).to_csv(res_path+'%s_nodes_in_core.csv'%(name))
pd.DataFrame(mdl_time).to_csv(res_path+'%s_mdl.csv'%(name))

## Jaccard index between core users of two sub-networks as a function of the time lag between them

In [20]:
import seaborn as sns
import h5py
import numpy as np
def import_core(name, steps):
    """Load the core/periphery label arrays of consecutive 30-day windows.

    Opens '<data_path>core periphery/<name>_window30_core_per.hdf5' and, for
    r = 0 .. steps-1, looks at the window from day r to day r+30. The
    'labels_<r>-<r+30>days' dataset is loaded only when the file also has an
    'ns_<r>-<r+30>days' entry; other windows are skipped, so the returned list
    can be shorter than `steps`.

    Returns the list of label arrays in window order.
    """
    fname = data_path+'core periphery/%s_window30_core_per.hdf5'%(name)
    first_day, last_day = 0, 30
    with h5py.File(fname, 'r') as W:
        windows = ((first_day + r, last_day + r) for r in range(steps))
        return [
            np.array(W['labels_%s-%sdays'%(lo, hi)])
            for lo, hi in windows
            if 'ns_%s-%sdays'%(lo, hi) in W.keys()
        ]

#define Jaccard Similarity function
def jaccard(list1, list2):
    """Jaccard similarity |A & B| / |A | B| of two collections.

    Both arguments are treated as sets, so repeated elements count once.
    Returns a float in [0, 1], or NaN when both collections are empty (the
    similarity is undefined in that case).
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        return float('nan')
    return len(set1 & set2) / union

def calculate_jaccard(labels):
    """Jaccard index between the cores of every ordered pair of windows.

    `labels` holds one sequence of (node, label) rows per window; the core of
    a window is made of the nodes whose label equals 0. At least 150 windows
    are expected, and only the first 150 (180 minus the 30-day window) are
    compared.

    Returns a list of tuples (i, j, jaccard(core_i, core_j)) for all i, j in
    0..149, ordered by i and then j.
    """
    twin = 30
    n_windows = 180 - twin

    core_list = [[line[0] for line in window if line[1]==0] for window in labels]

    return [
        (i, j, jaccard(core_list[i], core_list[j]))
        for i in range(0, n_windows)
        for j in range(0, n_windows)
    ]



def dataframe(name):
    """Core similarity against time lag for one community.

    Loads the window labels with import_core, computes the Jaccard index for
    every ordered pair of windows and returns a DataFrame with columns
    'time-delta' (absolute difference of the two window indices), 'jaccard'
    and 'name' (the community, repeated on every row).
    """
    twin = 30
    pairs = calculate_jaccard(import_core(name, 180-twin))
    lag_j = [(np.abs(t2 - t1), score) for t1, t2, score in pairs]
    df = pd.DataFrame(lag_j, columns=['time-delta', 'jaccard'])
    df['name'] = name
    return df

# Sites are listed in pairs: un-prefixed name first, digit-prefixed name second.
for community in [
    "astronomy", "052012astronomy",
    "economics", "052012economics",
    "physics", "052012-theoretical-physics",
    "literature", "052012-literature",
]:
    df = dataframe(community)

    path = data_path + "processed data/jaccard/"
    os.makedirs(path, exist_ok=True)
    df.to_csv(path + "%s_jaccard_time_delta.csv"%community)

## Jaccard index between core users in sub-networks at time points t1 and t2

In [21]:
for community in ['052012astronomy',  '052012economics',  '052012-literature',  '052012-theoretical-physics',
             'astronomy', 'economics', 'literature',   'physics',
            ]:
    twin=30
    labels = import_core(community, 180-twin)

    # (t1, t2, J) for every ordered pair of windows; calculate_jaccard does the
    # core extraction and the all-pairs loop
    Z = np.array(calculate_jaccard(labels))

    df = pd.DataFrame(Z, columns = ['t1','t2', 'J'])
    path = data_path+"processed data/jaccard/"
    os.makedirs(path, exist_ok=True)
    df.to_csv(path+"%s_jaccard.csv"%community)

## network and core-periphery properties for different sliding windows

In [22]:
import pandas as pd
import networkx as nx
import h5py

def calculate_fig14_csv(community, twin=30):
    """
    Network and core-periphery properties for every sliding window
    [t, t+twin) of the first 180 days of a community.

    Inputs:
    - community: name of SE site, used to locate the reputation file, the
      interaction files and '<community>_window<twin>_core_per.hdf5'
    - twin: window width in days, default 30

    Returns a dict of equally long lists (one entry per window start
    t = 0 .. 180-twin-1) with keys
    - First_day        - t
    - N_nodes          - number of nodes of the interaction network
    - L/N              - links per node of the interaction network (0 if empty)
    - clustering       - average clustering coefficient (0 if empty)
    - L/N_core         - links per node within the core (0 if the core is empty)
    - N_core           - number of core nodes (label 0)
    - L/N_core_per     - core-periphery links divided by the number of labelled nodes
    - mean_core_dr     - mean dynamical reputation of core users on the last day of the window
    - mean_core/per_dr - mean core reputation divided by mean periphery reputation
    """
    data = {"First_day":[],
            "N_nodes":[],
            "L/N":[],
            "clustering":[],

            "L/N_core": [],
            "N_core": [],
            "L/N_core_per":[],
            "mean_core_dr":[],
            "mean_core/per_dr":[],
    }

    reputation_path = data_path+'reputations/'
    reputation_file_name = '%s_first_180_days_eng_reputation.csv'%(community)
    rep_data = pd.read_csv(reputation_path+reputation_file_name,index_col=0)

    # import interactions via questions, answers and comments for network:
    interactions_path = data_path+'interactions/'
    qa = pd.read_csv(interactions_path + '%s/%s_interactions_questions_answers.csv'%(community,community))
    comm = pd.read_csv(interactions_path + '%s/%s_interactions_comments.csv'%(community,community))
    acc = pd.read_csv(interactions_path + '%s/%s_interactions_acc_answers.csv'%(community,community))

    edge_columns = ['PostUserId','RespondUserId']

    # contains data about nodes within core and periphery as well as number of links within and between each group
    coreper_path = data_path+'core periphery/'
    # the context manager closes the HDF5 file even when a window fails
    with h5py.File(coreper_path+'%s_window%s_core_per.hdf5'%(community, twin), 'r') as f:

        for t in range(180-twin):

            # interaction slices within [t,t+twin); rows with NaN are dropped
            slices = [frame[(frame['days']>=t)&(frame['days']<t+twin)].dropna()[edge_columns]
                      for frame in (qa, comm, acc)]

            # full network of interactions, without self links
            fullnet = pd.concat(slices)
            fullnet = fullnet[fullnet['PostUserId']!=fullnet['RespondUserId']]
            G = nx.from_pandas_edgelist(fullnet, source='RespondUserId', target='PostUserId')

            N = G.number_of_nodes()
            L = G.number_of_edges()
            if N!=0:
                dens = L/N
                c = nx.average_clustering(G)
            else:
                # an empty window has no defined density or clustering; recorded as 0
                c = 0
                dens = 0

            window = '%s-%sdays'%(t,t+twin)

            # core and periphery node data
            corepernodes = pd.DataFrame(f['labels_'+window],columns=['nodes','cat'])
            corelist = list(corepernodes[corepernodes['cat']==0]['nodes']) # nodes within core
            perlist = list(corepernodes[corepernodes['cat']==1]['nodes']) # nodes within periphery

            Ncore = len(corelist) # size of core
            Nper = len(perlist) # size of periphery

            # link counts; indexed [0,0] core-core and [0,1] core-periphery
            ms = f['ms_'+window]

            # links versus nodes ratio within core
            ln_ratio_core = ms[0,0]/Ncore if Ncore!=0 else 0

            # links versus nodes ratio between core and periphery
            ln_ratio_core_periphery = ms[0,1]/(Ncore+Nper)

            # dynamical reputation of every user on the last day of the window
            rep = rep_data[str(t+twin-1)]

            # mean dyn reputation within core and within periphery
            mean_dr_core = rep[rep.index.isin(corelist)].mean()
            mean_dr_periphery = rep[rep.index.isin(perlist)].mean()

            # ratio between the mean (not total) reputation of core and periphery
            dr_core_periphery = mean_dr_core/mean_dr_periphery

            data["First_day"].append(t)
            data["N_nodes"].append(N)
            data["L/N"].append(dens)
            data["clustering"].append(c)
            data["N_core"].append(Ncore)
            data["L/N_core"].append(ln_ratio_core)
            data["L/N_core_per"].append(ln_ratio_core_periphery)
            data["mean_core_dr"].append(mean_dr_core)
            data["mean_core/per_dr"].append(dr_core_periphery)

    return data

import os
# One CSV per (window width, community) combination.
for window in [10, 30, 60]:
    for community in ["astronomy", "052012astronomy"]:
        data = calculate_fig14_csv(community, twin=window)
        df = pd.DataFrame(data)

        path = data_path + "processed data/network_properties_window/"
        os.makedirs(path, exist_ok=True)

        file_name = path + "%s_network_properties_window%s.csv"%(community, window)
        df.to_csv(file_name, index=None)