In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

### Sklearn

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

### CV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

def train_lr(X_train, X_test, y_train, y_test):
    """Grid-search a logistic regression over C and report its test accuracy.

    A ``LogisticRegression(random_state=42, max_iter=10000)`` is tuned with a
    3-fold ``GridSearchCV`` over ``C`` in ``[1, 10, 100]`` using the training
    split. The accuracy of the fitted search object on the test split is
    printed with two decimals.

    Args:
        X_train: Training features, passed to ``GridSearchCV.fit``.
        X_test: Test features, passed to ``GridSearchCV.predict``.
        y_train: Training labels.
        y_test: Test labels, compared against the predictions.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    param_grid = [{'C': [1, 10, 100]}]
    base_estimator = LogisticRegression(random_state=42, max_iter=10000)

    search = GridSearchCV(base_estimator, param_grid, cv=3)
    search.fit(X_train, y_train)

    # Held-out data is used only for reporting, never for fitting.
    test_predictions = search.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f'The trained model achievs test acc of: {test_accuracy:.2f}')

    return search

### Misc

In [None]:
from collections import Counter
from functools import partial 

def bootstrap_CI(data, nbr_draws, interval, seed=None):
    """Percentile bootstrap confidence interval for the mean of ``data``.

    ``data`` is resampled with replacement ``nbr_draws`` times (each resample
    has the same length as ``data``). The NaN-ignoring mean of every resample
    is recorded, and the central ``interval`` percent of those means is
    returned.

    Args:
        data: Array-like of observations; converted with ``np.asarray``.
        nbr_draws: Number of bootstrap resamples to draw.
        interval: Confidence level in percent, e.g. ``95`` for a 95% CI.
        seed: Optional seed. When ``None`` (default) the global ``np.random``
            state is used, so an earlier ``np.random.seed(...)`` call still
            controls the draws. When given, a private
            ``np.random.RandomState(seed)`` makes the result reproducible
            without touching global state.

    Returns:
        ``[lower, upper]`` bounds of the confidence interval.

    Raises:
        ValueError: If ``data`` is empty or ``interval`` is outside [0, 100].
    """
    if not 0 <= interval <= 100:
        raise ValueError(f"interval must be within [0, 100], got {interval}")

    data = np.asarray(data)
    n_obs = len(data)
    if n_obs == 0:
        # randint(0, 0, 0) would otherwise fail with an opaque "low >= high".
        raise ValueError("bootstrap_CI requires at least one observation")

    # np.random (module) and RandomState both expose the same randint API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    means = np.zeros(nbr_draws)
    for n in range(nbr_draws):
        indices = rng.randint(0, n_obs, n_obs)
        means[n] = np.nanmean(data[indices])

    # Split the excluded probability mass evenly between both tails.
    margin = (100 - interval) / 2
    return [np.nanpercentile(means, margin), np.nanpercentile(means, 100 - margin)]

: 

## NetworkX

In [None]:
import networkx as nx

In [None]:
# creating a graph

# Placeholder: replace with an iterable of (source, target) edge tuples.
edge_list = ...
G = nx.DiGraph()
G.add_edges_from(edge_list)

# Build a graph from an adjacency matrix stored as CSV.
# The path is passed directly so that numpy opens *and closes* the file itself
# (handing it an open() handle would leave that handle unclosed).
g_tags_adj = np.loadtxt('data/g_tags_adj.csv', delimiter=',', skiprows=0)
g_tags = nx.from_numpy_array(g_tags_adj)

In [None]:
# properties

nodes = G.nodes()
edges = G.edges()

node_id = 1
# in/out degree for directed
G.in_degree(node_id)
# for undirected
G.degree(node_id)

# NOTE: for a directed graph this raises unless G is strongly connected.
nx.diameter(G)

# connectivity
# for directed
nx.is_weakly_connected(G)
nx.is_strongly_connected(G)
# for undirected
# nx.is_connected is not implemented for directed graphs, so it is evaluated
# on an undirected view of G (no copy is made with as_view=True).
nx.is_connected(G.to_undirected(as_view=True))

# A NodeView cannot be sliced; materialize it as a list before taking the
# first 10 nodes.
sub_nodes = list(nodes)[:10]
subgraph = G.subgraph(sub_nodes)

# Attach a "category" attribute to the nodes listed in the mapping.
index_to_cat = {1: 'cat1', 2: 'cat2'}
nx.set_node_attributes(G, index_to_cat, "category")


# Alternatives: nx.degree_centrality(G), nx.betweenness_centrality(G)
centralities = nx.eigenvector_centrality(G)

In [None]:
# plotting

# Category -> display colour. Nodes whose "category" attribute is missing or
# not listed here are drawn in `default_color` instead of raising KeyError.
colors = {'Howto & Style': 'pink', 'Gaming': 'green'}
default_color = 'lightgray'

list_nodes = list(G.nodes())
# Reverse the draw order so that the first nodes of G are drawn last (on top).
# Presumably G's nodes are ordered by decreasing betweenness centrality so the
# most central ones stay visible -- TODO confirm against how G was built.
list_nodes.reverse()

node_colors = [
    colors.get(G.nodes[n].get("category"), default_color) for n in list_nodes
]

pos = nx.spring_layout(G)
ec = nx.draw_networkx_edges(G, pos, alpha=0.1)
nc = nx.draw_networkx_nodes(G, pos, nodelist=list_nodes, node_color=node_colors,
                            alpha=0.8, node_shape='.')

# The node colours are categorical (named colours, no scalar data), so a
# colorbar has nothing meaningful to display; use a legend with one proxy
# entry per category instead.
for category, color in colors.items():
    plt.scatter([], [], color=color, marker='.', label=category)
plt.legend(title="category")

plt.axis('off')
plt.show()


def plot_degree_distribution(G, type):
    """Draw a histogram of node degrees of ``G`` on log-log axes.

    Args:
        G: Graph exposing ``nodes()``, ``in_degree(node)`` and
            ``out_degree(node)``.
        type: ``'in_degree'`` selects in-degrees; any other value selects
            out-degrees.

    Returns:
        None. A new 15x7 figure is created on the current pyplot state.
    """
    use_in_degree = type == 'in_degree'

    degrees = []
    for node in G.nodes():
        if use_in_degree:
            degrees.append(G.in_degree(node))
        else:
            degrees.append(G.out_degree(node))

    plt.figure(figsize=(15, 7))
    plt.hist(degrees, bins=50)

    # Labels and title.
    plt.title("Degree Distribution")
    plt.xlabel("Degree")
    plt.ylabel("Frequency")

    # Both axes are logarithmic.
    plt.xscale('log')
    plt.yscale('log')

### Stat tests

In [None]:
from scipy import stats

In [None]:
# Welch's independent t-test (equal_var=False) comparing the means of two
# samples, assuming each sample is approximately normally distributed.
alpha = 0.05

# Placeholder: replace with a DataFrame that has numeric columns 'A' and 'B'.
df = ...

# alternative='less' makes the test one-sided:
#   H0: mean(A) >= mean(B)    H1: mean(A) < mean(B)
stat, p_val = stats.ttest_ind(df['A'], df['B'], equal_var=False, alternative='less')

# The messages describe the hypothesis actually tested (a one-sided comparison
# of means), not a general claim about the distributions being equal.
if p_val < alpha:
    print(f'Based on the independent t-test (p = {p_val:.3g}), the mean of A is '
          f'significantly less than the mean of B')
else:
    print(f'Based on the independent t-test (p = {p_val:.3g}), we cannot reject the '
          f'null hypothesis that the mean of A is not less than the mean of B')

### Plotting

In [None]:
# Placeholder: replace with a DataFrame that has 'high_in_degree' and
# 'finished' columns.
df = ...

# Mean of `finished` for each `high_in_degree` group, with 95% confidence
# interval error bars.
sns.barplot(
    x="high_in_degree",
    y="finished",
    data=df,
    estimator='mean',
    errorbar=("ci", 95),
    capsize=.01,
    linewidth=0.1,
)

### Causality


**Answering C2.2** 
In C1, we are not measuring the direct effect, but the total effect, including the effect mediated through the shortest path length. 

Possible answers: <br>
a) C1 is a very naive analysis: one could come up with a confounder that would reverse the effect. <br>
b) Depending on the strength and the sign of the mediated impact, since we are measuring the total impact, we could see different results compared to the true direct causal effect.

After matching on source page and shortest path, games with a high in-degree target are 14.74% more likely to be finished.
2. **C3.3:** These differences are smaller compared to how they were before matching, meaning that a lot of the difference can be explained with the mediation through source and the shortest path. However, the direct effect of target in-degree is still significant.

In [None]:
# Exact matching
#
# Within every (source, shortest_path_length) stratum, pair as many
# high-in-degree games with low-in-degree games as possible and discard the
# surplus of the larger side.

# Seeded generator so the randomly kept rows (and every downstream estimate)
# are identical across runs.
matching_rng = np.random.RandomState(42)

game_groups = df.groupby(by=['source', 'shortest_path_length'])

matched_groups = []
pairs_matched = 0
for _, group in game_groups:
    # Explicit comparisons so rows with a missing flag land in neither side.
    high_in_degree_group = group[group['high_in_degree'] == True]
    low_in_degree_group = group[group['high_in_degree'] == False]

    match_size = min(len(high_in_degree_group), len(low_in_degree_group))
    if match_size == 0:
        # Nothing to pair in this stratum; skip instead of appending empty frames.
        continue

    pairs_matched += match_size
    matched_groups.append(high_in_degree_group.sample(match_size, random_state=matching_rng))
    matched_groups.append(low_in_degree_group.sample(match_size, random_state=matching_rng))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when no pair could be formed.
games_matched = pd.concat(matched_groups) if matched_groups else df.iloc[0:0]

print(f'Pairs matched: {pairs_matched}, data points discarded: {len(df) - len(games_matched)}')