# Network Analysis
### The next section contains the network analysis of how programmers interact with each other when they have a problem with a given programming language.
#### The following steps are taken in this part of the Stack Overflow investigation:

1. Create individual networks for each programming language with authors as nodes and interactions (answering questions or commenting on answers) as links.

2. Perform basic network analysis (counts, degree distribution etc.) on language-specific networks.

3. Create one big StackOverflow-network including all 16 programming languages with the same types of nodes and links.

4. Perform basic network analysis on the StackOverflow-network.

5. Visualize the StackOverflow-network.

6. Perform advanced network analysis on the StackOverflow-network by investigating the different subnetworks, communities, modularity etc.

7. Use the Louvain algorithm to create a network from the data and compare to the StackOverflow-network.
 

In [None]:
### Imports
import itertools
import pickle
import re
from collections import Counter
from glob import glob as glob  # glob
from operator import itemgetter

import matplotlib as mpl
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import netwulf as nf
import networkx as nx
import numpy as np
import pandas as pd
from pelutils import Table, thousand_seps
from scipy import stats
from tqdm import tqdm

def setup_mpl():
    """Apply the notebook-wide matplotlib defaults (font, figure size, resolution)."""
    defaults = {
        'font.family': "Liberation Serif",
        'font.size': 11,
        'figure.figsize': (7,2.5),
        'figure.dpi': 200,
        # 'lines.linewidth': 1,  # left disabled, as in the original setup
    }
    for option, value in defaults.items():
        mpl.rcParams[option] = value


setup_mpl()


In [None]:
# Programming languages covered by the analysis, each mapped to a numeric score.
# The keys are the values matched against the 'language' column of the dataset.
# NOTE(review): the name suggests the scores are "loved" percentages from a
# developer survey — confirm the source and year.
loved_languages = {
    "rust":        86.1,
    "typescript":  67.1,
    "python":      66.7,
    "kotlin":      62.9,
    "go":          62.3,
    "dart":        62.1,
    "c#":          59.7,
    "javascript":  58.3,
    "haskell":     51.7,
    "java":        44.1,
    "c++":         43.4,
    "ruby":        42.9,
    "c":           33.1,
    "perl":        28.6,
    "objective-c": 23.4,
    "vba":         19.6,
}
# Load the dataset from a pickle. The path is absolute and machine-specific,
# so it has to be adapted when the notebook runs anywhere else.
# Later cells read the 'language', 'type', 'question_id', 'answer_id' and
# 'owner/user_id' columns of this frame.
data = pd.read_pickle('/home/augustsemrau/drive/6semester/CSS_02467/css-project/data_timesorted.pkl')

In [None]:
so = data  # [:50000]  <- slice here to try the notebook on a subset

# Data summary: one row per language with its number of questions, answers and
# comments, followed by a totals row over the whole dataset.
post_types = ("q", "a", "c")
# Build each boolean mask once and count with the vectorised Series.sum();
# the builtin sum() would iterate every row in Python for every single count.
type_masks = {post_type: so["type"] == post_type for post_type in post_types}

t = Table()
t.add_row(["Language", "Questions", "Answers", "Comments", "Total"])
for lang in tqdm(loved_languages):
    is_lang = so["language"] == lang
    t.add_row([
        lang.capitalize(),
        # The loop variable is deliberately not called `t`, which is the Table.
        *[thousand_seps(int((is_lang & type_masks[post_type]).sum())) for post_type in post_types],
        thousand_seps(int(is_lang.sum())),
    ], [1, 0, 0, 0, 0])
t.add_row([
    "",
    *[thousand_seps(int(type_masks[post_type].sum())) for post_type in post_types],
    thousand_seps(len(so)),
], [1, 0, 0, 0, 0])
print("Number of items in dataset")
print(t)

1. Create individual networks for each programming language with authors as nodes and interactions (answering questions or commenting on answers) as links.

In [None]:
### Function for creating programming-language-specific networks
def lang_networks(data, prog_language):
    """Build the directed, weighted author-interaction graph for one language.

    Nodes are author ids. An edge ``u -> v`` with weight ``w`` means that
    author ``u`` wrote ``w`` posts in response to posts by ``v``: answers to
    questions asked by ``v`` and comments on answers written by ``v``.

    Args:
        data: DataFrame with the columns 'language', 'type' ('q' for
            questions, 'a' for answers, every other value is handled as a
            comment), 'question_id', 'answer_id' and 'owner/user_id'.
        prog_language: Value of the 'language' column to build the graph for.

    Returns:
        A ``networkx.DiGraph`` whose edges carry a 'weight' attribute.
    """
    # Work on a copy: the helper column added below must not be written to a
    # slice of the caller's frame.
    lang_data = data.loc[data['language'] == prog_language].copy()
    print(lang_data.shape)
    lang_questions = lang_data.loc[lang_data['type'] == 'q']
    lang_answers = lang_data.loc[lang_data['type'] == 'a']

    ## Get authors of questions and answers
    # Ids are converted to str on the key side AND on the lookup side, so the
    # match does not depend on the dtype the id columns happen to have.
    question_authors = {
        str(post_id): author
        for post_id, author in zip(lang_questions['question_id'], lang_questions['owner/user_id'])
    }
    answer_authors = {
        str(post_id): author
        for post_id, author in zip(lang_answers['answer_id'], lang_answers['owner/user_id'])
    }

    ## Find the author every post responds to
    # The values are collected in a list and assigned as a column afterwards;
    # writing to the rows yielded by DataFrame.iterrows() only changes copies.
    parent_authors = []
    posts = zip(lang_data['type'], lang_data['question_id'], lang_data['answer_id'])
    for post_type, question_id, answer_id in tqdm(posts, total=len(lang_data)):
        if post_type == 'q':
            # Questions start a thread and respond to nobody.
            parent_authors.append(None)
        elif post_type == 'a':
            parent_authors.append(question_authors.get(str(question_id)))
        else:
            parent_authors.append(answer_authors.get(str(answer_id)))
    # dtype=object keeps the author ids exactly as stored instead of letting
    # pandas turn a None-containing numeric list into floats.
    lang_data['parent_author'] = pd.Series(parent_authors, index=lang_data.index, dtype=object)

    ## Filter for NANs, there are a lot for comments..
    print(f'{prog_language} data before filtering for NANs', lang_data.shape)
    for column in ('parent_author', 'owner/user_id'):
        # Drop real missing values as well as the literal string 'None'.
        has_author = lang_data[column].notnull() & (lang_data[column] != 'None')
        lang_data = lang_data[has_author]
    print(f'{prog_language} data after filtering for NANs', lang_data.shape)

    ## Create weighted edge list: one row per (author, parent author) pair
    edge_list = lang_data.groupby(['owner/user_id', 'parent_author']).size().to_frame('weight').reset_index()

    ## Create graph; adding the weighted edges also adds their end points as nodes
    lang_graph = nx.DiGraph()
    lang_graph.add_weighted_edges_from(
        zip(edge_list['owner/user_id'], edge_list['parent_author'], edge_list['weight'])
    )

    return lang_graph


In [None]:
### This cell creates the graphs for each programming language and saves a pickle
# Iterate over `loved_languages`, not `graphs`: `graphs` is only created by the
# loading cell further down, so it does not exist yet when the notebook is run
# top to bottom.
for prog_lang in loved_languages:
    print(prog_lang)
    prog_lang_graph = lang_networks(data=data, prog_language=prog_lang)
    ## Save to pickle for later use
    path = '/home/augustsemrau/drive/6semester/CSS_02467/css-project/data/graphs/' + str(prog_lang) + '_graph.pkl'
    # nx.write_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
    # pickle.dump, so the files written here have the same format.
    with open(path, 'wb') as graph_file:
        pickle.dump(prog_lang_graph, graph_file, pickle.HIGHEST_PROTOCOL)

In [None]:
### This cell loads all the pickled graphs
# One entry per language; a value stays None until its graph has been loaded.
graphs = {prog_lang: None for prog_lang in loved_languages}

for prog_lang in graphs:
    print(prog_lang)
    prog_lang_graph_path = '/home/augustsemrau/drive/6semester/CSS_02467/css-project/data/graphs/' + str(prog_lang) + '_graph.pkl'
    # nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
    # pickle.load. NOTE: unpickling can execute arbitrary code, so only load
    # graph files that were produced by this notebook.
    with open(prog_lang_graph_path, 'rb') as graph_file:
        graphs[prog_lang] = pickle.load(graph_file)

2. Perform basic network analysis (counts, degree distribution etc.) on language-specific networks.

In [None]:
## Function for printing basic analysis
def _degree_mode(degrees):
    """Return ``(mode, occurrences)`` of *degrees*; ties resolve to the smallest value."""
    values, counts = np.unique(degrees, return_counts=True)
    most_frequent = counts.argmax()
    return int(values[most_frequent]), int(counts[most_frequent])


def _print_degree_stats(degrees):
    """Print average, median, mode, minimum and maximum of a list of degrees."""
    mode, occurrences = _degree_mode(degrees)
    print("  Average:", np.mean(degrees))
    print("  Median: ", np.median(degrees))
    print("  Mode:    {0} with {1} occurences.".format(mode, occurrences))
    print("  Minimum:", min(degrees))
    print("  Maximum:", max(degrees))


def _log_binned_density(degrees, num_edges=15):
    """Return ``(bin_centres, density)`` over log-spaced bins from 1 to the max degree.

    Returns None when the maximum degree is at most 1: the log-spaced bin edges
    would then be undefined (log10(0)) or all equal, which gives no usable bins.
    Degrees of 0 lie below the first bin edge and are not part of the histogram.
    """
    max_degree = max(degrees, default=0)
    if max_degree <= 1:
        return None
    bin_edges = np.logspace(0, np.log10(max_degree), num_edges)
    density, bin_edges = np.histogram(degrees, bins=bin_edges, density=True)
    bin_centres = (bin_edges[1:] + bin_edges[:-1]) / 2.
    return bin_centres, density


def basic_graph_analysis(graph, language):
    """Print basic statistics of a directed graph and plot its degree distributions.

    Prints the number of nodes and links, the density and summary statistics of
    the in- and out-degrees, then draws both degree distributions with
    logarithmic binning on log-log axes.

    Args:
        graph: Directed graph to analyse (``in_degree()`` and ``out_degree()``
            are used).
        language: Name used in the printed messages and in the plot title.

    Returns:
        None.
    """
    ## Number of nodes, links and density
    num_nodes = graph.number_of_nodes()
    num_links = graph.number_of_edges()
    density = nx.density(graph)

    print(f"The number of nodes in the {language} graph: ", num_nodes)
    print(f"The number of links in the {language} graph: ", num_links)
    print(f"The density of the {language} graph: ", density)

    if num_nodes == 0:
        # Without nodes there are no degrees to summarise or plot.
        print(f"The {language} graph is empty, skipping the degree analysis.")
        return

    ## Average, median, mode, minimum and maximum value of the in and out-degrees
    in_degrees = [degree for _, degree in graph.in_degree()]
    out_degrees = [degree for _, degree in graph.out_degree()]

    print(f"In-degree of {language} users in the graph: ")
    _print_degree_stats(in_degrees)
    print("")

    print(f"Out-degree of {language} users in the graph: ")
    _print_degree_stats(out_degrees)

    ## Plot of the distribution of in-degrees and out-degrees, using a logarithmic binning
    fig, ax = plt.subplots()
    for label, degrees in (('In-Degrees', in_degrees), ('Out-Degrees', out_degrees)):
        # Each curve gets its own histogram, so the in-degree line is no longer
        # drawn with the out-degree densities.
        binned = _log_binned_density(degrees)
        if binned is not None:
            bin_centres, density_per_bin = binned
            ax.plot(bin_centres, density_per_bin, marker='.', label=label)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlabel('Number of degrees')
    ax.set_ylabel('Probability Density')
    ax.set_title(f"Logarithmically binned distribution plot of the {language}-graph")
    ax.grid()
    ax.legend(loc='upper right')

In [None]:
# The per-language graphs live in the `graphs` dict filled by the loading cell;
# no `haskell_graph` variable is defined in this notebook.
basic_graph_analysis(graphs['haskell'], 'haskell')

3. Create one big StackOverflow-network including all 16 programming languages with the same types of nodes and links.

4. Perform basic network analysis on the StackOverflow-network.

5. Visualize the StackOverflow-network.

In [None]:
# Visualize the combined StackOverflow network with netwulf.
# NOTE(review): `so_graph` is not defined in any cell visible in this notebook —
# the combined network from step 3 has to be built before this cell can run.
nf.visualize(so_graph)

6. Perform advanced network analysis on the StackOverflow-network by investigating the different subnetworks, communities, modularity etc.

7. Use the Louvain algorithm to create a network from the data and compare to the StackOverflow-network.
