In [24]:
import functools
import os
import pickle
import time
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

In [25]:
def load_dfs(file_names, columns, threshold=None):
    """
    Args:
        file_names: paths of the whitespace-separated files to read (the notebook passes the a2q, c2q and
                    c2a files, in that order).
        columns: one list of column names per file, in the same order as file_names. Every list must
                 contain a "time_u" column holding Unix timestamps in seconds.
        threshold: datetime; only rows strictly more recent than it are kept.
                   Defaults to 2014-01-01 00:00:00.

    Summary:
        Reads every file into a dataframe, adds a human readable "time_h" datetime column, prints the most
        recent date of each dataset followed by the threshold, drops the rows that are not more recent than
        the threshold and adds a "weight" column with "time_u" min-max normalised to [0, 1] inside each
        filtered dataframe.

    Returns:
        A tuple with one filtered dataframe per input file, in input order.
    """

    if threshold is None:
        threshold = datetime(year=2014, month=1, day=1, hour=0, minute=0, second=0)

    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    dfs_raw = [pd.read_csv(file_name, sep=r"\s+", names=column_names)
               for file_name, column_names in zip(file_names, columns)]

    # add a column for a standard visualization of dates
    for df in dfs_raw:
        df["time_h"] = pd.to_datetime(df["time_u"], unit="s")

    # labels used only for the printed summary; zip stops at the shorter of the two lists
    names = ["a2q", "c2q", "c2a"]
    for name, df in zip(names, dfs_raw):
        most_recent = df["time_h"].max()
        print(f"Most recent date for dataset {name}: {most_recent}")

    print(threshold)

    # .copy() so that adding the weight column below does not write into a view of the raw dataframe
    dfs = [df[df["time_h"] > threshold].copy() for df in dfs_raw]

    for df in dfs:
        min_el = df["time_u"].min()
        span = df["time_u"].max() - min_el
        if span > 0:
            df["weight"] = (df["time_u"] - min_el) / span
        else:
            # empty dataframe or a single distinct timestamp: avoid 0/0 producing NaN weights
            df["weight"] = 0.0

    return tuple(dfs)

In [26]:
# Short label of each interaction dataset; the files and column lists below follow the same order.
names = ["a2q", "c2q", "c2a"]

# One whitespace-separated file per dataset.
file_names = [
    "files/sx-stackoverflow-a2q.txt",
    "files/sx-stackoverflow-c2q.txt",
    "files/sx-stackoverflow-c2a.txt",
]

# Column names given to each file: the two users involved, then the timestamp column "time_u".
columns = [
    ["user_answering", "user_questioning", "time_u"],
    ["user_commenting", "user_questioning", "time_u"],
    ["user_commenting", "user_answering", "time_u"],
]

In [27]:
# Read the three files and keep only the rows more recent than the threshold printed by load_dfs;
# each returned dataframe also carries the "time_h" and "weight" columns added by load_dfs.
df_a2q, df_c2q, df_c2a = load_dfs(file_names, columns)

Most recent date for dataset a2q: 2016-03-06 12:18:13
Most recent date for dataset c2q: 2016-03-06 14:10:28
Most recent date for dataset c2a: 2016-03-06 14:10:20
2014-01-01 00:00:00


In [28]:
# Preview the first rows of the filtered a2q dataframe returned by load_dfs.
df_a2q.head()

Unnamed: 0,user_answering,user_questioning,time_u,time_h,weight
10861428,95190,960750,1388534405,2014-01-01 00:00:05,0.0
10861429,607314,2635650,1388534412,2014-01-01 00:00:12,1.018444e-07
10861430,301857,1549201,1388534416,2014-01-01 00:00:16,1.600412e-07
10861431,1419954,1897577,1388534421,2014-01-01 00:00:21,2.327872e-07
10861432,254252,3065375,1388534423,2014-01-01 00:00:23,2.618857e-07


In [29]:
# Preview the first rows of the filtered c2q dataframe returned by load_dfs.
df_c2q.head()

Unnamed: 0,user_commenting,user_questioning,time_u,time_h,weight
9576868,976391,3061211,1388534408,2014-01-01 00:00:08,0.0
9576869,445131,2489834,1388534424,2014-01-01 00:00:24,2.327644e-07
9576870,1832636,2721870,1388534427,2014-01-01 00:00:27,2.764078e-07
9576871,822711,3136232,1388534428,2014-01-01 00:00:28,2.909556e-07
9576872,22656,1693074,1388534434,2014-01-01 00:00:34,3.782422e-07


In [30]:
# Preview the first rows of the filtered c2a dataframe returned by load_dfs.
df_c2a.head()

Unnamed: 0,user_commenting,user_answering,time_u,time_h,weight
14958295,22656,22656,1388534408,2014-01-01 00:00:08,0.0
14958296,2224701,23385,1388534414,2014-01-01 00:00:14,8.728668e-08
14958297,2732801,2732801,1388534421,2014-01-01 00:00:21,1.891211e-07
14958298,1965449,2325987,1388534422,2014-01-01 00:00:22,2.036689e-07
14958299,26742,26742,1388534423,2014-01-01 00:00:23,2.182167e-07


In [13]:
def time_wrapper(func):
    """
    Decorator that prints how many seconds each call to the decorated function took.

    Args:
        func: the function (or method) to time.

    Summary:
        Put @time_wrapper above any function you want to time. Every successful call prints
        "Time elapsed: <seconds>" and then returns whatever the function returned. Nothing is printed
        when the function raises. The decorated function keeps its original name and docstring.
    """
    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function for help() and debugging
    def wrapped(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is not affected by system clock adjustments
        start = time.perf_counter()
        r = func(*args, **kwargs)
        print(f"Time elapsed: {time.perf_counter()-start}")
        return r
    return wrapped


class Graph:
    """
    Graph of user interactions stored as nested dictionaries.

    self.graph[node][neighbor] is the list of edges going from node to neighbor; each edge is a tuple
    (type_of_interaction, weight, date_human_format). Edges are stored only under the node found in the
    first column of the source dataframes, so a node that only ever appears in the second column is not
    a key of self.graph.
    """

    def __init__(self):
        self.graph = {}


    @time_wrapper
    def create_graph(self, sources, names):
        """
        Args:
            sources: dataframes from which to create the graph. Columns are read by position: column 0 is
                     the node, column 1 the neighbor, column 3 the human readable date and column 4 the
                     weight (the layout of the dataframes returned by load_dfs).
            names: a list of string containing the names of the graph used in order to label the edges
                   accordingly, one per dataframe in sources.

        Summary:
            Each node in the graph acts as a key in the dictionary self.graph. Each entry in the dictionary is
            itself another dictionary, indexed by that node's neighbors and pointing to the list of edges
            between the two, one tuple (type_of_interaction, weight, date_human_format) per dataframe row.
            So to make things clearer:

            self.graph -> Our entire graph, indexed by node
            self.graph[node "A"] -> dictionary indexed by the neighbors of "A"
            self.graph[node "A"][node "B"] -> list with the details (type of interaction, edge weight and
                                              datetime) of every interaction from user "A" to user "B".

            Any graph previously held by the object is discarded.
        """

        self.graph = {}

        for i_source, source in enumerate(sources):
            type_of_interaction = names[i_source]

            # A single pass over the rows instead of filtering the whole dataframe once per unique node:
            # nodes, neighbors and edges are inserted in the same order, without the quadratic rescans.
            rows = zip(source.iloc[:, 0].values,
                       source.iloc[:, 1].values,
                       source.iloc[:, 3].values,
                       source.iloc[:, 4].values)

            for node, neighbor, time_h, weight in tqdm(rows, total=len(source)):
                edges = self.graph.setdefault(node, {}).setdefault(neighbor, [])
                edges.append((type_of_interaction, weight, time_h))


    @time_wrapper
    def save_graph(self, path):
        """
        Args:
            path: name of file where we want the file stored
        Summary:
            Saves graph as a binary (pickle) file.
        """

        with open(path, "wb") as file:
            pickle.dump(self.graph, file)


    @time_wrapper
    def load_graph(self, path):
        """
        Args:
            path: name of file to load the graph from.
        Summary:
            Load graph from binary (pickle) file, replacing the graph currently held by the object.
        """

        # NOTE: unpickling can execute arbitrary code, only load files that you created yourself
        # (e.g. with save_graph), never a pickle coming from an untrusted source.
        with open(path, "rb") as file:
            self.graph = pickle.load(file)


    @time_wrapper
    def get_neighbors(self, node, interaction="all"):
        """
        Args:
            node: the node we want the neighbors of.
            interaction: the label of the edges we're interested in, default=all, meaning we will obtain all
                         neighbors of that node, regardless of the type of interactions.
        Summary:
            Obtains all the neighbors of a specific node. When interaction is not "all", a neighbor is
            returned if at least one of the edges towards it has that label.
        Returns:
            A list of neighbors; empty if the node has no outgoing edges or is not in the graph.
        """

        # nodes that only appear as a neighbor of someone else are not keys of self.graph
        edges_by_neighbor = self.graph.get(node, {})

        if interaction == "all":
            return list(edges_by_neighbor.keys())

        # each neighbor maps to a list of (type_of_interaction, weight, date) tuples
        return [neighbor
                for neighbor, edges in edges_by_neighbor.items()
                if any(edge[0] == interaction for edge in edges)]
        

In [15]:
# Build the merged graph, reusing the pickle stored in the "files" folder when it is there (if you keep the
# pickle somewhere else, change the condition and the paths below accordingly). This will take some time:
# on the author's pc loading the graph takes approximately 7 minutes. The timing decorator prints how long
# each step took, so you can plan accordingly.

g = Graph()

if "merged_graph.p" not in os.listdir("files"):
    # no pickle yet: create the graph from the three dataframes and store it for the next run
    dfs = [df_a2q, df_c2q, df_c2a]
    g.create_graph(dfs, names=names)
    g.save_graph("files/merged_graph.p")
else:
    g.load_graph("files/merged_graph.p")

Time elapsed: 433.87333822250366
