# DataProfiler + Synthetic Graph Data Example

In [26]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import pprint

from dataprofiler import Data, Profiler

from synthetic_data.graph_synthetic_data import GraphDataGenerator

The next code block creates a dataset that follows known distributions (for testing purposes). The data file is `demo_graph.csv`. There is no need to run this unless you want different data.

In [4]:
'''
Create graph CSV (no need to run)

Writes demo_graph.csv: one header row followed by 200 edge rows with two
random node ids in [0, 500), a normally distributed continuous weight and
a categorical status derived from a lognormal draw.
'''
# newline='' lets the csv module control line endings itself; without it,
# platforms that translate "\n" end up with blank rows between records.
with open("demo_graph.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"')
    # One list element per column. A single comma-joined string would be
    # quoted by the writer and emitted as ONE column.
    writer.writerow(["node_id_dst", "node_id_src", "continuous_weight", "categorical_status"])

    # distributions for edge attributes
    distribution_continuous = st.norm(loc=5, scale=1)
    distribution_categorical = st.lognorm(0.954)

    for _ in range(200):
        writer.writerow([
            np.random.randint(0, 500),
            np.random.randint(0, 500),
            # Draw scalars: a size-1 array would be stringified as "[5.03]".
            float(distribution_continuous.rvs()),
            int(distribution_categorical.rvs() * 10),
        ])

Below is the DataProfiler pipeline. Notice that the output profile includes all the information about the attribute distributions that is needed to synthesize data.

In [None]:
'''
DataProfiler Pipeline
'''
# Read the demo CSV and profile it; `report` is what the synthetic graph
# generator consumes in the next cell.
data = Data("demo_graph.csv")
profile = Profiler(data)
report = profile.report()

# Dump the report compactly, keeping the keys in the order the profiler
# produced them rather than sorting alphabetically.
printer = pprint.PrettyPrinter(compact=True, sort_dicts=False)
printer.pprint(report)

Next, we use the above profile to generate synthetic graph data. The returned graph will have the same number of nodes, approximately the same number of edges (with variance), and statistically similar distributions for graph attributes.

In [29]:
'''
Synthetic Graph Data Pipeline
'''
# Build the generator from the profile report produced by the DataProfiler
# cell above.
generator = GraphDataGenerator(report)
# synthesize() returns the generated graph; later cells query it through
# number_of_edges() / number_of_nodes().
graph = generator.synthesize()

Below are some plots to test the graph structure and the generated data distributions.

In [None]:
# Report the size of the synthesized graph.
edge_total = graph.number_of_edges()
node_total = graph.number_of_nodes()
print("Generated graph # edges:", edge_total)
print("Generated graph # nodes:", node_total)

In [None]:
'''
Plot continuous attribute distributions
'''
# continuous: histogram of generator samples on the left axis, with the
# N(5, 1) density the input data was drawn from overlaid on a twin axis.
reference_dist = st.norm(loc=5, scale=1)
pts = np.linspace(2, 8)

fig, ax1 = plt.subplots()
data = generator.sample_continuous("continuous_weight", 1500)
ax1.hist(list(data), bins=100)

# Second y-axis so the density (small values) is not flattened by the
# histogram's raw counts.
ax2 = ax1.twinx()
ax2.set_ylim(0, 0.6)
ax2.plot(pts, reference_dist.pdf(pts), color='red')

plt.title("Sampled generated data (blue) vs. expected probability distribution (red)")
plt.show()



In [None]:
'''Categorical Data Sampling

Compare the categorical_status values in demo_graph.csv (expected) with
values drawn from the generator (sampled), as two amplitude-normalized
histograms sharing the same bins.
'''
# expected: read the categorical column (4th field) of the input CSV
with open("demo_graph.csv", newline='') as csvfile:
    csvreader = csv.reader(csvfile)
    # Skip the header up front instead of indexing into it and slicing it
    # off afterwards; the header may not have four fields.
    next(csvreader, None)
    # `if row` ignores blank lines, which csv.reader yields as empty lists.
    categorical_hist = [int(row[3]) for row in csvreader if row]
expected_hist, bin_edge = np.histogram(categorical_hist, bins='auto')

# sample
data = [generator.sample_categorical("categorical_status") for _ in range(2000)]

# Reuse the expected histogram's edges so both are comparable bin-for-bin.
sample_hist, _edges = np.histogram(data, bins=bin_edge, density=False)

# Amplitude-normalize to a peak of 1. max(..., 1) leaves an all-zero
# histogram as zeros instead of dividing by zero.
sample_hist = sample_hist / max(np.max(sample_hist), 1)
expected_hist = expected_hist / max(np.max(expected_hist), 1)

# Derive the bar geometry from the actual bin edges: bins='auto' picks the
# bin count from the data, so a hard-coded count would not match the
# histogram length in general.
bin_centers = 0.5 * (bin_edge[:-1] + bin_edge[1:])
bin_widths = np.diff(bin_edge)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
ax1.bar(bin_centers, sample_hist, width=bin_widths, align='center')
ax2.bar(bin_centers, expected_hist, width=bin_widths, align='center', alpha=0.5)
# Titles match what each axis actually plots (ax1: sampled, ax2: expected).
ax1.set_title('amplitude-normalized\nsampled distribution')
ax2.set_title('amplitude-normalized\nexpected distribution')
plt.show()

# Conclusion

We have shown how to use the DataProfiler and the synthetic graph data generator together to create a synthetic graph from synthetic input data. We notice that the generated graph has a similar structure and statistically similar attribute distributions to the input graph data.