In [43]:
import pandas as pd
import numpy as np
import json

In [3]:
# Load the raw page-flow export; the relative path is resolved against the
# directory the notebook kernel is running in.
df = pd.read_csv(
    '../../orig-power-soton-page-flow.csv'
)

In [4]:
# Create a new transformed dataset with rows of source and target pages.
#
# The raw frame is read horizontally: every odd-numbered 'Unnamed: N' column
# holds a page, column N + 1 holds the value stored as that page's duration
# and column N + 2 holds the page that follows it.  Each non-null page cell
# becomes one output row carrying the user/session fields of its input row.
column_names = df.columns.values.tolist()

# Parse the column numbers once up front instead of once per input row.
page_columns = []
for column_name in column_names:
    if 'Unnamed' in column_name:
        column_number = int(column_name[9:])
        if column_number % 2 != 0:
            page_columns.append((column_name, column_number))

# Collect plain dicts and build the frame in one go.  DataFrame.set_value was
# removed in pandas 1.0, and growing a frame one cell at a time is slow.
records = []
for _, row in df.iterrows():
    for column_name, column_number in page_columns:
        if pd.isnull(row[column_name]):
            continue  # no page recorded in this slot

        record = {
            'user_number': row['user number'],
            'session_number': row['session number'],
            'session_start_time': row['session start time'],
        }
        prefix = column_name[:9]
        # NOTE(review): 167 / 166 look like guards against reading past the
        # last column of the CSV -- confirm against the real column count.
        if column_number <= 167:
            record['source'] = row[column_name]
            record['source_duration'] = row[prefix + str(column_number + 1)]
        if column_number <= 166:
            record['target'] = row[prefix + str(column_number + 2)]
        records.append(record)

# Passing `columns` fixes the column order; keys a record does not have
# (for example a missing target) come out as NaN.
df_transformed = pd.DataFrame(
    records,
    columns=['user_number', 'session_number', 'session_start_time', 'source', 'source_duration', 'target']
)

In [5]:
# Persist the transformed rows as CSV without the index column; missing cells
# are written out as the text 'None'.
df_transformed.to_csv(
    './data/power-page-flow-transformed-data.csv',
    index=False,
    na_rep='None',
)

In [54]:
# Tally how often each (source, target) page pair occurs; the count is
# exposed in a column named 'value'.
df_traffic_pairs = (
    df_transformed
    .groupby(['source', 'target'])
    .size()
    .reset_index(name='value')
)

In [55]:
# Store traffic pairs as a list with one dict per row of df_traffic_pairs.
# The orient has to be spelled out as 'records': the abbreviation 'record'
# was deprecated and pandas 2.0 rejects it with a ValueError.
traffic_pairs_list = df_traffic_pairs.to_dict(orient='records')

In [72]:
# Number of rows in which each page appears as a source, reported in the
# columns 'name' (the page) and 'visits' (the count).
source_visits = (
    df_transformed
    .groupby('source')
    .size()
    .reset_index(name='visits')
    .rename(columns={'source': 'name'})
)

In [65]:
# Turn the visit counts into a list holding one dict per row of source_visits.
page_list = source_visits.to_dict('records')

In [67]:
# Lookup table from page name to that page's position in page_list.
nodes_dict_map = {
    entry['name']: position
    for position, entry in enumerate(page_list)
}

In [68]:
# Re-express every traffic pair with source and target replaced by their
# positions in page_list; the pair count is carried over unchanged.
traffic_pairs_with_index = [
    {
        'source': nodes_dict_map[link['source']],
        'target': nodes_dict_map[link['target']],
        'value': link['value'],
    }
    for link in traffic_pairs_list
]

In [69]:
# Placeholder flow dict: a 'nodes' list and a 'links' list, each holding a
# single dummy entry that sketches the intended fields.
flow_dict = dict(
    nodes=[dict(name='', visits=0, avg_page_duration=0)],
    links=[dict(source=0, target=1, value=0, user_number=[])],
)

In [70]:
# Swap the placeholder entries for the real page list and indexed links.
flow_dict.update(
    nodes=page_list,
    links=traffic_pairs_with_index,
)

In [71]:
# Write the nodes/links structure as JSON with sorted keys and a two-space indent.
with open('./data/page-flow.json', 'w') as out_file:
    json.dump(flow_dict, out_file, indent=2, sort_keys=True)