In [None]:
import sys
import os
src_data = os.path.join(os.path.dirname(os.getenv("DATA_DIR")),"src/data")
sys.path.append(src_data)
import preprocess as prep
import datetime
import colorsys
import pandas as pd
import re
import numpy as np
from ast import literal_eval
from collections import Counter
import pprint
import networkx as nx
import pygraphviz
from networkx.drawing.nx_agraph import graphviz_layout
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Build the path of the preprocessed input file under <DATA_DIR>/output.
# NOTE(review): os.getenv returns None when DATA_DIR is unset, in which case
# os.path.join below fails — confirm the variable is exported before running.
DATA_DIR = os.getenv("DATA_DIR")
filename = "preprocessed_with_dupes_31_10_taxon2.csv.gz"
path = os.path.join(DATA_DIR,"output", filename)

In [None]:
# Load the gzip-compressed, tab-separated input table into `df`.
df = pd.read_csv(path,sep="\t",compression="gzip")

In [None]:
df.shape

In [None]:
df.columns

In [None]:
any(df.Sequence.duplicated())

In [None]:
# Columns named Taxon*/Page* whose first value is a string containing "[" are
# treated as stringified Python literals and parsed back with literal_eval.
# Only the first row is inspected to decide whether a column needs parsing.
list_column_pattern = re.compile("^Taxon|^Page")
for col in df.columns:
    if not list_column_pattern.search(col):
        continue
    first_value = df[col].iloc[0]
    if isinstance(first_value, str) and "[" in first_value:
        print(col)
        df[col] = df[col].map(literal_eval)

## Count taxons within journeys
### Setup

In [None]:
def unique_taxon_flat_unique(taxon_list):
    """Count the distinct taxon ids after splitting every entry on ","."""
    distinct_ids = {taxon_id for entry in taxon_list for taxon_id in entry.split(",")}
    return len(distinct_ids)
def unique_taxon_nested_unique(taxon_list):
    """Count the distinct entries of ``taxon_list`` without splitting them."""
    return len(set(taxon_list))
def unique_taxon_flat_pages(taxon_list):
    """Count every taxon id, repeats included, after splitting each entry on ","."""
    return sum(len(entry.split(",")) for entry in taxon_list)
def unique_taxon_nested_pages(taxon_list):
    """Count the entries of ``taxon_list``, repeats included, without splitting them."""
    return len(list(taxon_list))

In [None]:
df.iloc[0].Sequence

In [None]:
# Spot-check the four counting helpers on the taxon list of a single row.
target = df.Taxon_List.iloc[1]
print(target)
for count_fn in (
    unique_taxon_flat_unique,
    unique_taxon_nested_unique,
    unique_taxon_flat_pages,
    unique_taxon_nested_pages,
):
    print(count_fn(target))

In [None]:
# Add one count column per helper, each computed from Taxon_List.
taxon_count_columns = [
    ('taxon_flat_unique', unique_taxon_flat_unique),
    ('taxon_nested_unique', unique_taxon_nested_unique),
    ('taxon_flat_pages', unique_taxon_flat_pages),
    ('taxon_nested_pages', unique_taxon_nested_pages),
]
for column_name, count_fn in taxon_count_columns:
    df[column_name] = df['Taxon_List'].map(count_fn)

In [None]:
# Summary statistics (without the "count" row), rendered in fixed-point notation.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map — confirm against the installed pandas version.
df.describe().drop("count").applymap(lambda x: format(x,"f"))

In [None]:
# Same summary statistics (without the "count" row), rounded to two decimals.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map — confirm against the installed pandas version.
df.describe().drop("count").applymap(lambda x: '%.2f' % x)

In [None]:
# Show the Taxon_List of rows whose taxon_flat_unique equals 429.
# NOTE(review): 429 is hard-coded — presumably the maximum reported by the
# summary above; verify after any change to the input data.
df[df.taxon_flat_unique == 429].Taxon_List.values

In [None]:
# Show the Sequence of rows whose taxon_flat_unique count is 0.
df[df.taxon_flat_unique == 0].Sequence.values

In [None]:
def taxon_split(taxon_list):
    """Flatten ``taxon_list`` into one list by splitting every entry on ","."""
    flattened = []
    for entry in taxon_list:
        flattened.extend(entry.split(","))
    return flattened

In [None]:
#### Tally how often each taxon id occurs across all rows (no id is filtered out here)
taxon_counter = Counter()
for taxon_list in df["Taxon_List"]:
    # Counter.update adds one per element of the flattened list.
    taxon_counter.update(taxon_split(taxon_list))
len(taxon_counter)

In [None]:
list(taxon_counter.keys())[0:10]

In [None]:
taxon_counter.most_common(10)

In [None]:
# Taxon lookup table; its content_id and title columns are used by get_taxon_name.
# NOTE(review): the path is relative to the notebook's working directory rather
# than DATA_DIR — confirm this is intended.
taxon_df = pd.read_csv("taxon_level_df.tsv",sep='\t')

### Assign unique parent taxons per journey

In [None]:
# Derive a `subpaths` column from Page_List with the project's preprocess helper.
# presumably pairs of consecutive pages; verify against prep.subpaths_from_list
df['subpaths'] = df['Page_List'].map(prep.subpaths_from_list)

In [None]:
for val in df[['Page_List','subpaths']].iloc[0].values:
    pprint.pprint(val)
    print("\n====")

### Create new subpaths where each element is a (page, parent taxon) pair (pick one taxon?)

In [None]:
def get_taxon_name(taxon_id):
    """Return the title of the taxon whose ``content_id`` equals ``taxon_id``.

    The lookup runs against the notebook-level ``taxon_df`` table. The title
    of the first matching row is returned; ``None`` is returned when no row
    matches.
    """
    # Build the match once instead of scanning the content_id column twice
    # (once for a membership test and again to select the row).
    matching_titles = taxon_df.loc[taxon_df.content_id == taxon_id, "title"]
    if matching_titles.empty:
        return None
    return matching_titles.iloc[0]

In [None]:
def taxon_title(taxon_id_list):
    """Map every taxon id in ``taxon_id_list`` to its title via ``get_taxon_name``.

    Ids without a match yield ``None`` in the corresponding position.
    """
    titles = []
    for taxon_id in taxon_id_list:
        titles.append(get_taxon_name(taxon_id))
    return titles

In [None]:
def subpaths_from_pcd_list(pcd_list):
    """Pair each element of ``pcd_list`` with its successor, with taxon ids turned into titles.

    ``pcd_list`` is a sequence of ``(page, taxon_ids)`` pairs. The result is a
    list of two-element lists ``[(page, titles), (next_page, next_titles)]``,
    one per consecutive pair; it is empty when ``pcd_list`` has fewer than two
    elements. Consecutive results share the tuple of their common element.
    """
    if len(pcd_list) < 2:
        return []
    # Resolve titles once per element: every interior element takes part in
    # two consecutive pairs, so resolving per pair would repeat the lookups.
    titled = [(page, taxon_title(taxons)) for page, taxons in pcd_list]
    return [[current, following] for current, following in zip(titled, titled[1:])]

In [None]:
test_journey = df[df.PageSeq_Length>4].iloc[0]

In [None]:
pprint.pprint([p for p,_ in test_journey.Taxon_Page_List])

In [None]:
for i,element in enumerate(subpaths_from_pcd_list(test_journey.Taxon_Page_List)):
    print(i,element,"\n====")

In [None]:
# Store the (page, taxon titles) transition pairs of every row in `taxon_subpaths`.
# NOTE(review): this resolves a title for every taxon id of every page, each via
# a lookup in taxon_df — may be slow on the full frame.
df['taxon_subpaths'] = df['Taxon_Page_List'].map(subpaths_from_pcd_list)

In [None]:
# taxon_title(df.Taxon_Page_List.iloc[0][0][1])

# def add_to_taxon_dict(diction,taxon_list):
#     for taxon in taxon_list:
#         if taxon not in diction.keys():
#             diction[taxon] = get_taxon_name(taxon)

# df.Taxon_Page_List.iloc[0][0][1]

# df.Taxon_Page_List.iloc[0][1][1]

# taxon_name = {}
# add_to_taxon_dict(taxon_name,df.Taxon_Page_List.iloc[0][0][1]+df.Taxon_Page_List.iloc[0][1][1])

# taxon_name

# df.shape

# print(datetime.datetime.now().strftime("[%H:%M:%S]"))

## Graph visualisation

## Graph page-to-page transitions, coloured by taxon (parent taxon?)

In [None]:
def add_page_taxon(diction,key,value):
    """Store ``value`` under ``key`` in ``diction`` unless the key is already present."""
    # setdefault leaves an existing entry untouched; its return value is discarded.
    diction.setdefault(key, value)

In [None]:
# Accumulators for the page-transition graph.
adjacency_list = {}            # start page -> list of distinct end pages
adjacency_counter = Counter()  # (start, end) -> summed Occurrences
freq_filter = 1000
dupe_count = 0
page_taxon_title = {}          # page -> taxon titles recorded the first time the page is kept

# Rows are visited from the highest to the lowest Occurrences value.
journeys_by_occurrence = df.sort_values(by="Occurrences", ascending=False)
for i, tup in enumerate(journeys_by_occurrence.itertuples()):
    for (start, start_taxons), (end, end_taxons) in subpaths_from_pcd_list(tup.Taxon_Page_List):
        edge = (start, end)
        adjacency_counter[edge] += tup.Occurrences

        # Self-transitions, and transitions whose running total is still below
        # the frequency threshold, are not added to the graph.
        if start == end or adjacency_counter[edge] < freq_filter:
            continue

        add_page_taxon(page_taxon_title, start, start_taxons)
        add_page_taxon(page_taxon_title, end, end_taxons)

        successors = adjacency_list.setdefault(start, [])
        if end not in successors:
            successors.append(end)

    # Stop once more than 1000 start pages have been collected.
    if len(adjacency_list) > 1000:
        break

    # Periodic progress report.
    if i % 30000 == 0:
        print(datetime.datetime.now().strftime("[%H:%M:%S]"), "ind", i)
        print(len(adjacency_list))

In [None]:
len(adjacency_list)

In [None]:
list(adjacency_list.items())[0:10]

In [None]:
list(page_taxon_title.items())[0:10]

In [None]:
# Collapse each page's list of taxon titles into a single "_"-joined label,
# writing "None" for titles that could not be resolved.
# Values that are already strings are skipped, so re-running this cell does not
# re-join the individual characters of an existing label.
for page,taxons in page_taxon_title.items():
    if isinstance(taxons, str):
        continue
    page_taxon_title[page] = "_".join(["None" if taxon is None else taxon for taxon in taxons])

### Set up colors

In [None]:
# One colour per entry of page_taxon_title: hues evenly spaced over [0, 1),
# with saturation and value fixed at 0.5.
N = len(page_taxon_title)
HSV_tuples = [(x * 1.0 / N, 0.5, 0.5) for x in range(N)]
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]

In [None]:
# Map each taxon label to a colour; when several pages share a label, the
# colour at the position of the last such page wins.
taxon_color = {}
for i, taxon in enumerate(page_taxon_title.values()):
    taxon_color[taxon] = RGB_tuples[i]

In [None]:
digraph = nx.DiGraph()

In [None]:
# Populate the graph: each start page and its end pages become nodes carrying
# their taxon label and colour, then the start -> end edges are added.
for node, out_nodes in adjacency_list.items():
    for member in [node] + out_nodes:
        label = page_taxon_title[member]
        digraph.add_node(member, taxon=label, color=taxon_color[label])
    digraph.add_edges_from((node, o_node) for o_node in out_nodes)

In [None]:
digraph.edges()

In [None]:
# Draw the page-transition graph with nodes coloured by their taxon label.
edges = digraph.edges()
# Node colours are listed in the iteration order of digraph.nodes.
color_map = [data['color'] for _,data in digraph.nodes(data=True)]
pos = nx.nx_agraph.graphviz_layout(digraph, prog='neato')
# nx.draw forwards its keywords to draw_networkx, whose parameter names are
# `font_size` and `edgelist`; the spellings `fontsize` and `edges` are not
# recognised and are rejected as invalid arguments by recent networkx releases.
nx.draw(digraph, pos, node_size=20, font_size=12, edgelist=list(edges), node_color=color_map)
plt.show()