In [1]:
import json
import os
from ast import literal_eval
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# The data root comes from the environment. Fail early with a clear message
# rather than letting os.path.join raise an opaque TypeError on None.
DATA_DIR = os.getenv("DATA_DIR")
if DATA_DIR is None:
    raise RuntimeError("The DATA_DIR environment variable must be set")
filename = "preprocessed_with_dupes_31_10_taxon2.csv.gz"
path = os.path.join(DATA_DIR, "output", filename)

In [3]:
# Load the gzip-compressed, tab-separated file located at `path`.
df = pd.read_csv(
    path,
    sep="\t",
    compression="gzip",
)

In [None]:
# Taxon_List is read from the CSV as text, so parse each value back into a
# Python literal. Values that are already sequences are passed through, which
# keeps this cell safe to re-run: literal_eval raises ValueError when handed
# an already-parsed list.
df["Taxon_List"] = df["Taxon_List"].map(
    lambda value: value if isinstance(value, (list, tuple)) else literal_eval(value)
)

In [None]:
def taxon_split(taxon_list):
    """Flatten comma-separated taxon strings into a single list.

    Every element of ``taxon_list`` is split on "," and the resulting pieces
    are returned in their original order. Pieces are not stripped, filtered
    or de-duplicated.
    """
    pieces = []
    for entry in taxon_list:
        pieces.extend(entry.split(","))
    return pieces

#### Build list of unique taxons, excluding "other"

In [None]:
# Tally how many times each individual taxon occurs across all rows of df.
# NOTE(review): the heading mentions excluding "other", but nothing in this
# cell filters it out - confirm that this happens upstream.
taxon_counter = Counter(
    taxon
    for taxon_list in df["Taxon_List"]
    for taxon in taxon_split(taxon_list)
)
len(taxon_counter)

#### Map taxon `content_id` to `base_path` using content tagger extract

In [None]:
# The taxon export lives under the directory named by the DOCUMENTS environment
# variable. Fail early with a clear message rather than letting os.path.join
# raise an opaque TypeError on None.
documents_dir = os.getenv("DOCUMENTS")
if documents_dir is None:
    raise RuntimeError("The DOCUMENTS environment variable must be set")
taxon_path = os.path.join(documents_dir, "taxons.json.gz")
taxon_df = pd.read_json(taxon_path, compression="gzip")

In [None]:
# taxon_path = os.path.join(os.path.dirname(os.getenv("DOCUMENTS")), "Downloads", "2018-11-19 Taxonomy.csv")
# taxon_df = pd.read_csv(taxon_path)

In [None]:
# Preview the first rows only; the full shape is shown in the next cell.
taxon_df.head()

In [None]:
# Number of rows and columns in the taxon export.
taxon_df.shape

In [None]:
# Column names available in the taxon export.
taxon_df.columns

## Count taxons present in both journeys and taxon export and write to file

In [None]:
# For every taxon seen in the journeys, look it up in the taxon export and
# write its descriptive fields to a TSV. `found` counts the taxons that matched.
#
# Keep the first export row per content_id (the row the previous `.iloc[0]`
# lookup used) and index by content_id, so each lookup is an index access
# instead of a full scan of taxon_df.
taxon_lookup = taxon_df.drop_duplicates(subset="content_id").set_index("content_id")

found = 0
# An explicit encoding keeps the output independent of the platform locale.
with open("taxon_id_title_311018.tsv", "w", encoding="utf-8") as writer:
    writer.write("content_id\ttitle\tbase_path\tparent_content_id\n")
    for taxon in taxon_counter:
        if taxon not in taxon_lookup.index:
            continue
        found += 1
        taxon_row = taxon_lookup.loc[taxon]
        writer.write("{}\t{}\t{}\t{}\n".format(taxon,
                                               taxon_row["title"],
                                               taxon_row["base_path"],
                                               taxon_row["parent_content_id"]))
found

In [None]:
# Matched journey taxons as a percentage of the rows in the taxon export.
100 * found / taxon_df.shape[0]

## Translate content_id to level + parents

def recursive_parenting(df,content_id,parent_content_id,parent_list):
    """Collect the ancestors of a taxon, topmost ancestor first.

    Starting from ``parent_content_id``, follows the ``parent_content_id``
    column of ``df`` upwards until a taxon without a parent is reached.

    Parameters
    ----------
    df : DataFrame with ``content_id``, ``title`` and ``parent_content_id``
        columns. When a ``content_id`` occurs more than once the first row wins.
    content_id : id of the taxon whose ancestors are wanted.
    parent_content_id : id of that taxon's direct parent; a missing value
        (NaN or None) means the taxon has no parent.
    parent_list : list that the ancestors are appended to (mutated in place);
        callers normally pass ``[]``.

    Returns
    -------
    list of ``[[ancestor_id, ancestor_parent_id, ancestor_title], level]``
    entries ordered from the topmost ancestor (level 1) down to the direct
    parent. Empty when the taxon has no parent.

    The walk also stops, keeping what was collected so far, when a parent id is
    not present in ``df`` or when the chain loops back onto a taxon that was
    already visited, so malformed exports cannot crash or hang the notebook.
    """
    visited = {content_id}
    while not pd.isna(parent_content_id) and parent_content_id not in visited:
        matches = df[df["content_id"] == parent_content_id]
        if matches.shape[0] == 0:
            # Dangling reference: the parent is not part of the export.
            break
        ancestor = matches.iloc[0]
        visited.add(parent_content_id)
        content_id = parent_content_id
        parent_content_id = ancestor["parent_content_id"]
        parent_list.append([content_id, parent_content_id, ancestor["title"]])
    # parent_list runs from direct parent to root; flip it so level 1 is the root.
    return [[entry, level] for level, entry in enumerate(reversed(parent_list), start=1)]

In [None]:
# Build one row per journey taxon that exists in the taxon export: its title,
# its depth in the taxonomy (1 = no parent), its ancestor chain and the title
# of its topmost ancestor (its own title when it has no parent).
column_list = ['content_id','title','level','parents','level1_parent']
missed = 0  # journey taxons that have no row in the taxon export
taxon_level_rows = []
for content_id in taxon_counter:
    matches = taxon_df[taxon_df.content_id == content_id]
    if matches.shape[0] == 0:
        missed += 1
        continue
    title = matches.iloc[0]["title"]
    # Kept as the values of an object Series so the stored cell (and therefore
    # the text written by to_csv) has the same form as before.
    parents = pd.Series(
        recursive_parenting(taxon_df, content_id,
                            matches["parent_content_id"].values[0], []),
        dtype=object,
    ).values
    # Each entry is [[ancestor_id, ancestor_parent_id, ancestor_title], level];
    # the first entry is the topmost ancestor.
    level1_par = parents[0][0][2] if len(parents) > 0 else title
    taxon_level_rows.append({
        'content_id': content_id,
        'title': title,
        'level': len(parents) + 1,
        'parents': parents,
        'level1_parent': level1_par,
    })
# Create the frame once instead of concatenating inside the loop, which was
# quadratic and left every row with index 0.
taxon_level_df = pd.DataFrame(taxon_level_rows, columns=column_list)

In [None]:
# Preview the first rows of the level table rather than the whole frame.
taxon_level_df.head()

In [None]:
# Persist the level table as a tab-separated file, omitting the index column.
taxon_level_df.to_csv("taxon_level_df.tsv",sep='\t',index=False)

## Count parent taxons, self-parenting if nan

In [None]:
# Sum the journey counts per parent title. A taxon counts towards its direct
# parent's title; a taxon with no parent, or whose parent id is absent from the
# export, counts towards its own title. Taxons missing from the export are skipped.
parent_taxons = Counter()
for taxon, value in taxon_counter.items():
    matches = taxon_df[taxon_df.content_id == taxon]
    if matches.shape[0] == 0:
        continue
    taxon_row = matches.iloc[0]
    parent = taxon_row["title"]  # self-parent unless a real parent is found
    parent_id = taxon_row["parent_content_id"]
    if isinstance(parent_id, str):
        parent_matches = taxon_df[taxon_df.content_id == parent_id]
        # Guard against dangling parent ids instead of raising IndexError.
        if parent_matches.shape[0] > 0:
            parent = parent_matches.iloc[0]["title"]
    parent_taxons[parent] += value

In [None]:
# The 30 parent titles with the highest counts, as (title, count) pairs.
parent_taxons.most_common(30)

In [None]:
# Number of distinct parent titles that received counts.
len(parent_taxons)