In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import functools
%matplotlib inline

## Read in data
Source EDA-prepare-data.ipynb, clean_taxons.py, clean_content.py

In [None]:
labelled = pd.read_csv('../../data/labelled.csv', dtype=object)
filtered = pd.read_csv('../../data/filtered.csv', dtype=object)

In [None]:
labelled_level2 = pd.read_csv('../../data/labelled_level2.csv', dtype=object)

In [None]:
taxons = pd.read_csv('../../data/clean_taxons.csv', dtype=object)

# Taxons

In [None]:
#Count taxons by level1 and level2
mask = taxons['level1taxon'].notnull() & taxons['level2taxon'].isnull()

print("There are {} out of {} taxons with a level2"
      .format(sum(taxons['level2taxon'].notnull()), taxons.shape[0]))
print("There are {} out of {} taxons with a level1 tag, but no level2 tag"
      .format(sum(mask), taxons.shape[0]))

### How many taxons at each level?

In [None]:
level1_only = taxons[taxons['level1taxon'].notnull() & taxons['level2taxon'].isnull()]
level2_only = taxons[taxons['level2taxon'].notnull() & taxons['level3taxon'].isnull()]
level3_only = taxons[taxons['level3taxon'].notnull() & taxons['level4taxon'].isnull()]
level4_only = taxons[taxons['level4taxon'].notnull() & taxons['level5taxon'].isnull()]
level5_only = taxons[taxons['level5taxon'].notnull()]

print("Number of taxons at level1 = {}".format(level1_only.shape[0]))
print("Number of taxons at level2 = {}".format(level2_only.shape[0]))
print("Number of taxons at level3 = {}".format(level3_only.shape[0]))
print("Number of taxons at level4 = {}".format(level4_only.shape[0]))
print("Number of taxons at level5 = {}".format(level5_only.shape[0]))

### How many content items at each level?

In [None]:
level1_only = filtered[filtered['level1taxon'].notnull() & filtered['level2taxon'].isnull()]
level2_only = filtered[filtered['level2taxon'].notnull() & filtered['level3taxon'].isnull()]
level3_only = filtered[filtered['level3taxon'].notnull() & filtered['level4taxon'].isnull()]
level4_only = filtered[filtered['level4taxon'].notnull() & filtered['level5taxon'].isnull()]
level5_only = filtered[filtered['level5taxon'].notnull()]

print("Number of times content items tagged to level1 = {}".format(level1_only.shape[0]))
print("Number of times content items tagged to level2 = {}".format(level2_only.shape[0]))
print("Number of times content items tagged to level3 = {}".format(level3_only.shape[0]))
print("Number of times content items tagged to level4 = {}".format(level4_only.shape[0]))
print("Number of times content items tagged to level5 = {}".format(level5_only.shape[0]))
print("")
print("Number of unique content items tagged to level1 = {}".format(level1_only.content_id.nunique()))
print("Number of unique content items tagged to level2 = {}".format(level2_only.content_id.nunique()))
print("Number of unique content items tagged to level3 = {}".format(level3_only.content_id.nunique()))
print("Number of unique content items tagged to level4 = {}".format(level4_only.content_id.nunique()))
print("Number of unique content items tagged to level5 = {}".format(level5_only.content_id.nunique()))

### Number of tags per content level, overall 

In [14]:
times_tagged_labelled=labelled.groupby('content_id').size().sort_values(ascending=True)
times_tagged_filtered=filtered.groupby('content_id').size().sort_values(ascending=True)
times_tagged_labelled2=labelled_level2.groupby('content_id').size().sort_values(ascending=True)

In [23]:
times_tagged_labelled.value_counts()

1     67940
2     36882
3     12832
4      6919
5      2113
6      1054
7       431
8       172
9        65
10       48
11       26
12       22
21       20
18       14
13       11
14        9
15        3
16        3
17        3
19        3
27        2
42        1
20        1
24        1
31        1
63        1
dtype: int64

In [24]:
print("filtered content tagged > 1 taxon ={} and to a single taxon={}".format(len(times_tagged_filtered[times_tagged_filtered>1]), len(times_tagged_filtered[times_tagged_filtered==1])))
print("labelled content tagged > 1 taxon ={}".format(len(times_tagged_labelled[times_tagged_labelled>1])))
print("labelled level2  content tagged > 1 taxon ={}".format(len(times_tagged_labelled2[times_tagged_labelled2>1])))

filtered content tagged > 1 taxon =59901 and to a single taxon=67419
labelled content tagged > 1 taxon =60637
labelled level2  content tagged > 1 taxon =38144


In [15]:
print("The percent of filtered content tagged to more than 1 taxon ={}".format(len(times_tagged_filtered[times_tagged_filtered>1])/len(times_tagged_filtered)*100))
print("The percent of labelled content tagged to more than 1 taxon ={}".format(len(times_tagged_labelled[times_tagged_labelled>1])/len(times_tagged_labelled)*100))
print("The percent of labelled level2  content tagged to more than 1 taxon ={}".format(len(times_tagged_labelled2[times_tagged_labelled2>1])/len(times_tagged_labelled2)*100))

The percent of filtered content tagged more to more than 1 taxon =47.04759660697455
The percent of labelled content tagged more to more than 1 taxon =47.160067508185755
The percent of labelled level2  content tagged more to more than 1 taxon =33.44556677890011


In [10]:
filtered.groupby('content_id').size().sort_values(ascending=True).describe()

count    127320.000000
mean          1.799277
std           1.202716
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          41.000000
dtype: float64

In [11]:
print("The percent of filtered content tagged more to more than 1 taxon ={}".format(len(times_tagged_filtered[times_tagged_labelled>1])/len(times_tagged_labelled)*100))

The percent of filtered content tagged more to more than 1 taxon =46.999074484550114


In [None]:
times_tagged_filtered.hist(bins=40)

In [None]:
labelled.columns

In [None]:
times_tagged_labelled=labelled.groupby('base_path').size().sort_values(ascending=True)
labelled.groupby('content_id').size().sort_values(ascending=True).describe()

In [None]:
times_tagged_labelled.head()

In [None]:
times_tagged_labelled.hist(bins=63)

In [None]:
print("The percent of labelled content tagged more to more than 2 taxons ={}".format(len(times_tagged_labelled[times_tagged_labelled>2])/len(times_tagged_labelled)*100))

In [None]:
len(times_tagged_labelled[times_tagged_labelled>2])

In [None]:
#save out content tagged to more than 10 taxons
morethan10taxons=times_tagged_labelled[times_tagged_labelled>10]
morethan10taxons.to_csv('../../data/taggedtomorethan10taxons.csv', index=True)
morethan10taxons.shape

### Number of tags per content item at each level

In [None]:
level1_only.groupby('content_id').size().sort_values(ascending=True).describe()

In [None]:
level2_only.groupby('content_id').size().sort_values(ascending=True).describe()

In [None]:
level3_only.groupby('content_id').size().sort_values(ascending=True).describe()

In [None]:
level4_only.groupby('content_id').size().sort_values(ascending=True).describe()

In [None]:
level5_only.groupby('content_id').size().sort_values(ascending=True).describe()

## Level1  taxons

Expecting 19 top taxons (plus `None`)

In [None]:
assert filtered.level1taxon.nunique() == 19

print("There are {} unique taxon names in the level 1 taxon"
      .format(filtered.level1taxon.nunique()))

set(filtered.level1taxon)

In [None]:
topfreq = filtered.groupby('level1taxon').size()
topfreq.sort_values(ascending=False)

In [None]:
topfreq.sort_values().plot(kind = 'barh', figsize=(20, 20))

Formally, Kendall's τ coefficient is defined as:

$${\tau ={\frac {({\text{number of concordant pairs}})-({\text{number of discordant pairs}})}{N(N-1)/2}}}$$

In [None]:
level1_counts= filtered.groupby(['content_id', 'level1taxon']).size().unstack(fill_value=0)
level1_corr=level1_counts.corr(method = "kendall")

In [None]:
#get level1taxons which are often both tagged to

s = level1_corr.unstack()
so = s.sort_values(kind="quicksort", ascending=False)

different_taxons=so[so != 1]
concordant_taxons=different_taxons[different_taxons > 0.1]
concordant_taxons

In [None]:
# plot the heatmap
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(level1_corr, 
        xticklabels=level1_corr.columns,
        yticklabels=level1_corr.columns)

In [None]:
top_doctype = pd.crosstab(filtered['document_type'], filtered['level1taxon'])
fig, ax = plt.subplots(figsize=(10,10)) 
sns.heatmap(top_doctype, cmap = "YlGnBu")

In [None]:
top_pubapp = pd.crosstab(filtered['publishing_app'], filtered['level1taxon'])
fig, ax = plt.subplots(figsize=(10,5))
sns.heatmap(top_pubapp, cmap = "YlGnBu")

In [None]:
filtered.groupby('level1taxon').size().sort_values(ascending=False)

### Level 2 taxons

In [None]:
#assert content_taxons_dedup.level2taxon.nunique() == 103

print("There are {} unique taxon names in the level 2 taxon"
      .format(filtered.level2taxon.nunique()))

In [None]:
second_freq = filtered.groupby('level2taxon').size().sort_values(ascending=False)

# Print as string to get around truncation

print(second_freq.to_string())



In [None]:
second_freq.sort_values().plot(kind = 'barh', figsize=(10,30))

In [None]:
level2_counts= filtered.groupby(['content_id', 'level2taxon']).size().unstack(fill_value=0)
level2_corr=level2_counts.corr(method = "kendall")

In [None]:
#get level1taxons which are often both tagged to

s = level2_corr.unstack()
so = s.sort_values(kind="quicksort", ascending=False)

different_2taxons=so[so != 1]
concordant_2taxons=different_2taxons[different_2taxons > 0.1]

In [None]:
concordant_2taxons.shape

In [None]:
concordant_2taxons.to_csv('../../data/level2taxons_concordant.csv', index=True)

In [None]:
# plot the heatmap
# fig, ax = plt.subplots(figsize=(20,20))
# sns.set(font_scale=0.4)
# sns.heatmap(level2_corr, 
#         xticklabels=level2_corr.columns,
#         yticklabels=level2_corr.columns)

### Mutually exclusive taxon combinations (level1 and level2)

In [None]:
#Only keep rows where level1/level2 combination is unique
level2_dedup = labelled.drop_duplicates(subset = ['content_id', 'level1taxon', 'level2taxon']).copy()
#Identify and drop rows where level2 is missing
mask= pd.notnull(level2_dedup['level2taxon'])
level2_tagged = level2_dedup[mask]

In [None]:
#concatenate the name of each level2taxon for a single content item
level2_tagged = level2_tagged.groupby('content_id')['level2taxon'].apply('-;-'.join).reset_index()
print("there are {} mutually exclusive combinations of level2 taxon combinations.".format(level2_tagged.level2taxon.nunique()))

In [None]:
inline_rc = dict(mpl.rcParams)
#Get frequency counts for each of the mutually exclusive taxon2 combinations
mutualex_freq = level2_tagged.groupby('level2taxon').size().sort_values(ascending=True)

#Keep those with higher frequency
mutualex_freq_top = mutualex_freq[mutualex_freq > 100]
print("There are {} mutually exclusive combinations of level2 taxon combinations populated with more than 100 content items".format(len(mutualex_freq_top)))

mutualex_freq_top.plot(kind = 'barh', figsize=(10,30))

In [None]:
# Print as string to get around truncation

print(mutualex_freq.to_string())

## Ariana counts
-exlude world and corporate
-count how many taxons at each level by level 2