In [1]:
import ast
import json
import xml.dom.minidom

import numpy as np
import pandas as pd

In [2]:
# keep_files.txt is JSON mapping a key to a list of article paths; only the
# paths are needed, flattened in file order. The `with` block guarantees the
# handle is closed (the bare open() previously leaked it).
with open('keep_files.txt', 'r') as keep_fh:
    keep_files = json.load(keep_fh)
all_files_paths = [path for paths in keep_files.values() for path in paths]

In [4]:
# Load the label vocabulary: one label per line, surrounding whitespace
# removed. The context manager closes the file even if reading fails.
with open('nyt_label.vocab', 'r') as vocab_file:
    label_vocab = [line.strip() for line in vocab_file]

In [4]:
def get_text_for_file(f_p):
    """Extract body text and taxonomy labels from one article XML document.

    Parameters
    ----------
    f_p : str or file object
        Passed straight to ``xml.dom.minidom.parse``.

    Returns
    -------
    tuple[str, list[str]]
        ``text`` is the stripped leading text of every ``<p>`` element except
        the first, joined with single spaces. ``sample_label`` lists, in
        first-seen order and without duplicates, every prefix of each
        ``taxonomic_classifier`` path that appears in the module-level
        ``label_vocab``.
    """
    dom = xml.dom.minidom.parse(f_p)
    root = dom.documentElement

    # NOTE(review): the first <p> is intentionally skipped, as before;
    # presumably it duplicates the lead paragraph -- confirm against the corpus.
    paragraphs = []
    for p_tag in root.getElementsByTagName('p')[1:]:
        # An empty element (<p/>) has firstChild None and an element child has
        # no .data; skip those instead of raising AttributeError.
        p_data = getattr(p_tag.firstChild, 'data', None)
        if p_data is None:
            continue
        paragraphs.append(p_data.strip())
    text = ' '.join(paragraphs).strip()

    sample_label = []
    for cls_tag in root.getElementsByTagName('classifier'):
        if cls_tag.getAttribute('type') != 'taxonomic_classifier':
            continue
        hier_path = getattr(cls_tag.firstChild, 'data', None)
        if hier_path is None:
            # Classifier element without text content: nothing to label.
            continue
        hier_list = hier_path.split('/')
        # Paths with fewer than three components are ignored entirely.
        if len(hier_list) < 3:
            continue
        # Walk every prefix of the path so ancestors are labelled too.
        for depth in range(1, len(hier_list) + 1):
            label = '/'.join(hier_list[:depth])
            if label == 'Top':
                # The bare root is never emitted as a label.
                continue
            if label not in sample_label and label in label_vocab:
                sample_label.append(label)

    return text, sample_label

In [5]:
# Collect one [id, topics] row per kept article. The article path serves as
# the id; the parsed body text is returned but not stored.
data = []
for f_path in all_files_paths:
    text, label = get_text_for_file(f_path)
    data += [[f_path, label]]

In [6]:
# Two-column table: article path as `id`, list of taxonomy labels as `topics`.
df = pd.DataFrame(data=data, columns=['id', 'topics'])
df.head(5)

Unnamed: 0,id,topics
0,data/2006/05/31/1765639.xml,"[Top/Opinion, Top/Opinion/Opinion, Top/Opinion..."
1,data/2006/05/31/1765692.xml,"[Top/News, Top/News/New York and Region, Top/F..."
2,data/2006/05/31/1765687.xml,"[Top/News, Top/News/New York and Region, Top/F..."
3,data/2006/05/03/1758870.xml,"[Top/News, Top/News/U.S., Top/News/U.S./U.S. S..."
4,data/2006/05/03/1758868.xml,"[Top/Features, Top/Features/Arts, Top/Features..."


In [7]:
# Persist the id/topics table; index=False keeps the row index out of the file.
df.to_csv('nyt_metadata.csv', index=False)

In [8]:
# Report (n_rows, n_columns) of the id/topics table.
df.shape

(36355, 2)

## Data Level Statistics

In [2]:
# Reload the saved table. The CSV stores each topics list as its text form,
# so it is parsed back into a Python list. ast.literal_eval only accepts
# literals, unlike eval, which would execute arbitrary code found in the file.
df = pd.read_csv('nyt_metadata.csv')
df['topics'] = df['topics'].apply(ast.literal_eval)

In [12]:
# One row per (document, topic) pair: each element of a topics list gets its
# own row, with the document id repeated.
ndf = df.explode('topics')

In [18]:
# Hierarchy depth of each topic: number of '/'-separated parts minus one,
# so 'Top/News' is level 1 and 'Top/News/U.S.' is level 2.
ndf['topic_level'] = ndf.topics.str.split('/').str.len()-1

In [None]:
# Distinct topics seen at each hierarchy depth. `.str.len()` then measures
# the length of each per-level array of unique topics.
topic_levels_df = ndf.groupby('topic_level')['topics'].unique().reset_index()
topic_levels_df['num_topics'] = topic_levels_df['topics'].str.len()

In [None]:
# Distinct document ids carrying at least one topic at each hierarchy depth,
# and how many such documents there are.
level_docs = ndf.groupby('topic_level')['id'].unique().reset_index()
level_docs['num_docs'] = level_docs['id'].str.len()

In [33]:
# Join the per-level document counts with the per-level topic counts and keep
# only the summary columns.
level_topic_docs = pd.merge(level_docs, topic_levels_df, on='topic_level')
level_topic_docs = level_topic_docs[['topic_level', 'num_docs', 'num_topics']]
level_topic_docs

Unnamed: 0,topic_level,num_docs,num_topics
0,1,36355,4
1,2,36342,27
2,3,25441,51
3,4,21335,47
4,5,15169,17
5,6,10172,12
6,7,5503,6
7,8,2901,2


In [37]:
# Per-topic view: the distinct documents tagged with each topic, their count,
# and the topic's depth (number of '/'-separated parts minus one).
topic_docs = ndf.groupby('topics')['id'].unique().reset_index()
topic_docs['num_docs'] = topic_docs['id'].str.len()
topic_docs['topic_level'] = topic_docs['topics'].str.split('/').str.len() - 1
topic_docs

Unnamed: 0,topics,id,num_docs,topic_level
0,Top/Classifieds,"[data/2006/05/31/1765687.xml, data/2006/05/03/...",12044,1
1,Top/Classifieds/Job Market,"[data/2006/05/31/1765687.xml, data/2006/05/03/...",9006,2
2,Top/Classifieds/Job Market/Job Categories,"[data/2006/05/31/1765687.xml, data/2006/05/03/...",8999,3
3,Top/Classifieds/Job Market/Job Categories/Art ...,"[data/2006/05/26/1764346.xml, data/2006/05/11/...",436,4
4,Top/Classifieds/Job Market/Job Categories/Bank...,"[data/2006/05/02/1758721.xml, data/2006/05/02/...",1982,4
...,...,...,...,...
161,Top/Opinion/Opinion/Editorials,"[data/2006/05/09/1760429.xml, data/2006/05/29/...",566,3
162,Top/Opinion/Opinion/Letters,"[data/2006/05/31/1765639.xml, data/2006/05/28/...",1685,3
163,Top/Opinion/Opinion/Op-Ed,"[data/2006/05/28/1765050.xml, data/2006/05/17/...",468,3
164,Top/Opinion/Opinion/Op-Ed/Columnists,"[data/2006/05/28/1765050.xml, data/1998/12/12/...",181,4


In [38]:
# Median number of documents per topic, computed within each hierarchy depth.
topic_docs.groupby('topic_level')['num_docs'].median().reset_index()

Unnamed: 0,topic_level,num_docs
0,1,16077.5
1,2,1145.0
2,3,474.0
3,4,392.0
4,5,424.0
5,6,287.5
6,7,362.0
7,8,1458.5


In [42]:
# Ratio of distinct documents at a level to distinct topics at that level.
# NOTE(review): num_docs counts each document once per level, so a document
# with several topics at the same depth is not counted per topic; this ratio
# may therefore differ from the mean of the per-topic counts -- confirm that
# this is the intended statistic.
level_topic_docs['avg_num_docs'] = level_topic_docs['num_docs'].div(level_topic_docs['num_topics'])
level_topic_docs[['topic_level', 'avg_num_docs']]

Unnamed: 0,topic_level,avg_num_docs
0,1,9088.75
1,2,1346.0
2,3,498.843137
3,4,453.93617
4,5,892.294118
5,6,847.666667
6,7,917.166667
7,8,1450.5
