In [1]:
import pandas as pd 
import glob
import jsonlines
from tqdm.auto import tqdm

# Input file paths, matched by filename prefix.
# glob returns matches in arbitrary (filesystem-dependent) order, so the lists
# are sorted to keep the order of loaded records -- and of anything written out
# from them -- reproducible across runs and machines.
attribution_files = sorted(glob.glob('../data/nyt-ldc/nyt-ldc-attribution*'))
detection_files = sorted(glob.glob('../data/nyt-ldc/nyt-ldc-data-with-detectio*'))
source_type_files = sorted(glob.glob('../data/nyt-ldc/nyt-ldc-full-attribution-*'))

In [2]:
attribution_lines, detection_lines, source_lines = [], [], []

# Loading of the attribution and detection files is currently disabled;
# only the source-type files are read in this cell.
# for f in tqdm(attribution_files):
#     with jsonlines.open(f) as reader:
#         attribution_lines.extend(reader)

# for f in tqdm(detection_files):
#     with jsonlines.open(f) as reader:
#         detection_lines.extend(reader)

for f in tqdm(source_type_files):
    # Open each file through a context manager so its handle is closed as soon
    # as the records have been read, instead of staying open until the reader
    # object happens to be garbage-collected.
    with jsonlines.open(f) as reader:
        source_lines.extend(reader)

  0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Number of records accumulated from the source-type files.
len(source_lines)

716113

In [4]:
# Build one DataFrame per loaded record, with a progress bar over the records.
source_lines_df = [pd.DataFrame(record) for record in tqdm(source_lines)]

  0%|          | 0/716113 [00:00<?, ?it/s]

In [6]:
def format_id(doc_id):
    """Return ``doc_id`` cut down to its first four '/'-separated components.

    Ids that have four or fewer components are returned unchanged.
    """
    # maxsplit=4 yields at most five pieces; the first four are the same ones
    # a full split would produce.
    return '/'.join(doc_id.split('/', 4)[:4])

In [77]:
# One dict per document: how often each (affiliation, role) pair occurs,
# plus the document's id.
all_source_type_counts = []

for doc_frame in tqdm(source_lines_df):
    # The id is read from the frame's first row.
    current_doc_id = doc_frame['doc_id'].iloc[0]
    # value_counts over two columns gives a Series keyed by (affiliation, role).
    pair_counts = doc_frame[['affiliation', 'role']].value_counts()
    # Store the id in the same Series so it is exported with the counts.
    pair_counts['doc_id'] = current_doc_id
    all_source_type_counts.append(pair_counts.to_dict())

  0%|          | 0/716113 [00:00<?, ?it/s]

In [82]:
def process_key(k):
    """Flatten a count-dict key into a single string.

    ``k`` is indexed positionally. If its first element is ``'doc_id'`` that
    element is returned alone; otherwise the first two elements are joined
    with ``'----'``.
    (Presumably ``k`` is an (affiliation, role) tuple or a tuple starting with
    'doc_id' -- confirm against the producer of these dicts.)
    """
    head = k[0]
    if head == 'doc_id':
        return head
    return f'{head}----{k[1]}'

# Write one JSON object per document, with every key flattened to a string by process_key.
with jsonlines.open('../data/nyt-ldc/source-type-counts.jsonl', 'w') as writer:
    for raw_counts in tqdm(all_source_type_counts):
        writer.write({process_key(key): value for key, value in raw_counts.items()})

  0%|          | 0/716113 [00:00<?, ?it/s]

In [92]:
# Load the per-document counts, treat missing source types as zero, keep only
# documents whose 'oom error----oom error' count is zero, and index by doc_id.
all_counts_df = (
    pd.read_json('../data/nyt-ldc/source-type-counts.jsonl', lines=True)
      .fillna(0)
      .loc[lambda counts: counts['oom error----oom error'] == 0]
      .set_index('doc_id')
)

In [None]:
# Article metadata; the first CSV column is used as the index.
metadata_df = pd.read_csv(
    '../data/nyt-ldc/nyt-ldc-docs-to-score.csv.gz',
    index_col=0,
)
# Remember which columns come from the metadata and which from the counts.
metadata_cols, all_source_cols = metadata_df.columns, all_counts_df.columns

In [142]:
# all_counts_df.columns = pd.MultiIndex.from_tuples( list(map(lambda x: x.split('----'), all_counts_df.columns)))

In [143]:
# Add a per-document total across all count columns, then attach the metadata
# by matching the counts' index against metadata_df['id'].
all_counts_with_metadata = all_counts_df.assign(
    all_source_count=all_counts_df.sum(axis=1)
).merge(metadata_df, left_index=True, right_on='id')

In [154]:
# 'online_sections' holds '; '-separated section names; split it and give each
# section its own row, so a document appears once per section it belongs to.
sources_per_section = all_counts_with_metadata.assign(
    online_cols_split=all_counts_with_metadata['online_sections'].str.split('; ')
).explode('online_cols_split')

# Mean of all_source_count within each section, smallest first.
num_sources_per_section = (
    sources_per_section
        .groupby('online_cols_split')['all_source_count']
        .mean()
        .sort_values()
)

In [155]:
# Mean all_source_count per online section, in ascending order.
num_sources_per_section

online_cols_split
Obituaries              5.417801
Dining and Wine         6.643762
Arts                    7.289793
Theater                 7.647337
Movies                  7.691358
Opinion                 7.930093
Automobiles             8.087426
Travel                  8.148273
Style                   8.178161
Home and Garden         8.334816
Books                   8.361420
Science                 8.844915
Technology              9.526323
New York and Region     9.539377
Magazine                9.865810
Sports                  9.972617
Job Market             10.355045
World                  10.426499
Real Estate            10.515976
Health                 10.564926
Business               10.805036
Washington             11.018568
Education              11.124660
Week in Review         11.316179
U.S.                   11.377240
The Public Editor      13.000000
Front Page             14.046002
Name: all_source_count, dtype: float64

In [170]:
# Count columns to analyse: skip any whose name contains 'Cannot Determine' or 'oom'.
source_cols_filt = [
    col
    for col in all_source_cols
    if 'Cannot Determine' not in col and 'oom' not in col
]

In [192]:
# Five largest source types per online section, as shares of that section's
# counts over the filtered columns.
(sources_per_section
     .groupby('online_cols_split')[source_cols_filt]
     .sum()
     # Turn per-section totals into row-wise shares.
     .pipe(lambda totals: totals.div(totals.sum(axis=1), axis=0))
     # 'affiliation----role' -> 'affiliation, role' for display.
     .rename(columns=lambda col: ', '.join(col.split('----')))
     # Per section: shares rounded to two decimals, ordered largest first.
     .apply(
         lambda shares: {
             label: round(share, 2)
             for label, share in shares.sort_values(ascending=False).items()
         },
         axis=1,
     )
     # Keep the first five entries and spread them over columns 0-4 as 'label: share'.
     .apply(
         lambda ranked: pd.Series({
             rank: f'{label}: {share}'
             for rank, (label, share) in enumerate(list(ranked.items())[:5])
         })
     )
)

Unnamed: 0_level_0,0,1,2,3,4
online_cols_split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arts,"Actor, Participant: 0.25","Witness, Participant: 0.13","Media, Participant: 0.11","Media, Informational: 0.08","Government, Participant: 0.07"
Automobiles,"Corporate, Participant: 0.25","Witness, Participant: 0.13","Corporate, Informational: 0.12","Actor, Participant: 0.09","Media, Informational: 0.06"
Books,"Actor, Participant: 0.22","Witness, Participant: 0.13","Media, Participant: 0.1","Media, Informational: 0.09","Government, Participant: 0.06"
Business,"Corporate, Informational: 0.22","Corporate, Participant: 0.21","Government, Participant: 0.11","Government, Informational: 0.06","Industry Group, Informational: 0.05"
Dining and Wine,"Witness, Participant: 0.19","Actor, Participant: 0.14","Corporate, Participant: 0.12","Media, Informational: 0.08","Witness, Informational: 0.08"
Education,"Government, Participant: 0.25","Academic, Participant: 0.11","Witness, Participant: 0.08","Academic, Informational: 0.07","Actor, Participant: 0.07"
Front Page,"Government, Participant: 0.31","Government, Informational: 0.12","Political Group, Participant: 0.07","Witness, Participant: 0.05","Corporate, Participant: 0.04"
Health,"Government, Participant: 0.19","Academic, Informational: 0.16","Government, Informational: 0.08","Corporate, Participant: 0.06","Actor, Participant: 0.06"
Home and Garden,"Actor, Participant: 0.17","Witness, Participant: 0.15","Corporate, Participant: 0.12","Media, Informational: 0.07","Academic, Informational: 0.06"
Job Market,"Corporate, Participant: 0.17","Actor, Participant: 0.13","Witness, Participant: 0.12","Corporate, Informational: 0.07","Academic, Informational: 0.06"


In [15]:
# Stack every per-record frame in source_lines_df into one long frame.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt.
# Concatenating one DataFrame per record (716,113 in the recorded run) is
# likely very slow -- consider building a single frame directly from the raw
# records once their schema is confirmed.
all_source_lines_df = pd.concat(source_lines_df)

KeyboardInterrupt: 

In [None]:
# (rows, columns) of the concatenated frame.
all_source_lines_df.shape