In [58]:
import pandas as pd

# 880 Record-based Analysis
The following analysis covers bib-level records that have a 245-880 field link, i.e. a title paired with an alternate-script title. The rows were derived from a previous analysis that captured MARC data for every volume with a 245-880 link. The data was then deduplicated by each institution's contributor ILS number (`contribsys_id`), which prevents duplicate records in cases such as serials.

In [70]:
# Load the tab-separated 880 records dataset, then print its column schema
# (non-null counts and dtypes).
df = pd.read_csv(
    '880_records_dataset.tsv',
    sep='\t',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725818 entries, 0 to 725817
Data columns (total 8 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   cid              725818 non-null  int64 
 1   namespace        725818 non-null  object
 2   contribsys_id    725818 non-null  object
 3   htid             725818 non-null  object
 4   language         725776 non-null  object
 5   selection_order  725818 non-null  int64 
 6   title            725818 non-null  object
 7   880              722821 non-null  object
dtypes: int64(2), object(6)
memory usage: 44.3+ MB


### Total records and total clusters

In [74]:
# Headline totals for records with 245-880 links: number of rows in the
# dataset and number of distinct cluster ids (`cid`).
record_total = len(df)
cluster_total = df['cid'].nunique()
print("Total bibliographic record count with 245-880 links", record_total)
print("Total clusters with 245-880 links", cluster_total)

Total bibliographic record count with 245-880 links 725818
Total clusters with 245-880 links 660606


### Contributors in Dataset
The following are the top 15 contributors in the dataset, identified by `namespace` and ranked by record count.

In [60]:
# Record counts per contributing namespace, largest first; show the top 15.
# The table is keyed by namespace, so it gets its own name instead of
# borrowing `lang_summary`, which the language section assigns separately.
namespace_summary = (
    df.groupby('namespace')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(namespace_summary.head(15).to_string(index=False))

namespace  count
      MiU 358844
      uc1 242241
       wu  28687
     keio  23193
      uva  13645
     uiug   9245
      osu   9053
      hvd   7291
      coo   5919
     nnc1   4511
      nyp   4390
      inu   3111
      umn   2282
      IaU   2279
      mcg   1508


### Languages in Dataset

In [61]:
# Distinct values of the `language` column (a missing value is listed as nan).
language_codes = df['language'].unique()
print(language_codes)

['rus' 'eng' 'chi' 'ukr' 'ara' 'jpn' 'kor' 'ger' 'srp' 'bul' 'heb' 'per'
 'yid' 'mul' 'tha' 'mac' 'gre' 'est' 'bel' 'urd' 'zxx' 'arm' 'hin' 'lav'
 'san' 'egy' 'tam' 'fre' 'uzb' 'scc' 'bos' 'hrv' 'pus' 'tib' 'dut' 'bur'
 'por' 'mnc' 'vie' nan 'fiu' 'mar' 'kan' 'pra' 'ben' 'pan' 'mwr' 'bra'
 'mai' 'raj' 'tel' 'mal' 'aze' 'grc' 'lad' 'rum' 'cze' 'und' 'guj' 'scr'
 'lit' 'tur' 'kur' 'che' 'mis' 'kaz' 'hun' 'mon' 'pli' 'pol' 'geo' 'chu'
 'lat' 'cop' 'sah' 'jap' 'ota' 'jav' 'arc' 'tah' 'gon' 'tut' 'spa' 'ber'
 'kir' 'ava' 'syr' 'kas' 'chg' 'dra' 'ita' 'snd' 'lah' 'jrb' 'him' 'inc'
 'chn' 'tuk' 'rom' 'tat' 'crh' 'bak' 'tgk' 'kaa' 'inh' 'sla' 'chm' 'chv'
 'kok' 'uig' 'doi' 'abk' 'kar' 'oss' 'myv' 'sam' 'ind' 'jpr' 'kum' 'dan'
 'oto' 'kbd' '|||' 'tgl' 'amh' 'fin' 'som' 'khm' 'akk' 'may' 'art' 'pal'
 'udm' 'lao' 'nep']


In [62]:
# Top 15 languages by record count.
# Rows with a missing language are not represented: groupby skips null keys
# by default.
language_sizes = df.groupby('language').size()
lang_summary = (
    language_sizes
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(lang_summary.head(15).to_string(index=False))


language  count
     chi 292505
     jpn 234301
     ara  61898
     rus  41339
     kor  36173
     heb  21967
     per   8300
     urd   8175
     yid   3342
     tha   2587
     eng   1893
     ukr   1688
     hin   1400
     arm   1382
     bul   1270


In [63]:
# Number of records at each `selection_order` value.
selection_order_sizes = df.groupby('selection_order').size()
so_summary = selection_order_sizes.reset_index(name='count')
print(so_summary)

    selection_order   count
0                 1  660606
1                 2   56339
2                 3    6895
3                 4    1242
4                 5     408
5                 6     184
6                 7      81
7                 8      38
8                 9      16
9                10       3
10               11       3
11               12       2
12               13       1


In [64]:
# Keep only the records whose selection order is greater than 1.
beyond_first_selection = df['selection_order'].gt(1)
so_gt_1 = df.loc[beyond_first_selection]

In [65]:
# Count the distinct cids that have at least one record with selection order > 1.
distinct_overlap_cids = so_gt_1['cid'].unique()
print("CIDs w/ mutliple contributors:", len(distinct_overlap_cids))

CIDs w/ mutliple contributors: 56339


In [66]:
# Every record (at any selection order) whose cid also appears in so_gt_1.
in_overlap_cluster = df['cid'].isin(so_gt_1['cid'])
multi_contrib_df = df[in_overlap_cluster]


In [67]:
# Ten namespaces with the most records in multi_contrib_df, largest first.
namespace_sizes = multi_contrib_df.groupby('namespace').size()
multi_contrib_summary = (
    namespace_sizes
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print(multi_contrib_summary.head(10).to_string(index=False))

namespace  count
      uc1  60457
      MiU  47711
      uva   3955
       wu   2525
      osu   1616
     uiug   1561
      coo    839
      umn    766
      hvd    498
      IaU    462


In [68]:
# Ten most frequent languages among records with selection order > 1.
overlap_language_sizes = so_gt_1.groupby('language').size()
so_gt_1_summary = (
    overlap_language_sizes
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
print("languages with overlap indicated by selection_order > 1")
print(so_gt_1_summary.head(10).to_string(index=False))

languages with overlap indicated by selection_order > 1
language  count
     chi  37590
     jpn  22053
     kor   2514
     ara   1023
     rus    826
     heb    460
     yid    194
     eng    174
     per     89
     mul     58


In [69]:
# Count the rows whose `880` value is null; the notebook treats these as
# 245 title fields whose link to an 880 field is broken.
missing_880 = df['880'].isnull()
print("245 title fields with broken links to 880:", len(df[missing_880]))

245 title fields with broken links to 880: 2997
