### Load Libraries and Clean Initial Dataset

In [1]:
import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()
import os
import warnings
warnings.filterwarnings('ignore')
import sys
from ast import literal_eval
from tqdm import tqdm
import leafmap.kepler as leafmap
import geopandas as gpd
import numpy as np
import re
import matplotlib.pyplot as plt

sys.path.append("..")
from scripts.data_utils import get_initial_dataset, get_image_sizes, get_local_image_locations, get_contributors, get_dates
from scripts.text_manipulation import get_tfidf_vectorizer, get_count_vectorizer, get_spacy_subjects

In [2]:
# Build the listing of image files found under private_materials/data.
# NOTE(review): presumably the listing is also written to `output_path` — confirm in scripts.data_utils.
local_files = get_local_image_locations(dir='private_materials/data', output_path='private_materials/datajam_data/local_files.csv')

In [3]:
# Load the cleaned metadata table; `is_sample` is forwarded to the loader.
# NOTE(review): the CSV path says "sample" while is_sample is False — confirm this is the intended combination.
is_sample = False
merged_df = get_initial_dataset('private_materials/datajam_data/initial_cleaned_sample_dataset.csv', is_sample)

In [4]:
# Flag each metadata row whose file_id also appears in the local file listing.
merged_df['file_exists'] = merged_df['file_id'].isin(local_files['file_id'])
# merged_df[merged_df['file_exists'] == True]

In [5]:
merged_df = get_image_sizes(merged_df)

Getting image sizes: 100%|██████████| 338/338 [00:00<00:00, 596.63it/s]


In [6]:
# Replace dots in column names with underscores so columns can be used with
# attribute access (e.g. merged_df.item_place).
# regex=False is explicit because "." is a regex wildcard: depending on the
# default of `regex` in the installed pandas, the pattern could otherwise be
# interpreted as "any character" instead of a literal dot.
merged_df.columns = merged_df.columns.str.replace(".", "_", regex=False)

### Explore Image Sizes

In [7]:
# Count images per width. rename_axis(...) + reset_index(name=...) names both
# output columns explicitly instead of renaming whatever labels
# value_counts().reset_index() happens to produce (those labels differ
# between pandas 1.x and 2.x, which silently broke the old rename mapping).
width_df = (
    merged_df.width.value_counts()
    .rename_axis('width')
    .reset_index(name='counts')
)
# Share of all measured images that have each width, rounded to whole percent.
width_df['percentage'] = round(width_df['counts'] / width_df['counts'].sum() * 100)

width_df.sort_values(by='counts', ascending=False).head(10)

Unnamed: 0,width,counts,percentage
0,640.0,38066,97.0
1,570.0,25,0.0
2,580.0,20,0.0
4,565.0,19,0.0
3,575.0,19,0.0
5,568.0,18,0.0
6,577.0,18,0.0
10,588.0,17,0.0
11,592.0,17,0.0
12,579.0,17,0.0


In [8]:
# Derive both figures from the data so the sentence cannot drift out of sync
# with the dataset (the share was previously typed in by hand as "97%").
# Rows with a missing width count as "different from 640", as before.
n_other_width = len(merged_df[merged_df.width != 640.0])
pct_width_640 = (merged_df.width == 640.0).mean() * 100
f"In total {n_other_width} images have a different width than 640, while {pct_width_640:.0f}% of the images have a width of 640."

'In total 1367 images have a different width than 640, while 97% of the images have a width of 640.'

In [9]:
# Height distribution of the images that have the standard 640px width.
# Column names are set explicitly (rename_axis / reset_index(name=...)) so the
# result is ['height', 'counts'] regardless of how the installed pandas labels
# the value_counts() output.
height_df = (
    merged_df[merged_df.width == 640.0]
    .height.value_counts()
    .rename_axis('height')
    .reset_index(name='counts')
)

In [10]:
height_df[height_df.height < 350].describe()

Unnamed: 0,height,counts
count,56.0,56.0
mean,321.285714,628.589286
std,16.678472,1081.790952
min,289.0,1.0
25%,307.75,27.0
50%,321.5,171.5
75%,335.25,580.25
max,349.0,4953.0


In [11]:
# Left panel: every observed height among the 640px-wide images.
chart = (
    alt.Chart(height_df)
    .mark_bar()
    .encode(x='height', y='counts')
    .properties(title='Distribution of image heights of images with width 640')
)

# Right panel: only heights below 350px, with a red rule at their mean.
short_heights = height_df[height_df.height < 350]
short_bars = (
    alt.Chart(short_heights)
    .mark_bar()
    .encode(x='height', y='counts')
    .properties(title='Distribution of image heights of images with width 640 and height < 350')
)
rule = alt.Chart(short_heights).mark_rule(color='red').encode(x='mean(height)')
chart1 = short_bars + rule
chart | chart1

If we need to make the images the same size, I would recommend a crop of 320 × 640 based on this distribution (we could try resizing instead, but that might lose too much information).

### Identify Potential Data Columns of Interest

In [12]:
# One row per column: how many distinct values it holds, and that number as a
# percentage of the total row count.
distinct_per_column = merged_df.nunique()
unique_vals = distinct_per_column.rename_axis('column_name').reset_index(name='unique_values')
unique_vals['total_percent'] = unique_vals['unique_values'] / len(merged_df) * 100


In [13]:
# Bucket the columns into three equal-sized quantile groups by total_percent.
# The labels are relative to the other columns, not to a fixed threshold.
unique_vals['category'] = pd.qcut(unique_vals['total_percent'], 3, labels=["too few", "just right", "too many"])

In [14]:
# Only plot columns whose distinct values make up less than 10% of the rows.
low_cardinality_cols = unique_vals[unique_vals.total_percent < 10]
chart = (
    alt.Chart(low_cardinality_cols)
    .mark_bar()
    .encode(
        x='total_percent',
        y=alt.Y('column_name', sort='-x'),
        color='category',
    )
    .properties(title='Unique Values per Column')
)
chart

In [15]:
# chart.save('process_notes/total_unique_values.png', scale_factor=2.0)

In [16]:
unique_vals[unique_vals.category == 'just right'].sort_values('column_name')

Unnamed: 0,column_name,unique_values,total_percent,category
109,cleaned_contributor,507,1.285725,just right
107,cleaned_date,998,2.530875,just right
3,contributor,746,1.891816,just right
4,coordinates,521,1.321228,just right
5,date,171,0.433647,just right
75,dates,150,0.380392,just right
6,description,5386,13.658611,just right
8,extract_timestamp,648,1.643294,just right
108,final_date,998,2.530875,just right
106,height,252,0.639059,just right


In [17]:
final_cols = unique_vals[unique_vals.category == 'just right'].column_name.to_list() + ['locations', 'item_contributor_names', 'item_source_created', 'item_sort_date']

In [18]:
# Identifier columns, followed by the candidate feature columns chosen above.
id_cols = ['split_id', 'file_id','MD5_hash','data_location','image_filename', 'id']
cols = id_cols + final_cols
# merged_df[cols]

### Explore Dates

In [19]:
# NOTE(review): get_dates presumably adds the cleaned_date column parsed below — confirm in scripts.data_utils.
merged_df = get_dates(merged_df, 'private_materials/datajam_data/initial_cleaned_sample_dataset.csv')
# errors='coerce' turns any value that does not match YYYY-MM-DD into NaT instead of raising.
merged_df['cleaned_date'] = pd.to_datetime(merged_df['cleaned_date'], format='%Y-%m-%d',    errors='coerce')

In [20]:
grouped_df = merged_df.groupby(['id','cleaned_date']).size().reset_index(name='counts')

In [21]:
grouped_df.groupby(grouped_df.cleaned_date.dt.year).counts.sum().reset_index().describe()

Unnamed: 0,cleaned_date,counts
count,76.0,76.0
mean,1888.092105,518.855263
std,23.089494,821.94267
min,1830.0,1.0
25%,1869.75,55.5
50%,1888.5,171.0
75%,1907.25,431.75
max,1928.0,3268.0


In [22]:
# Bars: total number of items per date.
date_bars = (
    alt.Chart(grouped_df)
    .mark_bar()
    .encode(
        x=alt.X('cleaned_date:T', title='Date'),
        y=alt.Y('sum(counts):Q', title='Count'),
    )
    .properties(title='Number of Items by Date')
)
# Red rule: mean of cleaned_date over the grouped rows.
rule = alt.Chart(grouped_df).mark_rule(color='red').encode(x='mean(cleaned_date)')

chart = date_bars + rule
chart

In [23]:
# Total items per calendar year, busiest year first.
items_per_year = grouped_df.groupby(grouped_df.cleaned_date.dt.year)['counts'].sum()
year_counts = items_per_year.reset_index().sort_values('counts', ascending=False)

# Busiest single year on each side of 1900 (1900 itself counts as "post").
from_1900 = year_counts[year_counts.cleaned_date >= 1900]
before_1900 = year_counts[year_counts.cleaned_date < 1900]
top_year_1900 = from_1900.sort_values('counts', ascending=False).head(1)
top_year_pre1900 = before_1900.sort_values('counts', ascending=False).head(1)

pre_year, pre_total = top_year_pre1900.cleaned_date.values[0], top_year_pre1900.counts.values[0]
post_year, post_total = top_year_1900.cleaned_date.values[0], top_year_1900.counts.values[0]
print(f"Top year pre 1900: {pre_year} with {pre_total} images")
print(f"Top year post 1900: {post_year} with {post_total} images")

Top year pre 1900: 1860 with 2251 images
Top year post 1900: 1903 with 3268 images


Looks like the average is somewhere around 1890 and can see that there are two clear spikes. These could potentially make interesting classes though it's unclear how much variation there is within and between years. Would be interesting to know though if there are differences...

### Explore Contributors

In [24]:
merged_df = get_contributors(merged_df, 'private_materials/datajam_data/initial_cleaned_sample_dataset.csv')

In [163]:
# Collect every column whose name mentions "contributor", plus the item id.
# NOTE(review): this cell was last executed out of order (In [163]); on a fresh top-to-bottom run
# the `contributor_exists` column does not exist yet here, because it is only created in a later cell.
contributor_cols = [col for col in merged_df.columns.tolist() if 'contributor' in col]
subset_contributors = merged_df[['id'] + contributor_cols]

In [171]:
# Sanity check: rows whose `contributor` is the literal string '[]' while item_contributor_names is present.
# The recorded output is empty, i.e. no such rows were found.
subset_contributors[(subset_contributors.contributor == '[]') & (subset_contributors.item_contributor_names.notna())]

Unnamed: 0,id,contributor,item_contributors,item_contributor_names,cleaned_contributor,contributor_exists


In [25]:
cleaned_contributor_counts = merged_df.groupby(['cleaned_contributor', merged_df.cleaned_date]).size().reset_index(name='counts')

In [26]:
alt.Chart(cleaned_contributor_counts[cleaned_contributor_counts.counts > 20]).mark_bar().encode(
    x=alt.X('counts:Q', title='Count'),
    y=alt.Y('cleaned_contributor:N', title='Contributor', sort='-x'),
    color = alt.Color('year(cleaned_date):N', title='Year', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0))
).properties(
    title='Number of Items by Contributor'
)

In [27]:
# Count the distinct non-missing values of the raw `contributor` column.
# NOTE(review): an earlier cell compares `contributor` to the string '[]', which suggests the
# column holds stringified lists; if so, explode() leaves it unchanged and this counts
# distinct strings rather than individual contributors — confirm.
contributor_rows = merged_df[['id', 'contributor']].explode('contributor')
exploded_df = contributor_rows.dropna(subset=['contributor'])
total_existing_contributors = exploded_df.contributor.nunique()

In [28]:
# Distinct cleaned contributors overall, and among the (contributor, date) groups
# that have more than 20 rows.
# NOTE(review): `counts` is per contributor *and* date, so "more than 20 images" here means
# more than 20 on a single date — confirm this is the intended reading.
n_cleaned_contributors = len(cleaned_contributor_counts.cleaned_contributor.unique().tolist())
frequent_groups = cleaned_contributor_counts[cleaned_contributor_counts.counts > 20]
n_frequent_contributors = len(frequent_groups.cleaned_contributor.unique().tolist())
print(f"Initially tried to collapse some of the contributors into similar entities using fuzzy matching and some rules, so went from {total_existing_contributors} to {n_cleaned_contributors} unique contributors. And of those unique ones, only {n_frequent_contributors} have more than 20 images in the collections")

# Split the images by whether a cleaned contributor is known.
has_cleaned_contributor = merged_df.cleaned_contributor.notna()
total_with_contributors = int(has_cleaned_contributor.sum())
total_without_contributors = len(merged_df) - total_with_contributors
percentage = round(total_with_contributors/len(merged_df) * 100)
print(f"Also notably only {percentage}% or {total_with_contributors} images have known contributors. The rest of {total_without_contributors} images have no listed contributor.")

Initially tried to collapse some of the contributors into similar entities using fuzzy matching and some rules, so went from 746 to 507 unique contributors. And of those unique ones, only 77 have more than 20 images in the collections
Also notably only 59% or 23450 images have known contributors. The rest of 15983 images have no listed contributor.


Could potentially focus in on a few of these contributors to see if there are any interesting patterns. Alternatively could try and predict the contributor based on other features.

In [29]:
# True where a cleaned contributor is recorded, False otherwise.
merged_df['contributor_exists'] = merged_df.cleaned_contributor.notna()

In [30]:
grouped_contributors_counts = merged_df.groupby(['contributor_exists', merged_df.cleaned_date]).size().reset_index(name='counts')

In [31]:
# Stacked bars per year, coloured by whether the item has a contributor.
# NOTE(review): the title repeats that of the earlier per-contributor chart although this one
# shows counts per year split by contributor presence — consider a more specific title.
alt.Chart(grouped_contributors_counts).mark_bar().encode(
    y=alt.Y('sum(counts):Q', title='Count'),
    x=alt.X('year(cleaned_date):T'),
    color = alt.Color('contributor_exists:O', title='Has Contributor?'),
).properties(
    title='Number of Items by Contributor'
)

In [32]:
# One row per date with two count columns (with / without contributor);
# dates missing from one of the two groups get a count of 0.
pivoted_contributor_counts = (
    pd.pivot(grouped_contributors_counts, index='cleaned_date', columns='contributor_exists', values='counts')
    .reset_index()
    .rename(columns={False: 'No Contributor', True: 'Contributor'})
    .fillna(0)
)

In [33]:
# Ascending list of the distinct years present, used to order colour legends.
years = sorted(pivoted_contributor_counts.cleaned_date.dt.year.unique().tolist())

In [34]:
# Scatter of per-date counts: items without a contributor (x) against items with one (y).
base = alt.Chart(pivoted_contributor_counts).mark_point().encode(
    x=alt.X('No Contributor:Q', title='No Contributor'),
    y=alt.Y('Contributor:Q', title='Contributor'),
).properties(
    title='Correlation Between Contributor and No Contributor Over Time'
)

# Points coloured by year; `sort=years` keeps the legend in chronological order.
fig = base.encode(
  color = alt.Color('year(cleaned_date):O', title='Year', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0, columns=4), sort=years)
).mark_circle()

# Regression line of 'Contributor' (y) on 'No Contributor' (x), matching the axes above.
fit = base.transform_regression('No Contributor', 'Contributor').mark_line()

fit+fig

So it seems that the likelihood of having a contributor or not simply tracks the number of cards in the collection. That means either that the historical cataloguing process was indiscriminate, or that contributors may not be a sign of change over time in the collection -- probably both.

### Explore Descriptions

In [35]:
# Parse each description cell from its Python-literal string form into an object.
# Not idempotent: literal_eval expects strings, so re-running this cell after the
# column has already been parsed is expected to fail — reload merged_df first.
merged_df.description = merged_df.description.apply(literal_eval)

In [36]:
merged_df['cleaned_description'] = merged_df.description.str[0]

In [37]:
merged_df['cleaned_description_len'] = merged_df.cleaned_description.str.split().str.len()

In [38]:
grouped_description_len = merged_df.groupby(['cleaned_description_len', 'cleaned_date', 'contributor_exists']).size().reset_index(name='counts')

In [39]:
# Number of items per description length, one row of bars per contributor
# status, coloured by year.
# - x aggregates with sum(counts): each row of grouped_description_len already
#   carries a group size in `counts`, so counting rows would report the number
#   of groups rather than the number of items (the other count charts in this
#   notebook also use sum(counts)).
# - the colour legend is titled 'Year' because it encodes year(cleaned_date).
alt.Chart(grouped_description_len).mark_bar().encode(
    y=alt.Y('cleaned_description_len:Q', title='Length of Description'),
    x=alt.X('sum(counts):Q', title='Count'),
    row='contributor_exists',
    color=alt.Color('year(cleaned_date):N', title='Year', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0, columns=4))
)


In [40]:
# One row per (description length, date) with the item counts for the two
# contributor groups side by side; missing combinations become 0.
pivoted_description = (
    pd.pivot(
        grouped_description_len,
        index=['cleaned_description_len', 'cleaned_date'],
        columns='contributor_exists',
        values='counts',
    )
    .reset_index()
    .rename(columns={False: 'No Contributor', True: 'Contributor'})
    .fillna(0)
)


In [41]:
# Same with/without-contributor scatter as before, but one point per (description length, date).
# NOTE(review): the title is identical to the earlier contributor chart; consider mentioning description length.
base = alt.Chart(pivoted_description).encode(
    x=alt.X('No Contributor:Q', title='No Contributor'),
    y=alt.Y('Contributor:Q', title='Contributor'),
).properties(
    title='Correlation Between Contributor and No Contributor Over Time'
)

# Colour encodes the year, point size encodes the description length in words.
fig = base.encode(
    color = alt.Color('year(cleaned_date):O', title='Year', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0, columns=4), sort=years),
    size = alt.Size('cleaned_description_len:Q', title='Length of Description')
).mark_circle()

fit = base.transform_regression('No Contributor', 'Contributor').mark_line()
fig + fit


Again, there doesn't seem to be a correlation in description length between the two classes. This is probably because the descriptions are not very long to begin with, but it might also just be a historical artefact.

In [42]:
# Label every group as before or from 1880 onwards, based on the year of cleaned_date.
from_1880 = grouped_description_len.cleaned_date.dt.year >= 1880
grouped_description_len['time_class'] = np.where(from_1880, 'post_1880', 'pre_1880')

In [43]:
# Total item count per (time class, description length).
time_description = (
    grouped_description_len
    .groupby(['time_class', 'cleaned_description_len'])['counts']
    .sum()
    .reset_index()
)

In [44]:
# One row per description length with a count column per time class (0 where absent).
pivoted_time_description = (
    pd.pivot(time_description, index='cleaned_description_len', columns='time_class', values='counts')
    .reset_index()
    .fillna(0)
)

In [45]:
# One point per description length: item count before 1880 (x) against from 1880 onwards (y).
base = alt.Chart(pivoted_time_description).encode(
    x=alt.X('pre_1880:Q', title='Pre 1880'),
    y=alt.Y('post_1880:Q', title='Post 1880'),
).properties(
    title='Correlation Between Description Length Over Time'
)

fig = base.encode(
    color = alt.Color('cleaned_description_len:O', title='Length of Description', scale=alt.Scale(scheme='plasma'), legend=alt.Legend(symbolLimit=0, columns=4))
).mark_circle()

# Regression of post_1880 (y) on pre_1880 (x), matching the axes above.
fit = base.transform_regression('pre_1880', 'post_1880').mark_line()
fig + fit

#### Explore Text Features of Description

In [94]:
# Normalise descriptions for vocabulary counting: drop every character that
# is neither a word character nor whitespace, then lowercase.
# - regex=True is explicit: the pattern is a regular expression, and pandas
#   versions whose default is regex=False would treat it as a literal string
#   and strip nothing.
# - the pattern is a raw string so that \w and \s are not parsed as (invalid)
#   Python string escapes.
merged_df['stripped_description'] = (
    merged_df.cleaned_description
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)

In [95]:
description_vocabulary = get_count_vectorizer(merged_df,'stripped_description', 10000)

In [96]:
description_vocabulary.head(10)

Unnamed: 0,word,count
3955,mount,48877
5672,stereograph,40362
4687,print,38860
4465,photograph,32244
1549,cm,31441
2560,format,30795
102,18,29919
1293,card,26107
5670,stereo,8263
4469,photographic,8083


In [97]:
alt.Chart(description_vocabulary.head(50)).mark_bar().encode(
    y=alt.Y('word:O', sort='-x'),
    x='count:Q',
)

In [98]:
# Build one vocabulary table per contributor status, tag each with its status,
# and stack them into a single frame.
rows_with_contributor = merged_df[merged_df.contributor_exists == True]
rows_without_contributor = merged_df[merged_df.contributor_exists == False]

contributor_vocabulary = get_count_vectorizer(
    rows_with_contributor, 'stripped_description', 10000
).assign(contributor_exists=True)
no_contributor_vocabulary = get_count_vectorizer(
    rows_without_contributor, 'stripped_description', 10000
).assign(contributor_exists=False)

vocab_df = pd.concat([contributor_vocabulary, no_contributor_vocabulary])

In [99]:
# One row per word with its count in each group (0 if the word is absent from a group).
pivoted_vocab = (
    pd.pivot(vocab_df, index='word', columns='contributor_exists', values='count')
    .fillna(0)
    .rename(columns={True: 'Contributor', False: 'No Contributor'})
    .reset_index()
)

# Convert counts to each group's share of its own total, then take the absolute
# gap between the two shares as a measure of how differently a word is used.
contributor_share = pivoted_vocab['Contributor'] / pivoted_vocab['Contributor'].sum()
no_contributor_share = pivoted_vocab['No Contributor'] / pivoted_vocab['No Contributor'].sum()
pivoted_vocab['Contributor_scaled'] = contributor_share
pivoted_vocab['No_Contributor_scaled'] = no_contributor_share
pivoted_vocab['diff'] = (contributor_share - no_contributor_share).abs()

In [100]:
# Plot the 100 words with the largest share gap between the two groups,
# positioned by raw count in each group and drawn as the word itself.
alt.Chart(pivoted_vocab.sort_values(by='diff', ascending=False)[0:100]).mark_text().encode(
    x=alt.X('Contributor:Q', title='Contributor'),
    y=alt.Y('No Contributor:Q', title='No Contributor'),
    text='word:O',
).properties(
    title='Frequency of Contributor vs No Contributor Vocabulary'
)

In [101]:
# TF-IDF terms for items dated before 1888, computed separately for items
# with and without a contributor.
dated_before_1888 = merged_df.cleaned_date < '1888-01-01'
early_with_contributor = merged_df[(merged_df.contributor_exists == True) & dated_before_1888]
early_without_contributor = merged_df[(merged_df.contributor_exists == False) & dated_before_1888]

tfidf_contributor_vocabulary = get_tfidf_vectorizer(early_with_contributor, 'stripped_description', 1000)
tfidf_no_contributor_vocabulary = get_tfidf_vectorizer(early_without_contributor, 'stripped_description', 1000)

Getting TFIDF Scores: 100%|██████████| 5715/5715 [00:14<00:00, 394.16it/s]
Getting TFIDF Scores: 100%|██████████| 2230/2230 [00:04<00:00, 557.44it/s]


In [76]:
tfidf_contributor_vocabulary.sort_values(by='score', ascending=False).head(10)

Unnamed: 0,term,score,id
0,turret,32.166347,http://www.loc.gov/item/2015648199/
0,heavy,31.43706,http://www.loc.gov/item/2015647559/
0,wallack,24.12476,http://www.loc.gov/item/2017647791/
0,wallack,24.12476,http://www.loc.gov/item/2017647790/
0,wallack,24.12476,http://www.loc.gov/item/2017647792/
0,wallack,24.12476,http://www.loc.gov/item/2017647793/
0,iron,23.115343,http://www.loc.gov/item/2009631320/
0,caption,23.115343,http://www.loc.gov/item/2004674579/
0,iron,23.115343,http://www.loc.gov/item/2017647773/
0,1864,23.036817,http://www.loc.gov/item/2015647116/


In [77]:
tfidf_no_contributor_vocabulary.sort_values(by='score', ascending=False).head(20)

Unnamed: 0,term,score,id
0,oil,38.057965,http://www.loc.gov/item/2008677187/
0,railroad,24.284591,http://www.loc.gov/item/2016646722/
3,secret service,24.051174,http://www.loc.gov/item/2011646151/
1,service,24.051174,http://www.loc.gov/item/2011646151/
2,secret,24.051174,http://www.loc.gov/item/2011646151/
1,oil co,24.051174,http://www.loc.gov/item/2008677187/
0,operator,24.051174,http://www.loc.gov/item/2011646151/
2,co,21.236031,http://www.loc.gov/item/2008677187/
0,table,20.755337,http://www.loc.gov/item/90710778/
0,charles,19.892291,http://www.loc.gov/item/2015649036/


### Explore Subject Headings

In [47]:
# Working frame for the subject analysis: the three subject-like columns plus the
# date, contributor and description fields derived earlier in the notebook.
subject_df = merged_df[['id', 'item_subject_headings', 'subject', 'item_genre', 'cleaned_date', 'final_date', 'cleaned_contributor', 'cleaned_description', 'stripped_description', 'contributor_exists']]

In [48]:
# Replace missing values with the string '[]' so the list-like columns can be
# parsed with literal_eval afterwards.
# fillna returns a new frame here instead of mutating in place: subject_df was
# sliced from merged_df, and in-place edits on such a slice are exactly what
# pandas' SettingWithCopyWarning (silenced at the top of this notebook) is about.
# NOTE(review): this also fills non-list columns such as cleaned_date with '[]';
# consider restricting the fill to the three list-like columns.
subject_df = subject_df.fillna('[]')
# Some subjects hold the literal string 'nan'; treat those as empty lists too.
subject_df.loc[subject_df.subject == 'nan', 'subject'] = '[]'

In [49]:
# Turn the stringified lists in the three subject-like columns into real Python objects.
for list_col in ('item_subject_headings', 'subject', 'item_genre'):
    subject_df[list_col] = subject_df[list_col].apply(literal_eval)

In [50]:
exploded_subject_df = subject_df.explode('subject')
exploded_heading_df = subject_df.explode('item_subject_headings')
exploded_genre_df = subject_df.explode('item_genre')

In [51]:
# Distinct values in each exploded column: (subjects, subject headings, genres).
n_unique_subjects = len(exploded_subject_df['subject'].unique())
n_unique_headings = len(exploded_heading_df['item_subject_headings'].unique())
n_unique_genres = len(exploded_genre_df['item_genre'].unique())
(n_unique_subjects, n_unique_headings, n_unique_genres)
# Checked and almost everything in subject headings and genre is in subject so will focus on that

(6053, 20803, 421)

In [52]:
# Remove every character that is neither a word character nor whitespace from
# the subject strings.
# regex=True and a raw-string pattern are explicit: with regex=False (the
# default in newer pandas) the pattern would be matched literally and nothing
# would be stripped, and a non-raw '\w' is an invalid Python string escape.
exploded_subject_df['subject'] = exploded_subject_df['subject'].str.replace(r'[^\w\s]', '', regex=True)

In [53]:
f"Of the {len(exploded_subject_df['subject'].unique().tolist())} unique subjects, we have {len(exploded_subject_df[exploded_subject_df.subject.notna()])} occurences with {len(exploded_subject_df[exploded_subject_df.subject.isna()])} missing values"

'Of the 6004 unique subjects, we have 168407 occurences with 2034 missing values'

In [54]:
# Keep only rows that have a subject, and whose subject is longer than two characters.
has_subject = exploded_subject_df.subject.notna()
long_enough = exploded_subject_df.subject.str.len() > 2
cleaned_subject = exploded_subject_df[has_subject & long_enough]

In [55]:
# Slow step: the recorded progress bar shows roughly 7.5 minutes for 5,995 subjects.
# The result is written to unique_subjects.csv in a later cell; consider loading that
# file instead of recomputing when it already exists.
unique_subjects = get_spacy_subjects(cleaned_subject)

Getting NER labels: 100%|██████████| 5995/5995 [07:35<00:00, 13.15it/s]


In [57]:
unique_subjects.to_csv('private_materials/datajam_data/unique_subjects.csv', index=False)

In [59]:
unique_subjects = unique_subjects.rename(columns={'unique_subject': 'subject'})

In [61]:
cleaned_subject = cleaned_subject.merge(unique_subjects, on='subject', how='left')

In [79]:
# Subjects without a spaCy label get an explicit placeholder label.
cleaned_subject['spacy_label'] = cleaned_subject['spacy_label'].fillna('None Identified')

In [106]:
grouped_spacy = cleaned_subject.groupby(['subject', 'spacy_label', 'contributor_exists']).size().reset_index(name='counts')

In [112]:
# Total subject occurrences per spaCy label, split by contributor status,
# reshaped to one row per label with one column per status.
label_totals = (
    grouped_spacy
    .groupby(['spacy_label', 'contributor_exists'])
    .counts.sum()
    .reset_index()
)
pivoted_spacy = (
    pd.pivot(label_totals, index='spacy_label', columns='contributor_exists', values='counts')
    .fillna(0)
    .rename(columns={True: 'Contributor', False: 'No Contributor'})
    .reset_index()
)

In [116]:
# One point per spaCy label: occurrences on items with a contributor (x)
# against occurrences on items without one (y). Unlabelled subjects are excluded.
base = alt.Chart(pivoted_spacy[pivoted_spacy.spacy_label != 'None Identified']).encode(
    x=alt.X('Contributor:Q', title='Contributor'),
    y=alt.Y('No Contributor:Q', title='No Contributor'),
)

fig = base.encode(
    color=alt.Color('spacy_label:N', legend=alt.Legend(title='Spacy Label')),
    text='spacy_label:N'
).mark_circle()

# transform_regression(on, regression): `on` is the independent (x) field and
# `regression` the dependent (y) field. The arguments now follow this chart's
# axes (x = Contributor, y = No Contributor); previously they were copied from
# the earlier charts, whose axes are the other way round, so the fitted line
# was a regression of x on y.
fit = base.transform_regression('Contributor', 'No Contributor').mark_line()

fig + fit


In [117]:
grouped_subject = cleaned_subject.groupby(['subject', 'spacy_label', 'cleaned_date']).size().reset_index(name='counts')

In [124]:
# Subject occurrences per (spaCy label, date).
grouped_spacy_date = grouped_subject.groupby(['spacy_label', 'cleaned_date']).counts.sum().reset_index()


def _spacy_label_bars(data):
    """Bar chart of counts over time, coloured by spaCy label, for the given rows."""
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('cleaned_date:T', title='Date'),
        y=alt.Y('counts:Q', title='Count'),
        color=alt.Color('spacy_label:N', legend=alt.Legend(title='Spacy Label')),
    )


# Top: labelled subjects only. Bottom: everything, including 'None Identified'.
chart = _spacy_label_bars(grouped_spacy_date[grouped_spacy_date.spacy_label != 'None Identified'])
chart1 = _spacy_label_bars(grouped_spacy_date)

alt.vconcat(chart, chart1)

In [148]:
# Occurrences per (subject, date), and total occurrences per subject.
subject_dates = grouped_subject.groupby(['subject', 'cleaned_date']).counts.sum().reset_index()
subject_totals = grouped_subject.groupby(['subject']).counts.sum().reset_index()
subject_totals = subject_totals.sort_values(by='counts', ascending=False)
# Keep the 3rd to 50th most frequent subjects (48 rows), skipping the top two.
# NOTE(review): the two skipped subjects are presumably format terms shared by almost every
# card (e.g. 'stereographs', 'photographic prints') — confirm and name them explicitly.
subject_totals = subject_totals[2:50]


In [149]:
# Items tagged with at least one of the top subjects, and how many items are not.
top_subject_names = subject_totals.subject.tolist()
images_with_top_subjects = cleaned_subject[cleaned_subject.subject.isin(top_subject_names)]
images_ids = images_with_top_subjects.id.unique().tolist()
n_items_without_top_subject = len(merged_df[~merged_df.id.isin(images_ids)].id.unique())
len(images_ids), n_items_without_top_subject

(20852, 18578)

In [151]:
# Split the top-subject rows by contributor status and count the distinct items in each
# part, plus the items that appear in neither.
has_top_subject = cleaned_subject.subject.isin(subject_totals.subject.tolist())
images_with_top_subjects_contributors = cleaned_subject[has_top_subject & (cleaned_subject.contributor_exists == True)]
images_with_top_subjects_no_contributors = cleaned_subject[has_top_subject & (cleaned_subject.contributor_exists == False)]

images_contributors_ids = images_with_top_subjects_contributors.id.unique().tolist()
images_no_contributors_ids = images_with_top_subjects_no_contributors.id.unique().tolist()

in_neither_group = (~merged_df.id.isin(images_contributors_ids)) & (~merged_df.id.isin(images_no_contributors_ids))
len(images_contributors_ids), len(images_no_contributors_ids), len(merged_df[in_neither_group].id.unique())

(16953, 3899, 18578)

In [152]:
# Split the top-subject rows into items dated before 1888 and items dated from
# 1888-01-01 onwards, then count distinct items per period and in neither.
# The second period uses >= so that an item dated exactly 1888-01-01 lands in
# the "post" group; with a strict > on both sides such items fell into neither
# period and inflated the third number.
has_top_subject = cleaned_subject.subject.isin(subject_totals.subject.tolist())
images_with_top_subjects_pre1888 = cleaned_subject[has_top_subject & (cleaned_subject.cleaned_date < '1888-01-01')]
images_with_top_subjects_post1888 = cleaned_subject[has_top_subject & (cleaned_subject.cleaned_date >= '1888-01-01')]
images_pre1888_ids = images_with_top_subjects_pre1888.id.unique().tolist()
images_post1888_ids = images_with_top_subjects_post1888.id.unique().tolist()
len(images_pre1888_ids), len(images_post1888_ids), len(merged_df[(~merged_df.id.isin(images_pre1888_ids)) & (~merged_df.id.isin(images_post1888_ids))].id.unique())

(6164, 14679, 18587)

In [159]:
def compare_subject_lists(subject_list1, subject_list2):
    """Return the subjects that appear in only one of the two lists.

    Args:
        subject_list1: iterable of hashable subjects.
        subject_list2: iterable of hashable subjects.

    Returns:
        A two-element list ``[only_in_first, only_in_second]`` of sets.
    """
    first, second = set(subject_list1), set(subject_list2)
    return [first - second, second - first]

In [160]:
import itertools

# Distinct subjects seen in each of the four groups, in the order:
# pre-1888, post-1888, with contributor, without contributor.
subject_groups = (
    images_with_top_subjects_pre1888,
    images_with_top_subjects_post1888,
    images_with_top_subjects_contributors,
    images_with_top_subjects_no_contributors,
)
list_subjects = [group.subject.unique().tolist() for group in subject_groups]

# Print, for every pair of groups, the subjects exclusive to each side.
combos = itertools.combinations(list_subjects, 2)
for left_subjects, right_subjects in combos:
    print(compare_subject_lists(left_subjects, right_subjects))

[set(), {'sweden', 'louisiana purchase exposition', 'roosevelt theodore'}]
[set(), {'sweden', 'louisiana purchase exposition', 'roosevelt theodore'}]
[{'jamaica'}, {'sweden', 'louisiana purchase exposition', 'roosevelt theodore'}]
[set(), set()]
[{'jamaica'}, set()]
[{'jamaica'}, set()]


In [157]:
set(images_with_top_subjects_pre1888.subject.unique()) - set(images_with_top_subjects_post1888.subject.unique())

set()

In [135]:
# Small-multiple timeline: one 30px-high row of thin bars per top subject, showing counts by date.
alt.Chart(subject_dates[subject_dates.subject.isin(subject_totals.subject.tolist())]).mark_bar(size=2).encode(
    x=alt.X('cleaned_date:T', title='Date'),
    y=alt.Y('counts:Q', title='Count'),
    color=alt.Color('subject:N', legend=alt.Legend(title='Subject')),
    row=alt.Row('subject:N', title='Subject')
).properties(
    height=30,
)


In [173]:
from thefuzz import fuzz

In [178]:
subject_combos = list(itertools.combinations(subject_totals.subject.tolist(), 2))

In [186]:
# Fuzzy-compare every pair of top subjects and keep the pairs whose similarity
# ratio exceeds 70.
# Matches are collected as plain dicts and turned into a single DataFrame at
# the end. Compared with building one single-row DataFrame per match and
# concatenating them, this (a) does not raise when there are no matches
# (pd.concat of an empty list is an error), (b) gives the result a normal
# 0..n-1 index instead of every row being labelled 0, and (c) avoids
# constructing thousands of tiny frames.
matches = []
for first, second in tqdm(subject_combos, desc="Getting matches", total=len(subject_combos)):
    ratio = fuzz.ratio(first, second)
    if ratio > 70:
        matches.append({'official_subject': first, 'alternative_subject': second, 'ratio': ratio})
matches_df = pd.DataFrame(matches, columns=['official_subject', 'alternative_subject', 'ratio'])

Getting matches: 100%|██████████| 1128/1128 [00:00<00:00, 16008.36it/s]


In [187]:
matches_df

Unnamed: 0,official_subject,alternative_subject,ratio
0,new york state,new york,73
0,new york state,new york ny,72
0,new york,new york ny,84
0,saint louis mo,saint louis,88
0,exhibition buildings,exhibitions,71


In [270]:
subject_counts = cleaned_subject.groupby(['id', 'subject']).size().reset_index(name='counts')
subject_counts

Unnamed: 0,id,subject,counts
0,http://www.loc.gov/item/00649686/,african americans,1
1,http://www.loc.gov/item/00649686/,group portraits,1
2,http://www.loc.gov/item/00649686/,horses,1
3,http://www.loc.gov/item/00649686/,north carolina,1
4,http://www.loc.gov/item/00649686/,photographic prints,1
...,...,...,...
167202,http://www.loc.gov/item/99472779/,galata istanbul turkey,1
167203,http://www.loc.gov/item/99472779/,istanbul,1
167204,http://www.loc.gov/item/99472779/,photographic prints,1
167205,http://www.loc.gov/item/99472779/,stereographs,1


In [272]:
pivoted_subjects = pd.pivot(subject_counts, index='id', columns='subject', values='counts').fillna(0)

In [273]:
df_asint = pivoted_subjects.astype(int)

In [274]:
# Subject-by-subject co-occurrence matrix: entry (a, b) is the sum over items of
# count(a) * count(b); the diagonal holds each subject's own totals.
coocc = df_asint.T.dot(df_asint)

In [None]:
# NOTE(review): this cell previously evaluated the bare name `final_test`, which is
# not defined anywhere in the notebook and would raise NameError on "Run All"
# (the cell has no execution count, i.e. it was never run). Define `final_test`
# above or delete this cell.

### Explore Geographic Locations

In [188]:
geographic_cols = [ 'locations', 'item_place','item_location','latlong','location','lonlat','coordinates']

In [190]:
# Working frame for the geographic analysis: item metadata plus every geographic column.
geo_df = merged_df[['id', 'image_error', 'width', 'height', 'cleaned_date', 'final_date',
       'cleaned_contributor', 'contributor_exists', 'cleaned_description',
       'cleaned_description_len', 'stripped_description', 'subject'] + geographic_cols]

In [195]:
# Rows where every geographic column is missing. `geographic_cols` lists the
# same seven columns that were previously tested one by one, so adding a column
# to that list now automatically extends this check.
has_no_geo_values = geo_df[geographic_cols].isna().all(axis=1)
no_geo_info = geo_df[has_no_geo_values]
# Everything else, selected by item id. The explicit copy makes subset_geo_df
# an independent frame: later cells assign columns on it, which on a plain
# slice of geo_df triggers pandas' SettingWithCopy behaviour.
subset_geo_df = geo_df[~geo_df.id.isin(no_geo_info.id)].copy()

In [241]:
print(f"Total cards with geographic information: {len(subset_geo_df)} versus {len(no_geo_info)} without")

Total cards with geographic information: 27576 versus 11857 without


In [197]:
# Parse the stringified `location` lists; missing values are treated as empty lists.
subset_geo_df['location'] = subset_geo_df['location'].fillna('[]').apply(literal_eval)

In [198]:
exploded_locations = subset_geo_df.explode('location')

In [199]:
print(f"Identified {len(exploded_locations.location.unique())} unique locations")

Identified 1936 unique locations


In [213]:
# Parse the stringified `item_location` lists; missing values are treated as empty lists.
exploded_locations['item_location'] = exploded_locations['item_location'].fillna('[]').apply(literal_eval)

In [217]:
exploded_locations['item_location'] = exploded_locations['item_location'].str[0]

In [219]:
# Where `location` is missing, fall back to the item's first `item_location` entry.
# The right-hand side is aligned to the target rows by index label.
exploded_locations.loc[exploded_locations.location.isna(), 'location'] = exploded_locations.loc[exploded_locations.location.isna(), 'item_location']

In [220]:
unique_locations = exploded_locations.location.unique().tolist()

In [223]:
# Fuzzy-compare every pair of unique locations (about 1.9 million pairs in the
# recorded run) and keep the pairs whose similarity ratio exceeds 70.
# NOTE: `subject_combos` and `matches_df` are reused from the subject section
# and are overwritten here.
subject_combos = list(itertools.combinations(unique_locations, 2))
# Matches are collected as plain dicts and converted once at the end: building
# a one-row DataFrame per match is slow at this scale, gives every row the
# index label 0, and makes pd.concat raise if no pair matches.
matches = []
for first, second in tqdm(subject_combos, desc="Getting matches", total=len(subject_combos)):
    # str(): unique_locations can contain non-string values (e.g. missing
    # locations), which fuzz.ratio cannot compare directly.
    first_name, second_name = str(first), str(second)
    ratio = fuzz.ratio(first_name, second_name)
    if ratio > 70:
        matches.append({'official_location': first_name, 'alternative_location': second_name, 'ratio': ratio})
matches_df = pd.DataFrame(matches, columns=['official_location', 'alternative_location', 'ratio'])

Getting matches: 100%|██████████| 1876953/1876953 [01:11<00:00, 26212.76it/s]


In [224]:
matches_df

Unnamed: 0,official_location,alternative_location,ratio
0,south carolina,north carolina,86
0,charleston,galveston,74
0,charleston,hazleton,78
0,charleston,charleston harbor,74
0,charleston,cresson,71
...,...,...,...
0,highland falls,glens falls,72
0,ṣaqqārah,saqqārah,88
0,gatun dam,gatun lake,74
0,adana ili,aydın ili,78


In [230]:
print(f"Of those only {len(exploded_locations[exploded_locations.coordinates.notna()].location.unique())} unique locations have identified coordinates")

Of those only 681 unique locations have identified coordinates


In [226]:
# True where the row has coordinates, False otherwise.
exploded_locations['coordinates_exist'] = exploded_locations.coordinates.notna()

In [234]:
# Row count per (location, has-coordinates) pair, most frequent first.
grouped_locations = (
    exploded_locations
    .groupby(['location', 'coordinates_exist'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [236]:
# The 30 most frequent (location, has-coordinates) pairs, coloured by whether coordinates exist.
alt.Chart(grouped_locations[0:30]).mark_bar().encode(
    x=alt.X('count:Q', title='Count'),
    y=alt.Y('location:N', title='Location', sort='-x'),
    color=alt.Color('coordinates_exist:N', legend=alt.Legend(title='Coordinates Exist')),
).properties(
    height=300,
)

In [239]:
# Rows that have coordinates, split by contributor status.
# NOTE(review): exploded_locations has one row per (item, location), so the lengths of these
# frames count rows, not distinct cards — confirm before reporting them as card totals.
with_coordinates = exploded_locations.coordinates_exist == True
subset_contributors_geo = exploded_locations[with_coordinates & (exploded_locations.contributor_exists == True)]
subset_no_contributors_geo = exploded_locations[with_coordinates & (exploded_locations.contributor_exists == False)]

In [243]:
print(f"Total cards with geographic coordinates and contributor: {len(subset_contributors_geo)} versus {len(subset_no_contributors_geo)} without")

Total cards with geographic coordinates and contributor: 6661 versus 1102 without


In [246]:
# Items that have coordinates. The explicit copy makes this an independent
# frame, because the following cells add and overwrite columns on it; doing
# that on a plain slice of subset_geo_df triggers pandas' SettingWithCopy
# behaviour (the warning is silenced at the top of this notebook).
coords_geo_df = subset_geo_df[subset_geo_df.coordinates.notna()].copy()

In [247]:
# Parse the stringified coordinate pairs into Python objects.
# NOTE(review): literal_eval expects strings; this assumes latlong/lonlat are never missing
# on rows where `coordinates` is present — confirm.
coords_geo_df['latlong'] = coords_geo_df.latlong.apply(literal_eval)
coords_geo_df['lonlat'] = coords_geo_df.lonlat.apply(literal_eval)


In [248]:
# Split the parsed latlong pair into separate columns: element 0 is used as
# latitude and element 1 as longitude; empty sequences yield None.
coords_geo_df['latitude'] = coords_geo_df.latlong.apply(lambda x: x[0] if len(x) > 0 else None)
# Longitude reads the second element, so the guard must require at least two
# elements; with the previous `len(x) > 0` guard a one-element value raised
# IndexError.
coords_geo_df['longitude'] = coords_geo_df.latlong.apply(lambda x: x[1] if len(x) > 1 else None)

In [249]:
coords_geo_df['datetime'] = pd.to_datetime(coords_geo_df.cleaned_date, utc=True)

In [251]:
# Export the coordinate rows for inspection.
# NOTE(review): written to the current working directory, unlike the other outputs, which go
# under private_materials/datajam_data — confirm the intended location.
coords_geo_df.to_csv('test_geo.csv', index=False)

In [263]:
# Point geometry per row from the longitude (x) and latitude (y) columns.
# NOTE(review): no CRS is set; the values are presumably WGS84 degrees (EPSG:4326) — confirm.
gdf = gpd.GeoDataFrame(
    coords_geo_df, geometry=gpd.points_from_xy(coords_geo_df.longitude, coords_geo_df.latitude))

In [264]:

m = leafmap.Map(center=[40, -100], zoom=2, height=600, widescreen=False)


In [265]:
# Add the cards as a map layer, styled from config.json.
# NOTE(review): config.json is only written by the save_config cell further down, so on a
# fresh checkout this file may not exist yet when this cell runs — confirm it is versioned.
m.add_gdf(gdf, layer_name="Cards", config="config.json")

In [267]:
# m

In [262]:
# Persist the current map styling to config.json, which the add_gdf cell reads.
# NOTE(review): the recorded execution count (In [262]) is lower than that of the cells that
# create `gdf` and `m` above, i.e. this was run out of order — this step relies on prior kernel state.
m.save_config("config.json")

### Final Feature Exploration

subset_contributors_geo