This notebook collects and consolidates multiple wordlists and named-entity sources into two pickled dictionaries:
- `artist_wordlist.pkl` — words and names related to the art world (artists, institutions).
- `ner_wordlist.pkl` — words extracted from NER-tagged corpora.

Overview:
1. Load multiple external sources (CSV, JSON, text files, HF datasets).
2. Normalize entries (lowercasing, splitting multiword names where useful).
3. Extract entities from NER-tagged datasets and other curated sources.
4. Save consolidated sets to `Dictionary_data/` for use in downstream pipelines.

Notes and usage:
- Data sources are loaded from `DICTIONARY_DATA_DIR` — change that variable to point to your data folder.
- The notebook favors conservative, rule-based extraction; inspect intermediate prints if any source contains unexpected formatting.
- When adding new sources, follow existing patterns: load, normalize to lowercase with `to_lower()`, and `update()` the appropriate sets.


In [None]:
import pandas as pd
import pickle

In [None]:
DICTIONARY_DATA_DIR = 'Dictionary_data/'
NER_DIR = DICTIONARY_DATA_DIR + 'ner_dictionary/'
ARTWORLD_WORDLIST = set()
NER_WORDLIST = set()

In [None]:
def load_csv(file_name, pd_args=None):
    """
    Attempt to read a CSV using several common encodings.
    Returns a pandas DataFrame on success or raises ValueError.
    pd_args can contain extra arguments to pass to pd.read_csv (e.g., on_bad_lines).
    """
    encodings_to_try = ['utf-8', 'Latin-1', 'ISO-8859-1']
    
    for encoding in encodings_to_try:
        try:
            df = pd.read_csv(DICTIONARY_DATA_DIR + file_name, encoding=encoding, **(pd_args or {}))
            print("File read successfully with encoding:", encoding)
            print(df.head()) # quick sanity-check to see columns and first rows
            return df
        except UnicodeDecodeError:
            # try the next encoding
            pass
    raise ValueError("Failed to read the file with the provided encodings.")

In [1]:
def to_lower(word):
    """
    Convert value to string and lowercase it.
    Simpler and more robust than character-by-character handling.
    Accepts None and other non-str inputs without error.
    """
    result = ""
    for c in str(word):
        if c.isupper():
            result += c.lower()
        else:
            result += c
    return result

In [None]:
# https://www.kaggle.com/datasets/rafsunahmad/popular-quotes-author-classifier?resource=download
file_name = 'Autor_detection.csv'
df_author = load_csv(file_name)
writers = set(df_author.loc[:]['Writter name'].apply(lambda x: to_lower(x)))
print(len(writers))
ARTWORLD_WORDLIST.update(writers)

File read successfully with encoding: utf-8
                                              Quotes      Writter name
0  Reading Kafka I sense that the elicited questi...  Alberto Manguel,
1  All animals are equal but some animals are mor...     George Orwell
2  I am old Gandalf I dont look it but I am begin...    J.R.R. Tolkien
3  How can we live without our lives How will we ...    John Steinbeck
4  I was only foolin George I dont want no ketchu...    John Steinbeck
83


In [None]:
# https://www.kaggle.com/datasets/joebeachcapital/art-history
file_name = 'artists.csv'
df_artists = load_csv(file_name)
artists = set(df_artists.loc[:]['artist_name'].apply(lambda x: to_lower(x)))
print(len(artists))
ARTWORLD_WORDLIST.update(artists)

File read successfully with encoding: utf-8
     artist_name  edition_number  year artist_nationality  \
0  Aaron Douglas             9.0  1991           American   
1  Aaron Douglas            10.0  1996           American   
2  Aaron Douglas            11.0  2001           American   
3  Aaron Douglas            12.0  2005           American   
4  Aaron Douglas            13.0  2009           American   

  artist_nationality_other artist_gender                artist_race  \
0                 American          Male  Black or African American   
1                 American          Male  Black or African American   
2                 American          Male  Black or African American   
3                 American          Male  Black or African American   
4                 American          Male  Black or African American   

                artist_ethnicity     book  space_ratio_per_page_total  \
0  Not Hispanic or Latino origin  Gardner                    0.353366   
1  Not Hispanic 

In [None]:
# https://www.kaggle.com/datasets/rishidamarla/art-and-artists-from-the-museum-of-modern-art?select=Artworks.csv
file_name = 'Artworks.csv'
df_artworks = load_csv(file_name)
artworks = set(df_artworks.loc[:]['Artist'].apply(lambda x: to_lower(x)))
print(len(artworks))
ARTWORLD_WORDLIST.update(artworks)

File read successfully with encoding: utf-8
                                               Title  \
0  Ferdinandsbrücke Project, Vienna, Austria, Ele...   
1  City of Music, National Superior Conservatory ...   
2  Villa near Vienna Project, Outside Vienna, Aus...   
3  The Manhattan Transcripts Project, New York, N...   
4  Villa, project, outside Vienna, Austria, Exter...   

                     Artist ConstituentID  \
0               Otto Wagner          6210   
1  Christian de Portzamparc          7470   
2                Emil Hoppe          7605   
3           Bernard Tschumi          7056   
4                Emil Hoppe          7605   

                                   ArtistBio Nationality BeginDate EndDate  \
0                      (Austrian, 1841–1918)  (Austrian)    (1841)  (1918)   
1                        (French, born 1944)    (French)    (1944)     (0)   
2                      (Austrian, 1876–1957)  (Austrian)    (1876)  (1957)   
3  (French and Swiss, born Switzerla

In [None]:
# https://www.kaggle.com/datasets/mfrancis23/museum-of-modern-art-collection?select=Artists.csv
file_name = 'Artists.csv'
df_moma_artists = load_csv(file_name)
moma_artists = set(df_moma_artists.loc[:]['DisplayName'].apply(lambda x: to_lower(x)))
print(len(moma_artists))
ARTWORLD_WORDLIST.update(moma_artists)

File read successfully with encoding: utf-8
   ConstituentID      DisplayName            ArtistBio Nationality Gender  \
0              1   Robert Arneson  American, 1930–1992    American   Male   
1              2   Doroteo Arnaiz   Spanish, born 1936     Spanish   Male   
2              3      Bill Arnold  American, born 1941    American   Male   
3              4  Charles Arnoldi  American, born 1946    American   Male   
4              5      Per Arnoldi    Danish, born 1941      Danish   Male   

   BeginDate  EndDate  Wiki QID         ULAN  
0       1930     1992       NaN          NaN  
1       1936        0       NaN          NaN  
2       1941        0       NaN          NaN  
3       1946        0  Q1063584  500027998.0  
4       1941        0       NaN          NaN  
15226


In [191]:
ARTLIST = list(ARTWORLD_WORDLIST)
NEW_ARTLIST = ARTLIST.copy()
for row in ARTLIST:
    NEW_ARTLIST.extend(row.split())

ARTWORLD_WORDLIST = set(NEW_ARTLIST)

In [None]:
# https://www.kaggle.com/datasets/thedevastator/ner-tagged-text-dataset

def extract_entities_from_row(row):
    """
    Parse token and ner_tags columns assumed to contain Python list literal strings.
    Returns lowercased entity words where the corresponding tag != '0'.
    """
    # Assuming 'tokens' and 'ner_tags' columns contain Python lists
    words = [word.strip('[,],\'') for word in row['tokens'].split()]
    tags = [tag.strip('[,]') for tag in row['ner_tags'].split() if len(tag.strip('[,]')) > 0]

    if len(words) != len(tags):
        # Handle rows with length mismatch if necessary
        print(1)
        return []

    # Use a generator for efficient extraction: only keep the word if the tag is not 0
    entities = [to_lower(word) for word, tag in zip(words, tags) if tag != '0']
    return entities


filenames = ['test', 'train', 'validation']
ner_tagged_text = set()

for filename in filenames:
    df = pd.read_csv(NER_DIR + 'archive/' + filename + '.csv')
    
    # --- Execution Steps ---
    # 1. Apply the extraction function row-wise to get a list of entities for each row.
    df['entities'] = df.apply(extract_entities_from_row, axis=1)
    # 2. Flatten the resulting list of lists (the 'entities' column) into a single, long series.
    # 3. Convert the series to a set to get all unique extracted words.
    ner_tagged_text.update(set(df['entities'].explode().dropna()))


In [None]:
# https://www.kaggle.com/datasets/abhinavwalia95/entity-annotated-corpus?resource=download

filename = 'ner.csv'
found_entities = list()

df = pd.read_csv(NER_DIR + filename, encoding='cp1252', on_bad_lines='skip')
words = df['word']
tags  = df['tag']

for word, tag in zip(words, tags):
    if tag in ['B-geo', 'B-org', 'I-per']:
        found_entities.append(to_lower(word))

found_entities = set(found_entities)
NER_WORDLIST.update(found_entities)

In [None]:
# https://huggingface.co/datasets/Babelscape/wikineural
from datasets import load_dataset

ds = load_dataset("Babelscape/wikineural")

multi_lang_set = set()
# iterate over all splits and collect tokens with entity tags
ds.get('test_en')['ner_tags']
for dataset in ds.values():
    words = ds.get('test_en')['tokens']
    tags  = ds.get('test_en')['ner_tags']
    for row_w, row_t in zip(words, tags):
        for w, t in zip(row_w, row_t):
            if 0 < t and t < 7:
                multi_lang_set.add(to_lower(w))
NER_WORDLIST.update(multi_lang_set)

In [None]:
with open(DICTIONARY_DATA_DIR + 'artist_wordlist.pkl', 'wb') as f:
    pickle.dump(ARTWORLD_WORDLIST, f)
    
with open(DICTIONARY_DATA_DIR + 'ner_wordlist.pkl', 'wb') as f:
    pickle.dump(NER_WORDLIST, f)

In [None]:
print(f"Art related words collected: {len(ARTWORLD_WORDLIST)}")
print(f"NER entity words collected: {len(NER_WORDLIST)}")

Total unique words collected: 502183
Art related words collected: 33188
NER entity words collected: 14874
