In [1]:
import pandas as pd
import numpy as np
import ollama
from names_dataset import NameDataset
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

In [4]:
# Sanity check: length of the embedding vector returned for one sample prompt.
len(
    ollama.embeddings(
        model='nomic-embed-text',
        prompt=f"Name: Peter",
    )['embedding']
)

768

In [2]:
# Instantiate the names dataset. Its first_names attribute is used in the
# following cells as a mapping keyed by first name, whose values expose
# per-name 'gender' and 'country' mappings.
nd = NameDataset()
names = nd.first_names

In [3]:
# One row per first name in the dataset. The frame has no columns; its index
# is what the gender, country and embedding frames are built on.
df = pd.DataFrame(index=names.keys())

# english characters only (ASCII letters and apostrophes)
# fullmatch requires the entire string to match. The earlier '^...$' pattern
# passed to str.contains also accepted a name ending in a newline, because '$'
# matches just before a trailing '\n'.
df = df[df.index.str.fullmatch(r"[a-zA-Z']+")].copy()

In [4]:
# Optional: uncomment the next line to iterate on a random 1000-name subset.
# df = df.sample(1000)

In [5]:
# genders
# Build one record per name and construct the frame in a single call instead
# of writing cell by cell with .loc, which is very slow over the whole index.
# Each record starts at 0.0 for both expected columns and is then overlaid
# with whatever the dataset reports for that name.
gender_records = [
    {
        'gender_F': 0.0,
        'gender_M': 0.0,
        **{f"gender_{g}": v for g, v in names[name]['gender'].items()},
    }
    for name in tqdm(df.index)
]
df_gender = pd.DataFrame(gender_records, index=df.index)

# An empty name list yields a frame without columns; keep the schema stable.
for col in ('gender_F', 'gender_M'):
    if col not in df_gender.columns:
        df_gender[col] = 0.0

# Keep both columns as floats, as they were when pre-filled with 0.0.
df_gender = df_gender.astype({'gender_F': float, 'gender_M': float})

  0%|          | 0/416880 [00:00<?, ?it/s]

In [6]:
# full list of countries
# Sorted so the column order is identical on every run; iterating a set of
# strings directly gives an order that can change between kernel sessions.
countries = sorted({c for name in df.index for c in names[name]['country'].keys()})

# assign values from dataset
# Fill a dense float array by position and wrap it in a DataFrame once. This
# avoids one .loc assignment per (name, country) pair; entries the dataset
# does not mention stay at 0.0.
country_pos = {c: j for j, c in enumerate(countries)}
country_values = np.zeros((len(df), len(countries)))
for i, name in enumerate(tqdm(df.index)):
    for c, v in names[name]['country'].items():
        country_values[i, country_pos[c]] = v
df_countries = pd.DataFrame(country_values, index=df.index, columns=countries)

  0%|          | 0/416880 [00:00<?, ?it/s]

In [7]:
def embed(name, model='nomic-embed-text'):
    """Return the embedding of the prompt ``"Name: <name>"`` as a NumPy array.

    Args:
        name: Value interpolated into the prompt sent to the model.
        model: Name of the ollama embedding model to query.

    Returns:
        np.ndarray built from the 'embedding' field of the ollama response.
    """
    response = ollama.embeddings(model=model, prompt=f"Name: {name}")
    return np.array(response['embedding'])


# Embed every name through thread_map (which also shows a progress bar), then
# stack the per-name vectors into a single 2-D array with one row per name.
embeddings = np.stack(thread_map(embed, df.index))
df_emb = pd.DataFrame(
    embeddings,
    columns=[f"dim_{i}" for i in range(embeddings.shape[1])],
    index=df.index,
)

  0%|          | 0/416880 [00:00<?, ?it/s]

In [8]:
# Place the gender, country and embedding features side by side. df itself
# carries no columns, so it only contributes its index to the result.
df_combined = pd.concat(
    [df, df_gender, df_countries, df_emb],
    axis="columns",
)

In [9]:
# Persist the combined feature table as a pickle in the working directory.
pd.to_pickle(df_combined, "baby_names_full.pkl")