# Reference line for the trend/bootstrap

In [21]:
import random
import statistics
import pandas as pd
import numpy as np
import gensim
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

In [5]:
# define a function for bootstrap
def random_bias_year(old, young, length, simnum, path_output, filename,
                     *, models=None, years=range(1950, 2022)):
    """Build a per-year random reference distribution for the old-vs-young bias.

    For every year, random word lists of size ``length`` are drawn from that
    year's model vocabulary.  For each random list the bias is computed as
    (mean similarity between the list and the ``old`` words) minus (mean
    similarity between the list and the ``young`` words).  Target words that
    are not in the year's vocabulary are ignored.

    Parameters:
    old: the word list for older adults
    young: the word list for young people
    length: the number of words in the list of domains (e.g., moral
        foundation, ageist attitude, etc.); each random list has this size
    simnum: controls the number of random lists; note that ``simnum + 1``
        lists are drawn per year (kept as in the original implementation)
    path_output: string prepended verbatim to every output file name, so it
        must already end with a path separator if it names a directory
    filename: base name (without extension) of the output CSV files
    models: optional mapping ``year -> model`` where each model exposes
        ``.wv`` with ``index_to_key``, ``similarity`` and ``in`` support.
        When omitted, the module-level variables ``model<year>`` are used.
    years: iterable of years to process (default 1950..2021)

    Returns:
    A DataFrame with one row per year and the columns ``year``,
    ``mean_similarity`` (the mean of the simulated biases),
    ``95percent_lower`` and ``95percent_upper`` (2.5% / 97.5% quantiles).

    Side effects:
    Writes ``{path_output}{filename}_{year}.csv`` (all simulated biases of a
    year) for every year and ``{path_output}{filename}.csv`` (the summary).

    Raises:
    KeyError: if a model for one of the years cannot be found.
    ValueError: if ``length`` exceeds a year's vocabulary size
        (raised by ``random.sample``).
    statistics.StatisticsError: if none of the ``old`` or none of the
        ``young`` words is in a year's vocabulary, or if ``length`` is 0.
    """
    years = list(years)
    if models is None:
        # Default behaviour: the models live in module-level variables named
        # model1950, model1951, ...  Resolve them all up front so a missing
        # model fails before any computation or file output happens.
        namespace = globals()
        models = {year: namespace[f'model{year}'] for year in years}

    yearlist = []
    meanlist = []
    lower95list = []
    upper95list = []

    n_draws = simnum + 1  # the original code simulates simnum + 1 lists

    for year in years:
        wv = models[year].wv
        vocabulary_year = wv.index_to_key

        # The in-vocabulary filter does not depend on the random list, so do
        # it once per year instead of once per reference word.
        old_present = [word for word in old if word in wv]
        young_present = [word for word in young if word in wv]
        for label, present in (('old', old_present), ('young', young_present)):
            if not present:
                raise statistics.StatisticsError(
                    f"none of the '{label}' words is in the vocabulary "
                    f"of the {year} model"
                )

        biases = []
        for _ in range(n_draws):
            ref_list = random.sample(vocabulary_year, k=length)
            sims_old = [wv.similarity(x, y)
                        for x in ref_list for y in old_present]
            sims_young = [wv.similarity(x, z)
                          for x in ref_list for z in young_present]
            biases.append(statistics.mean(sims_old)
                          - statistics.mean(sims_young))

        year_result = pd.DataFrame({'bias': biases})

        yearlist.append(year)
        meanlist.append(year_result['bias'].mean())
        lower95list.append(year_result['bias'].quantile(0.025))
        upper95list.append(year_result['bias'].quantile(0.975))

        # Keep the raw simulated biases of this year for later inspection.
        year_result.to_csv(f"{path_output}{filename}_{year}.csv", index=False)

    result = pd.DataFrame({'year': yearlist, 'mean_similarity': meanlist,
                           '95percent_lower': lower95list,
                           '95percent_upper': upper95list})
    result.to_csv(f"{path_output}{filename}.csv", index=False)
    return result