# Evaluate Model Confidence

Much of this code has been copied and adapted from [Laura Nelson's "measuring_intersectionality" GitHub](https://github.com/lknelson/measuring_intersectionality).

In [1]:
import os, string, warnings, glob, gensim, re, itertools, math
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from random import choices
import scipy.stats as st

import seaborn as sns
import matplotlib.pyplot as plt


# Network libraries.
import networkx as nx
from networkx.algorithms import community
from networkx.readwrite import json_graph


# Ignore warnings.
warnings.simplefilter("ignore")

# Declare directory.
abs_dir = "/Users/williamquinn/Documents/DH/Python/MJP/"



## Read-in Syntactic Dependencies

In [2]:
%%time

df = pd.read_csv(abs_dir + 'Marsden_Magazines/DH2022_LanguageEvolution/Data_Outputs/syntactic-dep-vectors.csv', 
                 sep = '\t')

# Convert target field from strings to list object.
df['target'] = df['target'].astype('str')
df['target'] = df['target'].apply(lambda x: re.sub(r'[\[\]{}""\'\']', '', x))
df['target'] = df['target'].str.split(', ').astype(object)

print (f'Shape of Dataframe: {df.shape}')
df.head(4)

Shape of Dataframe: (4549, 7)
CPU times: user 41.5 ms, sys: 5.74 ms, total: 47.2 ms
Wall time: 52.2 ms


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_date,meta_year,source,target
0,4,The Little Review,articles,1914-12-01,1914,man,[professor]
1,4,The Little Review,articles,1914-12-01,1914,woman,[age]
2,6,The Little Review,articles,1914-12-01,1914,man,[poem]
3,6,The Little Review,articles,1914-12-01,1914,woman,[ugli]


## Get Semantic Space (top-n similarities) of Keywords

In [3]:
%%time

m = gensim.models.KeyedVectors.load_word2vec_format(abs_dir + '/Word-Doc_Vectors/Models/mjp_w2v.txt')

keyword_dict = {}

for k in df['source'].unique():
    keyword_dict[k] = [w[0] for w in m.most_similar([k], topn=10)]
    
# #     Experiment with removing keywords from other keywords' semantic fields.
#     if ( (k == 'man') or (k == 'men') ):
#         print (k, 'man')
#         keyword_dict[k] = m.most_similar(positive = [k], negative = ['woman'], topn=10)
#     elif k == 'woman':
#         print (k, 'woman')
#         keyword_dict[k] = m.most_similar(positive = [k], negative = ['man', 'men'], topn=10)
#     else:
#         keyword_dict[k] = m.most_similar([k], topn=10)
    
keyword_dict

CPU times: user 1.69 s, sys: 67.5 ms, total: 1.76 s
Wall time: 1.81 s


{'man': ['he',
  'men',
  'a',
  'white',
  'isth',
  'himself',
  'who',
  'selfdefens',
  'woman',
  'excolor'],
 'woman': ['men',
  'suffrag',
  'prostitut',
  'girl',
  'childless',
  'husband',
  'spinster',
  'marriag',
  'fredericka',
  'man'],
 'ego': ['volit',
  'archist',
  'cognit',
  'extern',
  'ie',
  'archism',
  'egoism',
  'conceptu',
  'percipi',
  'perciper'],
 'egoism': ['philosophi',
  'altruism',
  'yangchu',
  'ego',
  'egoist',
  'archism',
  'agnostic',
  'thinginitself',
  'intract',
  'philosoph'],
 'egoist': ['egoism',
  'junejuli',
  'ego',
  'reissu',
  'poetryth',
  'ie',
  'archist',
  'definit',
  'implic',
  'imaginari'],
 'freewoman': ['antifeminist',
  'bondwoman',
  'illegitimaci',
  'eugenist',
  'birnstingl',
  'selwyn',
  'feminist',
  'uranian',
  'educationist',
  'madamit'],
 'feminist': ['femin',
  'superwoman',
  'humanist',
  'suffragist',
  'suffrag',
  'antisuffrag',
  'milit',
  'insurrectionari',
  'malthusian',
  'propagandist'],
 'spi

#### Remove dependencies that are not present in w2v model

A dependency might not appear in the model if it occurs too infrequently. A word must appear 10 times to be included per model parameters.

## Discover Confidence Intervals


Steps:
1. Loop through every model
2. Add vector
    - A new vector is the dependencies of a keyword in each document.
    - The vector is the average of all similarities between keyword (and it's top-10 closest words) and each dependency.
4. Return averaged similarities to dataframe.
    - There should be 41 results for each new vector.
5. Working from dataframe, discover confidence interval

What is CI here?
- CI is the average/mean cosine similarity of a keyword cluster to the new vector (dependency) in every model.

In [4]:
%%time

def add_ave_vec(mymodel, keyword_list, target_list, new_word):
    mylist = []
    
#     Find similarity of every paired combination from keyword and target lists.
#     The similarities an array of values.
    for e in itertools.product(keyword_list, target_list):
        mylist.append( ( mymodel[e[0]] + mymodel[e[1]] ) )
    
#     Average the results (the array), which is a new vector.
    new_vec = ((np.add.reduce(mylist)) / len(mylist))
    
#     Add the average similarities to the model.
    mymodel.add_vector(new_word, new_vec)
    return mymodel

CPU times: user 9 µs, sys: 7 µs, total: 16 µs
Wall time: 38.1 µs


### Find similarities of keywords & targets

In [5]:
%%time

sim_df = []

# Iterate through files & open model.
for file in glob.glob(abs_dir + '/Word-Doc_Vectors/Models/*.txt'):
    m = gensim.models.KeyedVectors.load_word2vec_format(file)
    
    reFile = str(re.search(r'.*/(.*.txt)', str(file)).group(1))

#     Iterate through dataframe.
    for idx, r in df.iterrows():
        
#         Return only keywords that appear in model, ignore the rest.
        confirmed_keywords = []
        for w in keyword_dict[ r['source'] ]:
            try:
                if m[w].any():
                    confirmed_keywords.append(w)
            except KeyError:
                pass

#         Return only target words that appear in model, ignore the rest.
        confirmed_targets = []
        for w in r['target']:
            try:
                if m[w].any():
                    confirmed_targets.append(w)
            except KeyError:
                pass
        
        if confirmed_targets:
#             The new vector will be named with metadata.
#             The reason for this to rejoin with metadata and query by document (rather than guessing dependencies).
            new_vector_name = str(r['meta_mjp_id']) + '_' + r['source']

#             Add new vector to model.
            m = add_ave_vec(m,
                            confirmed_keywords,
                            confirmed_targets, 
                            new_vector_name)

#             With new vector added to model, find similarity of new vector to it's keyword dependency.
            similarity = m.similarity(r['source'], new_vector_name)

#             Store results information as dictionary.
            results = {'vector': new_vector_name, 
                       'keyword': r['source'],
                       'similarity': similarity,
                       'model': reFile}

#             Append results to sim_df.
            sim_df.append(results)
    
        else:
            pass
    
#     Save model with added vectors.
    m.save_word2vec_format(file)
    
sim_df = pd.DataFrame(sim_df, columns = ['vector', 'keyword', 'similarity', 'model'])
print (f'Shape of sim_df: {sim_df.shape}')
sim_df.head(4)

Shape of sim_df: (177920, 4)
CPU times: user 11min 12s, sys: 2min 35s, total: 13min 48s
Wall time: 13min 58s


Unnamed: 0,vector,keyword,similarity,model
0,4_man,man,0.622661,mjp_w2v.txt
1,4_woman,woman,0.754359,mjp_w2v.txt
2,6_man,man,0.56016,mjp_w2v.txt
3,6_woman,woman,0.639118,mjp_w2v.txt


## Calculate Confidence Intervals

In [6]:
%%time

# Get general statistics of similarities for vectors that appear >= 30 times in model.
stats_df = sim_df \
    .groupby(['vector', 'keyword'], as_index = False)['similarity'] \
    .agg(['mean', 'count', 'std']) \
    .query('count >= 30') \
    .reset_index()

# Find and append confidence intervals to dataframe.
ci95_hi = []
ci95_lo = []

for i in stats_df.index:
    vector, keyword, m, c, s = stats_df.loc[i]
    ci95_hi.append( m + 1.96 * s / math.sqrt(c) )
    ci95_lo.append( m - 1.96 * s / math.sqrt(c) )

stats_df['ci_hi'] = ci95_hi
stats_df['ci_lo'] = ci95_lo

stats_df

CPU times: user 885 ms, sys: 36.5 ms, total: 922 ms
Wall time: 964 ms


Unnamed: 0,vector,keyword,mean,count,std,ci_hi,ci_lo
0,1001_man,man,0.700507,41,0.019696,0.706536,0.694478
1,1001_woman,woman,0.790924,41,0.013788,0.795144,0.786703
2,1003_man,man,0.753138,41,0.016512,0.758193,0.748084
3,1005_man,man,0.561273,41,0.024970,0.568916,0.553630
4,1006_woman,woman,0.771945,41,0.012014,0.775622,0.768267
...,...,...,...,...,...,...,...
4326,992_woman,woman,0.752123,41,0.013692,0.756314,0.747932
4327,993_man,man,0.713669,41,0.018881,0.719448,0.707889
4328,993_woman,woman,0.580068,41,0.026691,0.588238,0.571898
4329,998_man,man,0.518729,41,0.026461,0.526829,0.510630


## Re-Join Stats with Metadata and Visualize

In [7]:
%%time

stats_df['meta_mjp_id'] = stats_df['vector'].str.replace('_\w*', '').astype(int)

stats_df = df[['meta_mjp_id', 'meta_magazine', 'meta_type', 'meta_date', 'meta_year']] \
    .merge(stats_df, on = 'meta_mjp_id', how = 'inner')

stats_df.head(4)

CPU times: user 34.1 ms, sys: 7.43 ms, total: 41.6 ms
Wall time: 50.1 ms


Unnamed: 0,meta_mjp_id,meta_magazine,meta_type,meta_date,meta_year,vector,keyword,mean,count,std,ci_hi,ci_lo
0,4,The Little Review,articles,1914-12-01,1914,4_man,man,0.592083,41,0.023471,0.599268,0.584899
1,4,The Little Review,articles,1914-12-01,1914,4_woman,woman,0.724316,41,0.019561,0.730303,0.718328
2,4,The Little Review,articles,1914-12-01,1914,4_man,man,0.592083,41,0.023471,0.599268,0.584899
3,4,The Little Review,articles,1914-12-01,1914,4_woman,woman,0.724316,41,0.019561,0.730303,0.718328


In [8]:
%%time

stats_df[['mean', 'std', 'ci_hi', 'ci_lo']].describe()

CPU times: user 36.2 ms, sys: 5.95 ms, total: 42.2 ms
Wall time: 52.1 ms


Unnamed: 0,mean,std,ci_hi,ci_lo
count,6936.0,6936.0,6936.0,6936.0
mean,0.694365,0.020555,0.70066,0.688071
std,0.078929,0.0084,0.077094,0.080806
min,0.344301,0.007796,0.360745,0.327857
25%,0.648245,0.015063,0.6558,0.641193
50%,0.708307,0.018233,0.714791,0.70213
75%,0.752479,0.023401,0.757069,0.747912
max,0.861164,0.073632,0.863595,0.858734


## Visualize Trend in Keyword Similarity

In [None]:
%%time

data['date_ordinal'] = pd.to_datetime(data['meta_date']).apply(lambda date: date.toordinal())

fig, ax = plt.subplots()
sns.set(rc={'figure.figsize':(15, 9)})

sns.regplot(data = data.query('(meta_magazine == "Marsden Magazines") & \
                             (keyword == "woman")'), # (meta_type == "articles")
            x = 'date_ordinal', 
            y = 'similarity', 
            logx = True, x_jitter = 0.4, ax = ax, 
            label = "Marsden", marker = 'o')

sns.regplot(data = data.query('(meta_magazine != "Marsden Magazines") & \
                            (keyword == "woman")'),
            x = 'date_ordinal', 
            y = 'similarity', 
            logx = True, x_jitter = 0.4, ax = ax, 
            label = "MJP Rest", marker = 'x')

# Reset x-axis labels.
ax.set_xlabel('date')

new_labels = [date.fromordinal(int(item)) for item in ax.get_xticks()]

ax.set_xticklabels(new_labels)
ax.legend()

## Save Similarities to Keywords & Confidence Intervals

In [10]:
%%time

stats_df.to_csv(abs_dir + 'Marsden_Magazines/DH2022_LanguageEvolution/Data_Outputs/new-vectors_CI.csv',
                sep = '\t', index = False)

CPU times: user 86.5 ms, sys: 12.6 ms, total: 99.1 ms
Wall time: 117 ms
