In [1]:
# The results are converted to DataFrames because that was the easiest way I could find to pretty-print them
from pathlib import Path
from pickle import load

import numpy as np
import pandas as pd
from nltk import download
from nltk.corpus import stopwords

In [2]:
# Load the pickled per-token table and turn it into a DataFrame.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open("tokens-exploratory/exploratory_table.pickle", "rb") as table_file:
    token_table = load(table_file)

# orient="index": the dict keys become the row labels of the frame.
token_df = pd.DataFrame.from_dict(data=token_table, orient="index")

In [3]:
# Load the pickled confidence intervals.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open("tokens-exploratory/confidence_intervals.pickle", "rb") as ci_file:
    CI = load(ci_file)

In [4]:
# Store the confidence intervals as a new column of token_df; assigning a
# Series aligns its entries with token_df's index.
CI = pd.Series(CI)
token_df["PPMI CI (90%)"] = CI

# Spread each stored interval over two columns ("min", "max"), one row per token.
split_CI = pd.DataFrame(
    data=token_df["PPMI CI (90%)"].to_list(),
    index=token_df.index,
    columns=["min", "max"],
)

# A token is flagged as significant when the lower bound is strictly above zero.
token_df["PPMI Significant"] = (split_CI["min"] > 0).to_numpy()

In [5]:
# Load the pickled embedding results.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open("embeddings-exploratory/alc_steptwo.pickle", "rb") as embedding_file:
    embedding_table = load(embedding_file)

# Build the frame, index it by token, and give the result columns the names
# used in the combined table.
embedding_df = (
    pd.DataFrame.from_dict(embedding_table)
    .set_index("token")
    .rename(
        columns={
            "ground truth": "beta",
            "CI": "beta CI (90%)",
            "p-value": "beta p-value",
        }
    )
)

In [6]:
# Combine the token statistics and the embedding results side by side,
# matching rows on the token index.
df = pd.concat([token_df, embedding_df], axis=1)

# Minimum number of occurrences required from both male and female speakers.
MIN_COUNT = 10

# Keep only tokens that
#   * have a PPMI value (`!= None` compares element-wise and is True for
#     every row, including NaN, so it never filtered anything; use notna()),
#   * are not English stopwords, and
#   * occur more than MIN_COUNT times for both M and F speakers.
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be installed
# locally (nltk.download("stopwords") fetches it).
stop = set(stopwords.words("english"))
keep = (
    df["PPMI"].notna()
    & ~df.index.isin(stop)
    & (df["M count"] > MIN_COUNT)
    & (df["F count"] > MIN_COUNT)
)

# Build the mask once from df and apply it once, then order by total count.
filtered_df = df.loc[keep].sort_values(by=["total"])

In [7]:
# Significance threshold applied to the beta p-values.
# NOTE(review): the PPMI interval column is labelled 90% while ALPHA is 0.05;
# confirm that the two different levels are intended.
ALPHA = 0.05

# "Dependent": the PPMI interval excludes zero AND the PPMI itself is positive.
# The comparison must be parenthesised: `&` binds tighter than `>`, so the
# unparenthesised form evaluated `(significant & PPMI) > 0` instead.
is_dependent = df["PPMI Significant"] & (df["PPMI"] > 0)

# "Correlated": the beta p-value is below ALPHA.
# NOTE(review): the rows displayed in this notebook show p-values wrapped in
# one-element sequences (e.g. [0.155]); confirm this comparison treats them
# as intended.
is_correlated = df["beta p-value"] < ALPHA

# Split the filtered tokens into the four dependent/correlated quadrants.
# The masks are indexed like df; .loc aligns them to filtered_df's rows.
dependent_correlated = filtered_df.loc[is_dependent & is_correlated]
dependent_uncorrelated = filtered_df.loc[is_dependent & ~is_correlated]
independent_correlated = filtered_df.loc[~is_dependent & is_correlated]
independent_uncorrelated = filtered_df.loc[~is_dependent & ~is_correlated]

In [8]:
# Spot-check: display the combined (unfiltered) row for the token "woman".
df.loc["woman"]

M count                                                        113
F count                                                         67
total                                                          180
M ratio                                                   0.627778
F ratio                                                   0.372222
F - M                                                    -0.255556
PPMI                                                      1.337579
PPMI CI (90%)             [0.9207787708367393, 1.6723492227837264]
PPMI Significant                                              True
beta                                         [0.20967594220578925]
beta CI (90%)       [[0.18494049368511248], [0.21590977369175252]]
beta p-value                                               [0.155]
Name: woman, dtype: object

In [9]:
# Spot-check: display the combined (unfiltered) row for the token "man".
df.loc["man"]

M count                                                         56
F count                                                         37
total                                                           93
M ratio                                                   0.602151
F ratio                                                   0.397849
F - M                                                    -0.204301
PPMI                                                       1.43364
PPMI CI (90%)               [0.88135432254151, 1.8201207933988939]
PPMI Significant                                              True
beta                                          [0.2845088396204297]
beta CI (90%)       [[0.24424212982552576], [0.29478071903956393]]
beta p-value                                               [0.189]
Name: man, dtype: object

In [10]:
# Write the full filtered table to the working directory.
filtered_df.to_csv("concatenated_results_table.csv")

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing the four quadrant tables.
Path("quadrants").mkdir(parents=True, exist_ok=True)

independent_correlated.to_csv("quadrants/independent_correlated.csv")
independent_uncorrelated.to_csv("quadrants/independent_uncorrelated.csv")
dependent_correlated.to_csv("quadrants/dependent_correlated.csv")
dependent_uncorrelated.to_csv("quadrants/dependent_uncorrelated.csv")