In [96]:
import sys, time, argparse, re, gensim, math
import pandas as pd
from tqdm import tqdm
import pickle as pkl
import numpy as np
import faulthandler
import logging
from os.path import exists
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px
from sklearn.metrics.pairwise import euclidean_distances
import plotly.graph_objects as go
from  sklearn.preprocessing import normalize
from scipy.spatial import distance
from itertools import combinations
from plotly.subplots import make_subplots
import statistics

In [97]:
def read_file(input_path):
    """Load and return the object pickled at ``input_path``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files written by this project (see ``save_file``).
    """
    with open(input_path, "rb") as handle:
        return pkl.load(handle)
    
def save_file(output_obj, output_path):
    """Pickle ``output_obj`` to ``output_path``, replacing any existing file."""
    with open(output_path, 'wb') as handle:
        pkl.dump(output_obj, handle)

In [98]:
# Period keys; they are used verbatim to build the data/cache paths in later cells.
PERIODS = ['1700', '1800', '1900', 'full_corpus']
# Sensory modalities analysed throughout the notebook.
SENSES = ['sight', 'hear', 'touch', 'taste', 'smell']

# One fixed colour per modality so every figure shares the same colours.
# Note that the palette order (sight, hear, taste, smell, touch) differs from
# the order of SENSES; colours are looked up by name, never by position.
color_map = {sense: px.colors.qualitative.Plotly[palette_index]
             for palette_index, sense in enumerate(['sight', 'hear', 'taste', 'smell', 'touch'])}

## Sensory Imbalance

In [99]:
# Build (or load from cache) one row per (period, modality) holding the number
# of context windows, then plot the counts as grouped bars on a log axis.
if not exists('../git_data/sensory_imbalance_df.pickle'):

    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    size_records = []
    for period in PERIODS:
        context_windows = read_file('../data/' + period + '/context_windows.pickle')
        # Display label only; the loop variable keeps the raw period key.
        period_label = "Full \n Corpus" if period == "full_corpus" else period
        for sense in SENSES:
            size_records.append({'Period, Sense': str(period_label) + ", \n " + sense,
                                 'period': period_label,
                                 'modality': sense,
                                 'size': len(context_windows[sense])})

    context_window_distribution = pd.DataFrame(
        size_records, columns=['Period, Sense', 'period', 'modality', 'size'])

    # Group on all label columns so that only the numeric 'size' is summed;
    # each 'Period, Sense' label is unique, so the row order is unchanged.
    grouped_size = (context_window_distribution
                    .groupby(['Period, Sense', 'period', 'modality'], as_index=False)['size']
                    .sum())
    # L2-normalise the counts across all (period, modality) rows.
    grouped_size['normalized_size'] = normalize([list(grouped_size['size'])])[0]

    save_file(grouped_size, '../git_data/sensory_imbalance_df.pickle')

else:
    grouped_size = read_file('../git_data/sensory_imbalance_df.pickle')

# One bar trace per modality; the list order fixes the trace/legend order.
fig = go.Figure(data=[
    go.Bar(marker_color=color_map[sense],
           name=sense,
           x=grouped_size.loc[grouped_size["modality"] == sense, "period"],
           y=grouped_size.loc[grouped_size["modality"] == sense, "size"])
    for sense in ["sight", "hear", "smell", "touch", "taste"]
])

fig.update_yaxes(type="log",)
fig.update_layout(barmode='group',
                  template="plotly_white",
                  title_text='Sensory Imbalance In Context Windows',
                  xaxis_title="Literary Periods",
                  yaxis_title="Log-Scaled Context Window Set Size",)

# fig.write_image("visuals/sensory_imbalance_bar.pdf")
fig.show()

## Sense Pairs 

In [100]:
# Parameters for the sense-pair analysis in this cell.
SENSES = [
    'sight',
    'hear',
    'touch',
    'taste',
    'smell',
]
# Distance threshold: descriptor pairs whose Euclidean distance in the
# two principal components exceeds this are not counted as "close" pairs.
radius = 30
literary_periods = [
    '1700',
    '1800',
    '1900',
    'full_corpus',
]

def format_key(pair):
    """Return an order-independent label for a group of sense names.

    The names are sorted and joined with ", ", so ("touch", "hear") and
    ("hear", "touch") both map to "hear, touch".
    """
    ordered_names = list(pair)
    ordered_names.sort()
    return ", ".join(ordered_names)


def avg(value):
    """Return ``value[0] / value[1]`` (a running total over its count), or 0 when the count is 0."""
    total, count = value[0], value[1]
    if count == 0:
        return 0
    return total / count

def _make_sense_pair_grid(y_title):
    """Return an empty 2x2 subplot grid with one panel per literary period.

    Both sense-pair figures share the same layout and x-axis title; only the
    y-axis title differs, so it is the single parameter.
    """
    grid = make_subplots(rows=2,
                         cols=2,
                         specs=[[{}, {}], [{}, {}]],
                         vertical_spacing=0.4,
                         x_title='Sense Pairs',
                         y_title=y_title,
                         subplot_titles=("1700 Literary Period",
                                         "1800 Literary Period",
                                         "1900 Literary Period",
                                         "Full Corpus Literary Period"))
    # NOTE(review): annotation 4 is presumably the shared x-axis title (it
    # follows the four subplot titles); it is pushed down so it clears the
    # tick labels. Confirm the annotation order if the grid layout changes.
    grid.layout.annotations[4]["yshift"] = -90
    return grid


# Grid for the normalised average distance of every sense pair.
fig = _make_sense_pair_grid('Normalized Average Distance')
# (row, col) of the panel used for each entry of literary_periods, in order.
positions = [(1,1), (1,2), (2,1), (2,2)]

# Grid for the number of pairs closer than `radius`; the title follows the
# configured radius instead of repeating the number by hand.
fig2 = _make_sense_pair_grid('Frequency of Pairs within Radius of ' + str(radius))

def _sense_pair_bar(values_by_pair, title, yaxis_title, same_sense_color):
    """Return a bar figure of L2-normalised values, one bar per sense pair.

    Bars whose key names the same sense twice (e.g. "hear, hear") are drawn
    in ``same_sense_color``; all other bars are light slate gray.
    """
    bar_colors = [same_sense_color if pair.split(", ")[0] == pair.split(", ")[1] else 'lightslategray'
                  for pair in values_by_pair]
    layout = go.Layout(title=title,
                       xaxis=dict(title="Sense Pair"),
                       yaxis=dict(title=yaxis_title))
    figure = go.Figure(layout=layout,
                       data=[go.Bar(x=list(values_by_pair.keys()),
                                    y=normalize([list(values_by_pair.values())])[0],
                                    marker_color=bar_colors)])
    figure.update_yaxes(range=[0, 1])
    return figure


# Fill both 2x2 grids, one panel per period: average pair distance goes to
# `fig`, the count of pairs within `radius` goes to `fig2`.
for position_num, period in enumerate(literary_periods):
    grid_row, grid_col = positions[position_num]
    period_prefix = '../git_data/' + period + '/' + period
    avg_fig_path = period_prefix + '_average_distance_between_sense_pairs_500_fig.pickle'
    freq_fig_path = period_prefix + '_total_sense_pairs_500_fig.pickle'

    # Both cached figures are read below, so both must exist to skip the
    # recomputation (previously only the first one was checked).
    if exists(avg_fig_path) and exists(freq_fig_path):
        avg_plot = read_file(avg_fig_path)
        freq_plot = read_file(freq_fig_path)

    else:
        # Every unordered pair of senses, including a sense paired with itself.
        senses_and_freqs = {format_key((SENSES[i],SENSES[j])):0 for i in range(len(SENSES)) for j in range(i, len(SENSES))}
        # Per pair: [sum of distances, number of point pairs].
        senses_and_avgs = {format_key((SENSES[i],SENSES[j])):[0,0] for i in range(len(SENSES)) for j in range(i, len(SENSES))}

        all_top_descriptors = read_file(period_prefix + '_all_top_descriptors_500_PAI_None.pickle')

        points = all_top_descriptors[['Principal Component 1', 'Principal Component 2']].values.tolist()
        modalities = all_top_descriptors['modality'].tolist()

        euclidean_dists = euclidean_distances(points, points)

        # Walk the upper triangle (i < j) so each unordered pair of points is
        # seen exactly once, in the same order as before, without keeping a
        # list of visited pairs (the list lookup was quadratic per pair).
        n_points = len(points)
        for i in range(n_points):
            for j in range(i + 1, n_points):
                pair_distance = euclidean_dists[i][j]
                pair = format_key([modalities[i], modalities[j]])
                senses_and_avgs[pair][0] += pair_distance
                senses_and_avgs[pair][1] += 1
                if pair_distance > radius: continue
                senses_and_freqs[pair] += 1
        senses_and_avgs = {pair:avg(value) for pair, value in senses_and_avgs.items()}

        save_file(senses_and_freqs, period_prefix + '_senses_and_freqs_pairs.pickle')
        save_file(senses_and_avgs, period_prefix + '_senses_and_avgs_pairs.pickle')

        freq_plot = _sense_pair_bar(senses_and_freqs,
                                    period + " - Total Number of Sense Pairs",
                                    "Pairs within radius of " + str(radius),
                                    "crimson")
        freq_plot.show()
        freq_plot.write_html('../visuals/' + period + '/' + period + '_total_sense_pairs_500.html')
        save_file(freq_plot, freq_fig_path)

        avg_plot = _sense_pair_bar(senses_and_avgs,
                                   period + " - Average Distance Between Sense Pairs",
                                   "Average Distance",
                                   "mediumvioletred")
        avg_plot.write_html('../visuals/' + period + '/' + period + '_average_distance_between_sense_pairs_500.html')
        save_file(avg_plot, avg_fig_path)
        avg_plot.show()

    # Freshly computed and cached figures are treated alike: their single bar
    # trace goes into this period's panel. The per-period figures use their
    # own names so the grids `fig` / `fig2` are never overwritten.
    fig.add_trace(avg_plot.data[0], row = grid_row, col = grid_col)
    fig2.add_trace(freq_plot.data[0], row = grid_row, col = grid_col)

fig.update_layout(showlegend=False)
fig2.update_layout(showlegend=False)

In [101]:
# Apply the same styling to both sense-pair grids, then display each one and
# export it as HTML. `fig` is handled completely before `fig2`.
for grid, grid_title, html_path in (
        (fig, 'Average Distance Between Sense Pairs', '../visuals/average_distance_between_sense_pairs_500.html'),
        (fig2, 'Total Sense Pairs', '../visuals/total_sense_pairs_500.html')):
    grid.update_layout(template="plotly_white",
                       title_text=grid_title,
                       autosize=False,
                       width=1000,
                       height=500,
                       margin=dict(l=115, r=50, b=115, t=90, pad=2))

    # Fixed y-range so the four panels are directly comparable.
    grid.update_yaxes(range=[0, 0.5])
    # Vertical tick labels: the sense-pair names are long.
    grid.update_xaxes(tickangle=270)
    grid.show()
    grid.write_html(html_path)

# Static PDF export is currently disabled:
# fig.write_image("visuals/average_distance_between_sense_pairs_500_bar.pdf")
# fig2.write_image("visuals/total_sense_pairs_500_bar.pdf")

## Sense Triplets

In [102]:
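# NOTE: this whole cell is disabled (every line is commented out).
# Before re-enabling it, add `points_with_sense = []` next to `points = []`
# inside the period loop: the code below appends to `points_with_sense`
# without ever initialising it in this cell.
# SENSES = ['sight', 'hear', 'touch', 'taste', 'smell']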
# radius = 30
# literary_periods = ['1700', '1800', '1900', 'full_corpus']


# def format_key(triplet):
#     return ", ".join(sorted(triplet))

# def avg(value):
#     ret = 0 if value[1] == 0 else value[0]/value[1]
#     return ret
    
# for period in literary_periods:
#     all_top_descriptors = read_file('../git_data/' + period + '/' + period + '_all_top_descriptors_500_PAI_None.pickle')
    
#     senses_and_freqs = {format_key((SENSES[i],SENSES[j],SENSES[k])):0 for i in range(len(SENSES)) for j in range(i, len(SENSES)) for k in range(j, len(SENSES))}
#     senses_and_avgs = {format_key((SENSES[i],SENSES[j],SENSES[k])):[0,0] for i in range(len(SENSES)) for j in range(i, len(SENSES)) for k in range(j, len(SENSES))}
    
#     points = []
#     for index, row in all_top_descriptors.iterrows():
#         points.append([row['Principal Component 1'], row['Principal Component 2']])
#         points_with_sense.append([[row['Principal Component 1'], row['Principal Component 2']], row['modality']])
        
#     euclidean_dists = euclidean_distances(points, points)
    
#     two_points = []
#     visited_points = []
#     for i, row in enumerate(euclidean_dists):
#         for j, distance in enumerate(row):
#             if i == j or sorted((i,j)) in visited_points: continue
#             visited_points.append(sorted((i,j)))
#             sense_i = points_with_sense[i][1]
#             sense_j = points_with_sense[j][1]
#             two_points.append([(i,j), distance, [sense_i, sense_j]])
                
#     visited_points = []
#     for value in two_points:
#         pair, distance, modalities = value
#         i = pair[0]
#         j = pair[1]
#         sense_i = modalities[0]
#         sense_j = modalities[1]
#         row = euclidean_dists[i]
#         for k, distance2 in enumerate(row):
#             if i == k or j == k or sorted((i,j, k)) in visited_points: continue
#             visited_points.append(sorted((i,j,k)))
#             sense_k = points_with_sense[k][1]
#             distance3 = euclidean_dists[j][k]
#             triplet = format_key([sense_i, sense_j, sense_k])
#             senses_and_avgs[triplet][0] += sum([distance, distance2, distance3])
#             senses_and_avgs[triplet][1] += 1
#             if distance > radius or distance2 > radius: continue
#             senses_and_freqs[triplet] += 1
                
#     senses_and_avgs = {pair:avg(value) for pair, value in senses_and_avgs.items()}
    
#     save_file(senses_and_freqs, '../git_data/' + period + '/' + period + '_senses_and_freqs_triplets.pickle')
#     save_file(senses_and_avgs, '../git_data/' + period + '/' + period + '_senses_and_avgs_triplets.pickle')
    
#     colors1 = ['lightslategray',] * len(senses_and_freqs)
#     same_indices = [i for i, pair in enumerate(senses_and_freqs) if pair.split(", ")[0] == pair.split(", ")[1] and pair.split(", ")[1] == pair.split(", ")[2]]
#     for i in same_indices:
#         colors1[i] = "crimson"
#     layout = go.Layout(
#         title = period + " - Total Number of Sense Triplets",
#         xaxis = dict(title="Sense Triplets"),
#         yaxis = dict(title="Triplets within radius of " + str(radius))
#     )

#     fig = go.Figure(layout=layout,
#                     data=[go.Bar(x = list(senses_and_freqs.keys()),
#                             y = normalize([list(senses_and_freqs.values())])[0],
#                     marker_color=colors1)])

#     fig.update_yaxes(range=[0, 1])
#     fig.show()
#     fig.write_html('../visuals/' + period + '/' + period + '_total_sense_triplets_500.html')
#     save_file(fig, '../git_data/' + period + '/' + period + '_total_sense_triplets_500.pickle')

#     layout = go.Layout(
#             title = period + " - Average Distance Between Sense Triplets",
#             xaxis = dict(title="Sense Triplets"),
#             yaxis = dict(title="Average Distance")
#         )
#     colors2 = ['lightslategray',] * len(senses_and_avgs)
#     same_indices = [i for i, pair in enumerate(senses_and_avgs) if pair.split(", ")[0] == pair.split(", ")[1] and pair.split(", ")[1] == pair.split(", ")[2]]
#     for i in same_indices:
#         colors2[i] = "mediumvioletred"
#     fig = go.Figure(layout=layout,
#                     data=[go.Bar(x = list(senses_and_avgs.keys()),
#                             y = normalize([list(senses_and_avgs.values())])[0],
#                     marker_color=colors2)])
#     fig.update_yaxes(range=[0, 1])
#     fig.show()
#     fig.write_html('../visuals/' + period + '/' + period + '_average_distance_between_sense_triplets_500.html')
#     save_file(fig, '../git_data/' + period + '/' + period + '_average_distance_between_sense_triplets_500.pickle')
    

## Ranked Descriptors

In [103]:
literary_periods = ['1700', '1800', '1900', 'full_corpus']
fig = make_subplots(rows=2, 
                    cols=2, 
                    specs = [[{}, {}], [{}, {}]],
                    vertical_spacing = 0.4,
                    x_title='Top Descriptors',
                    y_title='Normalized Frequency',
                    subplot_titles=("1700 Literary Period",
                                    "1800 Literary Period",
                                    "1900 Literary Period",
                                    "Full Corpus Literary Period"))
# (row, col) of the panel used for each entry of literary_periods, in order.
positions = [(1,1), (1,2), (2,1), (2,2)]
# NOTE(review): annotation 4 is presumably the shared x-axis title (after the
# four subplot titles); it is shifted down to clear the rotated tick labels.
fig.layout.annotations[4]["yshift"] = -90

# Maximum number of overlapping descriptors shown in each panel.
max_descriptors = 20

for num, period in enumerate(literary_periods): 
    all_top_descriptors = read_file('../git_data/' + period + '/' + period + '_all_top_descriptors_500_PAI_None.pickle')
    filtered_descriptors = read_file('../data/' + period + '/filtered_descriptors.pickle')
    context_windows = read_file('../data/' + period + '/context_windows.pickle')

    all_top_descriptors = all_top_descriptors.reset_index(drop = True)
    # Element 0 of each 'word' entry is used as the descriptor, element 1 as its POS.
    all_top_descriptors['descriptor'] = all_top_descriptors['word'].str[0]
    all_top_descriptors['POS'] = all_top_descriptors['word'].str[1]
    # Modalities actually present in this period, in first-seen order.
    senses = list(all_top_descriptors["modality"].unique())

    # One row per distinct (descriptor, POS), one zero-initialised column per
    # sense. Built directly from the source columns rather than by assigning
    # into an empty frame.
    ranked = all_top_descriptors[['descriptor', 'POS']].drop_duplicates().reset_index(drop=True)
    for sense in senses:
        ranked[sense] = 0.0

    for index, row in all_top_descriptors.iterrows():
        # get row of matching descriptor
        ranked_row = ranked.loc[(ranked['descriptor'] == row['descriptor']) & (ranked['POS'] == row['POS'])]
        ranked_idx = ranked_row.index
        ranked.at[ranked_idx[0], row["modality"]] = filtered_descriptors[row['modality']][row['word']]

    # Calculate the total frequency for each descriptor
    ranked["total"] = ranked[senses].sum(axis=1)

    ranked = ranked.sort_values("total", ascending=False)

    # Remove all the descriptors that show no overlap of senses, i.e. those
    # with at most one non-zero sense column. Only the sense columns are
    # counted and the threshold follows the number of senses present, so the
    # filter stays correct when a period has fewer than five modalities.
    zero_sense_count = (ranked[senses] == 0).sum(axis=1)
    one_sense_only = ranked[zero_sense_count >= len(senses) - 1].index
    ranked = ranked.drop(one_sense_only)
    words = ranked["descriptor"].values
    n = min(max_descriptors, len(words))
    for label in SENSES:
        try:
            sense_values = ranked[label][:n].values
            window_count = len(context_windows[label])
        except KeyError:
            # This modality has no column / no context windows for this
            # period. Only the two lookups are guarded, so errors raised
            # while plotting are no longer silently skipped.
            continue
        fig.add_trace(go.Bar(marker_color=color_map[label],
                             name=label, x=words[:n],
                             y=sense_values/window_count,
                             # A single legend, taken from the full-corpus panel.
                             showlegend=(period == "full_corpus")),
                      row = positions[num][0],
                      col = positions[num][1])

fig.update_layout(template="plotly_white",
                  barmode='stack',
                  autosize=False,
                  width=1000,
                  height=500,
                  margin=dict(
                    l=115,
                    r=50,
                    b=115,
                    t=90,
                    pad=2),
                  title_text = "Top Overlapping Descriptors")
fig.update_xaxes(tickangle=270)
# Set the y-range before any export so the HTML, the pickle and the displayed
# figure are all identical (it used to be applied after write_html).
fig.update_yaxes(range=[0, 0.009])

# The combined figure is stored under the last period's directory. This keeps
# the existing output paths while no longer relying on the loop variable
# leaking out of the loop above.
output_period = literary_periods[-1]
fig.write_html('../visuals/' + output_period + '/' + output_period + '_top_descriptors_overlap_500.html')
save_file(fig, '../git_data/' + output_period + '/' + output_period + '_top_descriptors_overlap_500.pickle')
# fig.write_image("visuals/top_descriptors_overlap_500_bar.pdf")
fig.show()

## Top 500 Descriptors

In [104]:
# Ranking methods: file-name suffix of the cached descriptors and the
# matching human-readable name used in the figure title (same order).
file_names = ['PAI_None', 'TF-IDF_method_one', 'TF-IDF_method_two', 'TF-IDF_method_three', 'TF-IDF_method_four']
method_names = ['PAI', 'TF-IDF Method One', 'TF-IDF Method Two', 'TF-IDF Method Three', 'TF-IDF Method Four']
# (row, col) of the panel used for each entry of PERIODS, in order.
positions = [(1,1), (1,2), (2,1), (2,2)]

# One figure per ranking method, with one PCA scatter panel per period.
for file_name, method_name in zip(file_names, method_names):
    fig = make_subplots(rows=2,
                    cols=2,
                    x_title='Principal Component 1',
                    y_title='Principal Component 2',
                    subplot_titles=("1700 Literary Period", "1800 Literary Period", "1900 Literary Period", "Full Corpus Literary Period"))
    for num, period in enumerate(PERIODS): 
        all_top_descriptors = read_file('../git_data/' + period + '/' + period + '_all_top_descriptors_500_' + file_name + '.pickle')
        for sense in SENSES: 
            # Filter once per modality and reuse the rows for both axes.
            sense_rows = all_top_descriptors[all_top_descriptors["modality"] == sense]
            fig.add_trace(
                go.Scatter(x=sense_rows["Principal Component 1"],
                           y=sense_rows["Principal Component 2"],
                           mode = 'markers',
                           name = sense,
                           marker=dict(size=8,opacity=0.7),
                           # A single legend, taken from the full-corpus panel.
                           showlegend=(period == "full_corpus"),
                           marker_color=color_map[sense]),
                row=positions[num][0], 
                col=positions[num][1],
            )

    fig.update_layout(template="plotly_white",
                      title_text=method_name + ': Top 500 Descriptors 2-Component PCA')

    fig.write_html('../visuals/' + file_name + "_500_top_descriptors_scatter.html")
    # fig.write_image("visuals/" + file_name + "_500_top_descriptors_scatter.pdf")
    fig.show()

## Top Descriptors by Literary Period

In [105]:
# Panel layout: four TF-IDF rankings in a 2x2 block, PAI on a full-width third row.
positions = [(1,1), (1,2), (2,1), (2,2), (3,1)]
# File-name suffixes of the cached descriptors, in the same order as the panel titles.
file_names = ["500_TF-IDF_method_one", "500_TF-IDF_method_two", "500_TF-IDF_method_three", "500_TF-IDF_method_four","500_PAI_None"]
# Titles corrected from "TD-IDF" to "TF-IDF" to match the method/file names.
panel_titles = ("Ranking: TF-IDF Method 1",
                "Ranking: TF-IDF Method 2",
                "Ranking: TF-IDF Method 3",
                "Ranking: TF-IDF Method 4", "Ranking: PAI")

# One figure per period, with one PCA scatter panel per ranking method.
for period in PERIODS: 
    fig = make_subplots(rows=3,
                        cols=2,
                        specs = [[{}, {}], [{}, {}], [{"colspan": 2}, None]],
                        subplot_titles=panel_titles)

    for num, name in enumerate(file_names): 
        all_top_descriptors = read_file('../git_data/' + period + '/' + period + '_all_top_descriptors_' + name + '.pickle')
        for sense in SENSES: 
            # Filter once per modality and reuse the rows for both axes.
            sense_rows = all_top_descriptors[all_top_descriptors["modality"] == sense]
            fig.add_trace(
                go.Scatter(x=sense_rows["Principal Component 1"],
                           y=sense_rows["Principal Component 2"],
                           mode = 'markers',
                           name = sense,
                           marker=dict(size=8,opacity=0.7),
                           # Only one panel contributes legend entries, so each sense is listed once.
                           showlegend=(name == "500_TF-IDF_method_four"),
                           marker_color=color_map[sense]),
                row=positions[num][0], 
                col=positions[num][1],
            )
    correct_period = "Full Corpus" if period == "full_corpus" else period 

    fig.update_layout(template="plotly_white",
                      title_text=correct_period + ' Literary Period: 2-Component PCA Of Top Descriptors',
                      height=650,)
    fig.show()