In [None]:
import os
import json
import re
import string
import glob
import math
import warnings
from collections import defaultdict, deque
from functools import total_ordering
from itertools import chain, islice
from operator import itemgetter

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.corpus.reader import CorpusReader
from nltk.internals import deprecated
from nltk.probability import FreqDist
from nltk.util import binary_search_file as _binary_search_file
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob

import sklearn as skl
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True)

In [None]:
def get_adjectives(word_list):
    """Expand a list of seed words with related adjectives/adverbs from WordNet.

    For every seed word, each synset returned by ``wordnet.synsets`` is
    inspected and the head word of its "also see", "similar to" and
    "attribute" neighbours is collected, provided the neighbour's part of
    speech is ``"a"``, ``"s"`` or ``"r"``. Afterwards the lemma of every
    collected word (as returned by the module-level ``lemmatizer``) is added
    too. No other WordNet relations are followed.

    Parameters
    ----------
    word_list : iterable of str
        Seed words to expand.

    Returns
    -------
    list of str
        The seed words, the related words and their lemmas, without
        duplicates, in first-seen order.
    """
    wanted_pos = ("a", "s", "r")

    expanded = []
    for text in word_list:
        expanded.append(text)
        for syn in wordnet.synsets(text):
            # Same order as before: see-also, then similar-to, then attributes.
            # The attribute words used to be computed but never stored.
            for related in (syn.also_sees(), syn.similar_tos(), syn.attributes()):
                for neighbour in related:
                    if neighbour.pos() in wanted_pos:
                        # Keep only the first dot-separated part of the synset name.
                        expanded.append(neighbour.name().split(".")[0])

    # Drop duplicates while keeping first-seen order.
    final = list(dict.fromkeys(expanded))
    seen = set(final)

    # `final` grows while it is iterated on purpose: a freshly added lemma is
    # itself lemmatized, exactly as the original append-while-looping code did.
    for word in final:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in seen:
            seen.add(lemma)
            final.append(lemma)
    return final


def get_positive(word_list):
    """Return the entries of ``word_list`` that TextBlob scores as positive.

    An entry is kept when ``TextBlob(entry).sentiment.polarity`` is strictly
    greater than zero. Order and duplicates of the input are preserved.
    """
    return [text for text in word_list if TextBlob(text).sentiment.polarity > 0]

def get_negative(word_list):
    """Return the entries of ``word_list`` that TextBlob scores as negative.

    An entry is kept when ``TextBlob(entry).sentiment.polarity`` is strictly
    less than zero. Order and duplicates of the input are preserved.
    """
    return [text for text in word_list if TextBlob(text).sentiment.polarity < 0]
    

In [None]:
# Load the three inputs used by the rest of the notebook.
# `with` closes each file even if json.load raises; the encoding is pinned so
# the result does not depend on the platform's default text encoding.
with open("corpus_full.json", encoding="utf-8") as f:
    corpora = json.load(f)  # iterated below as {day_key: {"corpus": [...], ...}}

with open("stereotypes.json", encoding="utf-8") as f:
    stereotypes = json.load(f)  # indexed below by category name, e.g. "Warm"

df_nyt = pd.read_csv("df_nyt.csv")  # merged below on its "date" column


In [None]:
# term_frequency = {}

# for day, corp in corpora.items():
#     date = day[9:]
#     term_frequency[date] = {}
#     for word in corp['corpus']:
#         if word not in term_frequency[date].keys():
#             term_frequency[date][word] = 1
#         else:
#             term_frequency[date][word] += 1
    

In [None]:
# stereotypical_tf = {}

# for day, freq in term_frequency.items():
#     stereotypical_tf[day] = {}
#     stereotypical_tf[day]["Warm"] = {}
#     stereotypical_tf[day]["Cold"] = {}
#     stereotypical_tf[day]["Competent"] = {}
#     stereotypical_tf[day]["Incompetent"] = {}
#     stereotypical_tf[day]["Foreign"] = {}
#     stereotypical_tf[day]["Diseased"] = {}
#     for word, prop in freq.items():
#         if word in stereotypes['Warm']:
#             stereotypical_tf[day]["Warm"][word] = prop
#         if word in stereotypes['Cold']:
#             stereotypical_tf[day]["Cold"][word] = prop
#         if word in stereotypes['Competent']:
#             stereotypical_tf[day]["Competent"][word] = prop
#         if word in stereotypes['Incompetent']:
#             stereotypical_tf[day]["Incompetent"][word] = prop
#         if word in stereotypes['Foreign']:
#             stereotypical_tf[day]["Foreign"][word] = prop
#         if word in stereotypes['Diseased']:
#             stereotypical_tf[day]["Diseased"][word] = prop

# stereotype_tf = pd.DataFrame.from_dict({(i,j,k): stereotypical_tf[i][j][k]
#                                         for i in stereotypical_tf.keys()
#                                         for j in stereotypical_tf[i].keys()
#                                         for k in stereotypical_tf[i][j].keys()},
#                                        orient='index')
# stereotype_tf = stereotype_tf.reset_index()
# split_df = pd.DataFrame(stereotype_tf['index'].tolist(), columns=['date', 'category', 'word'])
# stereotype_tf = pd.concat([stereotype_tf, split_df], axis=1)

# stereotype_tf.columns = ["index", "frequency", "date", "category", "word"]
# stereotype_tf = stereotype_tf[["date", "category", "word", "frequency"]]
# stereotype_tf.head()

In [None]:
# full_df = pd.merge(stereotype_tf, df_nyt, on="date")
# full_df.head()

In [None]:
# full_df.to_csv("df_tf.csv", index=False)

In [None]:
# Relative frequency of every token, per day:
#   term_proportion[date][token] = occurrences of token / total tokens that day
term_proportion = {}

for day, corp in corpora.items():
    # The date is whatever follows the first nine characters of the key.
    date = day[9:]

    counts = {}
    n_tokens = 0
    for token in corp['corpus']:
        n_tokens += 1
        counts[token] = counts.get(token, 0) + 1

    # An empty corpus yields an empty mapping, so n_tokens == 0 never divides.
    term_proportion[date] = {
        token: count / n_tokens for token, count in counts.items()
    }

"""
Before corpus of each person, see how many words fit in the dictionary (stereotypes.json) - or related words to it.
Stereotypes dictionary generation: get all the words listed under a category (e.g., warm) -> get their antonyms under the opposite category (e.g., cold) -> get synonyms of the antonyms (related words) -> select all unique items (there will be overlap) - maximum diligence

"""


In [None]:
# For every day, keep only the tokens that appear in one of the stereotype
# word lists, grouped by category:
#   stereotypical_tp[day][category][word] = proportion of `word` on `day`
stereotype_categories = ["Warm", "Cold", "Competent", "Incompetent", "Foreign", "Diseased"]

# Sets give O(1) membership tests instead of scanning each list per token.
# NOTE(review): presumably every stereotypes[category] is a list of word
# strings - confirm against stereotypes.json.
category_words = {
    category: set(stereotypes[category]) for category in stereotype_categories
}

stereotypical_tp = {}

for day, freq in term_proportion.items():
    stereotypical_tp[day] = {category: {} for category in stereotype_categories}
    for word, prop in freq.items():
        # A word may belong to several categories; record it under each one.
        for category in stereotype_categories:
            if word in category_words[category]:
                stereotypical_tp[day][category][word] = prop

# Flatten the nested dict into one row per (date, category, word). Building
# the frame from records also works when nothing matched; the previous
# tuple-index round trip raised a column-length ValueError in that case.
stereotype_tp = pd.DataFrame(
    [
        (day, category, word, prop)
        for day, by_category in stereotypical_tp.items()
        for category, matches in by_category.items()
        for word, prop in matches.items()
    ],
    columns=["date", "category", "word", "proportion"],
)
stereotype_tp.head()

In [None]:
# Attach the df_nyt columns to every stereotype row with the same "date".
# This is an inner join (the merge default): rows whose date has no
# counterpart in the other frame are dropped.
full_df = stereotype_tp.merge(df_nyt, on="date")
full_df.head()

In [None]:
# Persist the merged table; index=False leaves the row index out of the file.
full_df.to_csv(path_or_buf="df_tp.csv", index=False)