In [1172]:
import os
import re
import string
import ast
from pathlib import Path
from glob import glob
import json
import pickle
import random
from tqdm import tqdm
import itertools
from collections import Counter
from dataclasses import dataclass, field
import contextlib
import csv

from collections import defaultdict, deque
from functools import total_ordering
from itertools import chain, islice
from operator import itemgetter

import pandas as pd
import numpy as np
import math

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.corpus.reader import CorpusReader
from nltk.internals import deprecated
from nltk.probability import FreqDist
from nltk.util import binary_search_file as _binary_search_file
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob

import sklearn as skl
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(use_idf=True)

import warnings
import pprint


In [1173]:
# nltk.download('wordnet')

# Custom functions

In [1174]:
def get_synonyms(word):
    new = []
    # Handle both string and list inputs
    if isinstance(word, str):
        words_to_check = [word]
    elif isinstance(word, list):
        words_to_check = word
    else:
        return []  # Return empty list if word is neither string nor list
    
    for text in words_to_check:
        new.append(text)
        for syn in wordnet.synsets(text):
            # Hypernyms
            hypernyms = syn.hypernyms()
            if len(hypernyms) > 0:
                for hypernym in hypernyms:
                    if hypernym.pos() in ["a", "s", "r"]:
                        word = hypernym.name().split(".")[0]
                        new.append(word)
            # Hyponyms
            hyponyms = syn.hyponyms()
            if len(hyponyms) > 0:
                for hyponym in hyponyms:
                    if hyponym.pos() in ["a", "s", "r"]:
                        word = hyponym.name().split(".")[0]
                        new.append(word)
            # Holonyms
            member_holonyms = syn.member_holonyms()
            if len(member_holonyms) > 0:
                for holonym in member_holonyms:
                    if holonym.pos() in ["a", "s", "r"]:
                        word = holonym.name().split(".")[0]
                        new.append(word)
            substance_holonyms = syn.substance_holonyms()
            if len(substance_holonyms) > 0:
                for holonym in substance_holonyms:
                    if holonym.pos() in ["a", "s", "r"]:
                        word = holonym.name().split(".")[0]
                        new.append(word)
            part_holonyms = syn.part_holonyms()
            if len(part_holonyms) > 0:
                for holonym in part_holonyms:
                    if holonym.pos() in ["a", "s", "r"]:
                        word = holonym.name().split(".")[0]
                        new.append(word)
            # Meronyms
            member_meronyms = syn.member_meronyms()
            if len(member_meronyms) > 0:
                for meronym in member_meronyms:
                    if meronym.pos() in ["a", "s", "r"]:
                        word = meronym.name().split(".")[0]
                        new.append(word)
            substance_meronyms = syn.substance_meronyms()
            if len(substance_meronyms) > 0:
                for meronym in substance_meronyms:
                    if meronym.pos() in ["a", "s", "r"]:
                        word = meronym.name().split(".")[0]
                        new.append(word)
            part_meronyms = syn.part_meronyms()
            if len(part_meronyms) > 0:
                for meronym in part_meronyms:
                    if meronym.pos() in ["a", "s", "r"]:
                        word = meronym.name().split(".")[0]
                        new.append(word)
            # Also see
            also_sees = syn.also_sees()
            if len(also_sees) > 0:
                for seealso in also_sees:
                    if seealso.pos() in ["a", "s", "r"]:
                        word = seealso.name().split(".")[0]
                        new.append(word)
            # Similar to
            similar_tos = syn.similar_tos()
            if len(similar_tos) > 0:
                for similar in similar_tos:
                    if similar.pos() in ["a", "s", "r"]:
                        word = similar.name().split(".")[0]
                        new.append(word)
            # Attributes
            attributes = syn.attributes()
            if len(attributes) > 0:
                for attribute in attributes:
                    if attribute.pos() in ["a", "s", "r"]:
                        word = attribute.name().split(".")[0]
                        new.append(word)
            # Synonyms
            if syn.pos() in ["a", "s", "r"]:
                word = syn.name().split(".")[0]
                if word not in new:
                    new.append(word)
                # Derivatives
                lemmas = wordnet.lemmas(syn.name().split(".")[0], syn.name().split(".")[1])
                if len(lemmas) > 0:
                    for lemma in lemmas:
                        if lemma.syntactic_marker():
                            new.append(lemma.name())
                        else:
                            pass
    for word in new:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in new:
            new.append(lemma)
    final = []
    for word in new:
        if word not in final:
            final.append(word)
    return list(set(final)) #Get only unique values

In [1175]:
def get_antonyms(word):
    new = []
    # Handle both string and list inputs
    if isinstance(word, pd.Series):
        # Flatten the Series of lists into a single list
        words_to_check = [item for sublist in word.tolist() for item in sublist]
    if isinstance(word, str):
        words_to_check = [word]
    elif isinstance(word, list):
        words_to_check = word
    
    for text in words_to_check:
        # Get all synsets for the word
        for syn in wordnet.synsets(text):
            # Get all lemmas for the synset
            for lemma in syn.lemmas():
                # Get antonyms for each lemma
                antonyms = lemma.antonyms()
                if antonyms:  # Check if there are any antonyms
                    for antonym in antonyms:
                        word = antonym.name()
                        new.append(word)
    
    # Lemmatize and deduplicate
    final = []
    for word in new:
        lemma = lemmatizer.lemmatize(word)
        if lemma not in final:
            final.append(lemma)
    
    return list(set(final))

In [1176]:
df = pd.read_csv("Raw_vocab_dictionary.csv")

# Check categories
print(df['Category'].unique())

#Synonym and Antonyms of each word
df["Synonyms"] = df["Word"].apply(get_synonyms)
df["Antonyms"] = df["Word"].apply(get_antonyms)

word_dict = defaultdict(list)

for idx, row in df.iterrows():
    comp_antonyms = []
    if row["Category"] == "Cold":
        cold_antonyms = get_antonyms(row["Synonyms"])
        word_dict["Cold"].extend(row["Synonyms"])
    if row["Category"] == "Warm":
        warm_antonyms = get_antonyms(row["Synonyms"])
        word_dict["Warm"].extend(row["Synonyms"])
    if row["Category"] == "Competence":
        comp_antonyms = get_antonyms(row["Synonyms"])
        word_dict["Competence"].extend(row["Synonyms"])
    if row["Category"] == "Incompetence":
        incomp_antonyms = get_antonyms(row["Synonyms"])
        word_dict["Incompetence"].extend(row["Synonyms"])
    if row["Category"] == "Jews":
        word_dict["Jews"].extend(row["Synonyms"])
    if row["Category"] == "Christians":
        word_dict["Christians"].extend(row["Synonyms"])

    word_dict["Warm"].extend(cold_antonyms)
    word_dict["Cold"].extend(warm_antonyms)
    word_dict["Competence"].extend(incomp_antonyms)
    word_dict["Incompetence"].extend(comp_antonyms)


word_dict["Warm"] = list(set(word_dict["Warm"]))
word_dict["Cold"] = list(set(word_dict["Cold"]))
word_dict["Competence"] = list(set(word_dict["Competence"]))
word_dict["Incompetence"] = list(set(word_dict["Incompetence"]))


df.head()

['Incompetence' 'Cold' 'Warm' 'Jews' 'Competence' 'Christians']


Unnamed: 0,Category,Word,Synonyms,Antonyms
0,Incompetence,unskilled,"[humble, unskilled, bad, hopeless, unprofessio...",[skilled]
1,Cold,untrustworthy,"[unreliable, fly-by-night, slippery, devious, ...",[trustworthy]
2,Warm,warm,"[loving, enthusiastic, warming, warm, near, em...",[cool]
3,Warm,friendly,"[social, warm, sociable, couthie, affable, com...","[unfriendly, hostile]"
4,Jews,Jewish,"[Jewish, jewish]",[]


In [1177]:
df.to_csv("Processed_vocab_dictionary.csv", index=False)
with open('Stereotype_Dictionary.json', 'w') as fp:
    json.dump(word_dict, fp)

pprint.pprint(dict(word_dict))


{'Christians': ['Christians',
                'Protestant',
                'complaining',
                'protestant',
                'Catholic',
                'catholic',
                'broad-minded',
                'christlike',
                'christianly',
                'Christian',
                'christian'],
 'Cold': ['unpatriotic',
          'recreant',
          'fly-by-night',
          'unforgiving',
          'refrigerant',
          'unpleasant',
          'faithless',
          'unfriendly',
          'selfish',
          'acold',
          'deceitful',
          'passionless',
          'discourage',
          'unvoiced',
          'ignoble',
          'neutral',
          'cerebral',
          'merciless',
          'undependable',
          'undesirable',
          'corrupt',
          'loud',
          'inhumane',
          'normal',
          'dishonorable',
          'base',
          'weak',
          'uncordial',
          'dishonest',
          'impot

# For GloVe

## Pre-post corpus dictionary

In [1178]:
# result = glob.glob('../raw/Full/*.csv')
# result.sort()

# result_dict = {}

# result_dict["pre"] = {}
# result_dict["post"] = {}

# response_pre = []
# response_deduped_pre = []

# for r in result[:324]:
#     print(r)
#     df = pd.read_csv(r, lineterminator='\n')
#     for row in df['lemma']:
#         row = ast.literal_eval(row)
#         for word in row:
#             if word != " ":
#                 response_pre.append(word)
                
# # Create deduped list to get length of unique words
# for token in tqdm(response_pre):
#     if token not in response_deduped_pre:
#         response_deduped_pre.append(token)
            
# result_dict["pre"]["corpus"] = response_pre
# result_dict["pre"]["length"] = len(response_deduped_pre)

# with open("corpus_prepost.json", "w") as outfile:
#     json.dump(result_dict, outfile)


In [1179]:
# f = open("corpus_prepost.json")
# result_dict = json.load(f)
# f.close()

# response_post = []
# response_deduped_post = []

# for r in tqdm(result[324:]):
#     df = pd.read_csv(r, lineterminator='\n')
#     for row in df['lemma']:
#         row = ast.literal_eval(row)
#         for word in row:
#             if word != " ":
#                 response_post.append(word)

# # Create deduped list to get length of unique words
# for token in tqdm(response_post):
#     if token not in response_deduped_post:
#         response_deduped_post.append(token)
            
# result_dict["post"]["corpus"] = response_post
# result_dict["post"]["length"] = len(response_deduped_post) 

# with open("corpus_prepost.json", "w") as outfile:
#     json.dump(result_dict, outfile)


## Daily full corpus dictionary

In [1180]:
# result = glob.glob('../raw/Full/*.csv')

# result_dict = {}
# for r in result:
#     df = pd.read_csv(r, lineterminator='\n')
#     response = []
#     response_deduped = []
#     for row in df['lemma']:
#         row = ast.literal_eval(row)
#         for word in row:
#             if word != " ":
#                 response.append(word)
#     # Create deduped list to get length of unique words
#     for token in response:
#         if token not in response_deduped:
#             response_deduped.append(token)
#     key = re.sub(".csv$", "", r)
#     key = key[-19:]
#     result_dict[key] = {}
#     result_dict[key]["corpus"] = response
#     result_dict[key]["length"] = len(response_deduped)        

# with open("corpus_full.json", "w") as outfile:
#     json.dump(result_dict, outfile)


## Daily Asian corpus dictionary

In [1181]:
# result = glob.glob('../../raw/consolidated/Asian/*.csv')

# result_dict_asian = {}
# for r in result:
#     df = pd.read_csv(r, lineterminator='\n')
#     response = []
#     response_deduped = []
#     for row in df['lemma']:
#         row = ast.literal_eval(row)
#         for word in row:
#             if word != " ":
#                 response.append(word)
#     # Create deduped list to get length of unique words
#     for token in response:
#         if token not in response_deduped:
#             response_deduped.append(token)
#     key = re.sub(".csv$", "", r)
#     key = key[-19:]
#     result_dict_asian[key] = {}
#     result_dict_asian[key]["corpus"] = response
#     result_dict_asian[key]["length"] = len(response_deduped)        

# with open("corpus_asian.json", "w") as outfile:
#     json.dump(result_dict_asian, outfile)


## Daily COVID corpus dictionary

In [1182]:
# result = glob.glob('../../raw/consolidated/COVID/*.csv')

# result_dict_covid = {}
# for r in result:
#     df = pd.read_csv(r, lineterminator='\n')
#     response = []
#     response_deduped = []
#     for row in df['lemma']:
#         row = ast.literal_eval(row)
#         for word in row:
#             if word != " ":
#                 response.append(word)
#     # Create deduped list to get length of unique words
#     for token in response:
#         if token not in response_deduped:
#             response_deduped.append(token)
#     key = re.sub(".csv$", "", r)
#     key = key[-19:]
#     result_dict_covid[key] = {}
#     result_dict_covid[key]["corpus"] = response
#     result_dict_covid[key]["length"] = len(response_deduped)        

# with open("corpus_covid.json", "w") as outfile:
#     json.dump(result_dict_covid, outfile)


## Config dictionaries

In [1183]:
# config_directory = {}

# config = {"device": "cpu",
#           "window_size": 15,
#           "num_partitions": 15,
#           "x_max": 10,
#           "alpha": 0.75,
#           "batch_size": 32,
#           "num_epochs": 10,
#           "embedding_size": 50}

# for key in result_dict.keys():
#     config["input_filepath"] = f"{key}.txt"
#     config["output_filepath"] = f"output/{key}.pkl"
#     config["vocab_size"] = result_dict[key]["length"]
#     config["chunk_size"] = result_dict[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}.hdf5"
#     config_directory[key] = config

# with open("config_prepost.json", "w") as outfile:
#     json.dump(config_directory, outfile)

# config_directory = {}

# for key in result_dict.keys():
#     config["input_filepath"] = f"{key}.txt"
#     config["output_filepath"] = f"output/{key}.pkl"
#     config["vocab_size"] = result_dict[key]["length"]
#     config["chunk_size"] = result_dict[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}.hdf5"
#     config_directory[key] = config

# with open("config_full.json", "w") as outfile:
#     json.dump(config_directory, outfile)

# config_directory_asian = {}

# for key in result_dict_asian.keys():
#     config_directory_asian[key] = {}
#     config["input_filepath"] = f"{key}.csv"
#     config["output_filepath"] = f"output/{key}.pkl"
#     config["vocab_size"] = result_dict_asian[key]["length"]
#     config["chunk_size"] = result_dict_asian[key]["length"]
#     config["cooccurrence_dir"] = f"cooccurrence_directory/{key}.pkl"
#     config["hdf5_file"] = f"hdf5_directory/{key}.hdf5"
#     config_directory_asian[key] = config

# with open("config_asian.json", "w") as outfile:
#     json.dump(config_directory_asian, outfile)
    

# Stereotypes dictionary

In [1184]:
# stereotypes_df = pd.read_csv("Kurdi et al./Kurdi, Mann, Charlesworth, & Banaji (2018) Vectors.csv")
# stereotypes_df = stereotypes_df.groupby('category')['word'].apply(list).to_dict()

# stereotypes = {}
# keywords = ['Cold', 'Warm', 'Competence', 'Incompetence']
# for key in stereotypes_df.keys():
#     if key in keywords:
#         word_list = stereotypes_df[key]
#         word_list = get_synonyms(word_list)
        
#         stereotypes[key] = []
#         for word in word_list:
#             if word not in stereotypes[key]:
#                 stereotypes[key].append(word)

# stereotypes

In [1185]:
### Stereotypes dict

# stereotypes = {"Cold": ['cold', 'deceitful', 'dishonest', 'disloyal', 'hateful', 'hostile', 'mean', 'selfish', 
#                         'unfriendly', 'untrustworthy', 'vicious', 'unsociable', 'unprincipled', 'disagreeable', 
#                         'egoistic', 'egotistic', 'unkindly', 'unloving', 'inhumane', 'crooked', 'dishonorable', 
#                         'insincere', 'deceptive', 'thieving', 'corrupt', 'abominable', 'inhospitable', 'ignoble', 
#                         'stingy', 'contemptible', 'inconsiderate', 'self-serving', 'uncongenial', 'uncordial', 
#                         'unneighborly', 'devious', 'evil', 'condemnable', 'malicious', 'unsocial', 'antisocial', 
#                         'ungregarious', 'harsh', 'ill-natured', 'unkind'],
               
#                "Warm": ['warm', 'agreeable', 'dependable', 'reliable', 'friendly', 'good-natured', 'kind', 'nice', 
#                         'sincere', 'honest', 'supportive', 'trustworthy', 'helpful', 'sociable', 'guileless', 
#                         'enthusiastic', 'consistent', 'authentic', 'amicable', 'congenial', 'gracious', 'hospitable', 
#                         'affable', 'neighborly', 'pleasant', 'amiable', 'considerate', 'charitable', 'gentle', 
#                         'kindhearted', 'forgiving', 'good', 'respectable', 'decent', 'polite', 'courteous', 'genuine', 
#                         'earnest', 'honorable', 'unpretentious', 'truthful', 'encouraging', 'accommodating', 
#                         'cooperative', 'extroverted'],
               
#                "Competent": ['able', 'capable', 'competent', 'confident', 'efficient', 'intelligent', 'proficient', 
#                              'qualified', 'skilled', 'skillful', 'smart', 'motivated', 'persistent', 'resourceful', 
#                              'effective', 'self-assured', 'certain', 'businesslike', 'cost-efficient', 'expeditious', 
#                              'streamlined', 'precocious', 'agile', 'brainy', 'bright', 'quick', 'sophisticated', 
#                              'reasonable', 'rational', 'adept', 'technical', 'well-qualified', 'experienced', 
#                              'accomplished', 'delicate', 'sure-handed', 'versatile', 'precise', 'astute', 'streetwise', 
#                              'fastidious', 'driven', 'unforgettable', 'stubborn', 'dogged'],
               
#                "Incompetent": ['dumb', 'foolish', 'helpless', 'ignorant', 'incompetent', 'inefficient', 'inept', 
#                                'clumsy', 'uncertain', 'unintelligent', 'unqualified', 'unskilled', 'disorganized', 
#                                'stupid', 'dense', 'inarticulate', 'asinine', 'unwise', 'powerless', 'hopeless', 
#                                'dependent', 'uneducated', 'uninformed', 'feckless', 'ineffective', 'bungling', 'bad', 
#                                'inadequate', 'incapable', 'awkward', 'maladroit', 'gawky', 'unpredictable', 'unreliable', 
#                                'retarded', 'brainless', 'ineligible', 'quack', 'inexperienced', 'weak', 'unprofessional', 
#                                'amateurish', 'unsystematic', 'chaotic', 'unmethodical'],
               
#                "Foreign": ['foreign', 'alien', 'immigrant', 'extraneous', 'un-american', 'unpatriotic'],
               
#                "Diseased": ['diseased', 'dirty', 'poisonous', 'contagious', 'ill']}

# keywords = ['Asians', 'Whites', 'Jews']
# for key in stereotypes_df.keys():
#     if key in keywords:
#         if key not in stereotypes.keys():
#             stereotypes[key] = []
#         for word in stereotypes_df[key]:
#             word = word.lower()
#             if word not in stereotypes[key]:
#                 stereotypes[key].append(word)
        
# with open("stereotypes.json", "w") as outfile:
#     json.dump(stereotypes, outfile)
                

In [1186]:
# ### Maybe just group positive-negative - use textblob to get polarity
# full = []

# for stereotype in stereotypes.keys():
#     for word in stereotypes[stereotype]:
#         if word not in full:
#             full.append(word)

# positive = get_positive(full)
# negative = get_negative(full)

# revised['negative'] = negative
# revised['positive'] = positive

# # stereotype_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in revised.items() ]))
# # stereotype_df.to_csv("stereotypes.csv", index=False)

# NYT keywords

In [1187]:
# f = open("nyt.json")
# results = json.load(f)
# f.close()

# keywords_china = ['Wuhan (China)', 'China', 'Beijing (China)', 'Communist Party of China', 'Xi Jinping', 'Hubei Province (China)', 
#             'Yunnan Province (China)', 'Anhui (China)', 'Shandong Province (China)', 'Chinese Center for Disease Control and Prevention', 
#             'Shouguang (China)', 'Sichuan Province (China)', 'Shanghai (China)', 'Chinese Centers for Disease Control and Prevention', 
#             'Hainan Island (China)', 'Shenzhen (China)', 'Xinhua', 'Tianjin (China)', 'Zhejiang Province (China)', 
#             "National People's Congress (China)", 'Chengdu (China)', 'Hangzhou (China)', 'Yichang (China)', 
#             'Communist Youth League (China)', 'Zuoling (China)', 'Guangzhou (China)', 'Tibet', 'Xinjiang (China)', 
#             'Beijing News, The', 'Cyberspace Administration of China', "Ministry of Public Security of the People's Republic of China", 
#             'China Daily', 'China Radio International', 'Uighurs (Chinese Ethnic Group)', 'China Central Television', 
#             'Zhang, Wei (Epidemiologist)', 'Lunar New Year', 'Hefei (China)', 'Harbin (China)', 'Henan Province (China)', 
#             'Tiananmen Square (Beijing)', 'Hmong Tribe', 'Guangxi (China)', 'National Bureau of Statistics (China)', 
#             'Xingcheng (China)', 'Hotan (China)', "Ministry of State Security of the People's Republic of China", 
#             'Xuzhou (China)', 'Urumqi (China)', 'Kashgar (China)', 'Hong Kong Protests (2019)', 'Mao Zedong', 
#             'Chinese Academy of Sciences', 'Gansu Province (China)', "People's Bank of China", 'Changmingzhen (China)', 
#             'Wuhan Institute of Virology (China)', 'Chinese-Americans', 'Chinatown (Manhattan, NY)', 
#             'Museum of Chinese in America', 'Far East, South and Southeast Asia and Pacific Areas', 'East Asia', 
#             'Central Asia']

# keywords_asia = ['Wuhan (China)', 'China', 'Beijing (China)', 'Communist Party of China', 'Xi Jinping', 'Hubei Province (China)', 
#             'Yunnan Province (China)', 'Anhui (China)', 'Shandong Province (China)', 'Chinese Center for Disease Control and Prevention', 
#             'Shouguang (China)', 'Sichuan Province (China)', 'Shanghai (China)', 'Chinese Centers for Disease Control and Prevention', 
#             'Hainan Island (China)', 'Shenzhen (China)', 'Xinhua', 'Tianjin (China)', 'Zhejiang Province (China)', 
#             "National People's Congress (China)", 'Chengdu (China)', 'Hangzhou (China)', 'Yichang (China)', 
#             'Communist Youth League (China)', 'Zuoling (China)', 'Guangzhou (China)', 'Tibet', 'Xinjiang (China)', 
#             'Beijing News, The', 'Cyberspace Administration of China', "Ministry of Public Security of the People's Republic of China", 
#             'China Daily', 'China Radio International', 'Uighurs (Chinese Ethnic Group)', 'China Central Television', 
#             'Zhang, Wei (Epidemiologist)', 'Lunar New Year', 'Hefei (China)', 'Harbin (China)', 'Henan Province (China)', 
#             'Taiwan', 'Taipei (Taiwan)', 'Taoyuan (Taiwan)', 'Tiananmen Square (Beijing)', 'Hmong Tribe', 'Guangxi (China)', 
#             'National Bureau of Statistics (China)', 'Xingcheng (China)', 'Hotan (China)', "Ministry of State Security of the People's Republic of China", 
#             'Xuzhou (China)', 'Chinese Nationalist Party (Taiwan)', 'Urumqi (China)', 'Kashgar (China)', 'Hong Kong Protests (2019)', 
#             'Mao Zedong', 'Chinese Academy of Sciences', 'Gansu Province (China)', "People's Bank of China", 'Changmingzhen (China)', 
#             'Wuhan Institute of Virology (China)', 'Chinese-Americans', 'Chinatown (Manhattan, NY)', 'Museum of Chinese in America', 
#             'Asian-Americans', 'Indian-Americans', 'Vietnamese-Americans', 'Asian-Americans (TV Program)', 'Korean-Americans', 
#             'Bangladeshi-Americans', 'Far East, South and Southeast Asia and Pacific Areas', 'Southeast Asia', 'East Asia', 
#             'Central Asia', 'Japan', 'Nara (Japan)', 'Kanazawa (Japan)', 'South Korea', 'Seoul (South Korea)', 
#             'Jeju Island (South Korea)', 'Daegu (South Korea)', 'Thailand', 'Bangkok (Thailand)', 'Chiang Mai (Thailand)', 
#             'Lopburi (Thailand)', 'Phuket (Thailand)', 'Singapore', 'Indonesia', 'Bali (Indonesia)', 'Tomohon (Indonesia)', 
#             'Sulawesi (Indonesia)', 'Java (Indonesia)', 'Surabaya (Indonesia)', 'Maluku Islands (Indonesia)', 'Cambodia', 
#             'Sihanoukville (Cambodia)', 'Myanmar', 'Yangon (Myanmar)', 'Mandalay (Myanmar)', 'Philippines', 'Manila (Philippines)', 
#             'Vietnam', 'Ho Chi Minh City (Vietnam)', 'Cam Ranh Bay (Vietnam)', 'North Korea', 'Kaesong (North Korea)', 
#             'Laos', 'Malaysia', 'Macau', 'Mongolia', 'Nepal', 'Kathmandu (Nepal)', 'Sri Lanka', 'Bangladesh', 'Karachi (Pakistan)', 
#             'Bhutan','India', 'Bharatiya Janata Party', 'New Delhi (India)', 'Kerala (India)', 'Rajasthan (India)', 
#             'Uttar Pradesh State (India)', 'Delhi (India)', 'Mumbai (India)', 'Kashmir and Jammu (India)', 'Jaipur (India)', 
#             'Kashmir Valley (Kashmir and Jammu)',  'Odisha (India)', 'Karnataka (India)', 'Maharashtra (India)', 
#             'Bay of Bengal', 'Gujarat State (India)', 'Kolkata (India)', 'Bihar (India)', 'Srinagar (Jammu and Kashmir)',
#             'AHMEDABAD (INDIA)', 'Himalayas', 'Ladakh (India)', 'Noida (India)', 'Darjeeling (India)', 'Serum Institute of India', 
#             'Punjab (India)', 'Andhra Pradesh (India)', 'Tamil Nadu (India)', 'Tripura (India)', 'Agartala (India)', 
#             'West Bengal (India)', 'Dharamsala (India)', 'Nashik (India)', 'Bhopal (India)', 'Goa (India)', 'Pune (India)', 
#             'Public Health Foundation of India']
        
# keywords_political = ['Trump, Donald J', 'Conservative Political Action Conference', 'Republican Party', 
#                       'Republican National Committee', 'Republican National Convention', 'Democratic Party', 
#                       'Democratic National Committee', 'Democratic National Convention']

# nyt_df = {}
# for result in results.keys():
#     if result not in nyt_df.keys():
#         nyt_df[result] = {}
#         nyt_df[result]['total'] = len(results[result])
#         chinese = 0
#         asian = 0
#         political = 0
#         if len(results[result]) > 0:
#             for article in results[result]:
#                 chinese_article = 0
#                 asian_article = 0
#                 political_article = 0
#                 for keyword in article['keywords']:
#                     if keyword['value'] in keywords_china:
#                         chinese_article += 1
#                     if keyword['value'] in keywords_asia:
#                         asian_article += 1
#                     if keyword['value'] in keywords_political:
#                         political_article += 1
#                 if chinese_article > 0:
#                     chinese += 1
#                 if asian_article > 0:
#                     asian += 1
#                 if political_article > 0:
#                     political += 1
#         nyt_df[result]['chinese'] = chinese
#         nyt_df[result]['asian'] = asian
#         nyt_df[result]['political'] = political

In [1188]:
# df_nyt = pd.DataFrame.from_dict({(i): nyt_df[i] 
#                                  for i in nyt_df.keys()},
#                                 orient='index')
# df_nyt.reset_index(inplace=True)
# df_nyt = df_nyt.rename(columns = {'index':'date'})
# df_nyt = df_nyt.sort_values(by=['date'], ignore_index=True)


# df_nyt.head()

# df_nyt.to_csv("df_nyt.csv", index=False)


# For LIWC

In [1189]:
# # List all files in the BERT folder
# results = glob('../raw/Full/*.csv')
# results.sort()

# f = open("corpus_full.json")
# corpus_full = json.load(f)
# f.close()

# full_raw = {}

# for r in results:
#     day = r[-14:-4]
#     print(day)
#     if day not in full_raw.keys():
#         full_raw[day] = {}
#     full_raw[day]['sentence'] = []
#     full_raw[day]['user'] = []
    
#     df = pd.read_csv(r, lineterminator='\n')
#     df = df[df.lemma_length > 0]
    
#     corpus = df.text.tolist()
#     users = df.author_id.tolist()
    
#     for user, sentence in zip(users, corpus):
#         sentence = preprocess(sentence)
#         full_raw[day]['sentence'].append(sentence)
#         full_raw[day]['user'].append(user)
        

In [1190]:
# full_raw_df = pd.DataFrame.from_dict({(i): full_raw[i]
#                                       for i in full_raw.keys()}, 
#                                      orient="index")

# full_raw_df = full_raw_df.explode(['sentence', 'user'])
# full_raw_df = full_raw_df.reset_index()
# full_raw_df.columns = ["day", "sentence", "user"]

# full_raw_df.head()

In [1191]:
# full_raw_df.to_csv("LIWC_df.csv", index=False)

# After LIWC

In [1192]:
LIWC_df_results = pd.read_csv("LIWC_df_results.csv", lineterminator='\n')

LIWC_df_results.head()

FileNotFoundError: [Errno 2] No such file or directory: 'LIWC_df_results.csv'

In [None]:
pattern = re.compile("^[0-9]+-[0-9]+-[0-9]+$")
index = [bool(pattern.match(string)) for string in LIWC_df_results.day.tolist()]

set(index)

In [None]:
LIWC_df_results = LIWC_df_results[index]
LIWC_df_results.reset_index(drop=True)

LIWC_df_results.head()

In [None]:
LIWC_df_results.to_csv("LIWC_df_results_cleaned.csv", index=False)