In [123]:
#Compute disparity between word-pairs identified for each entity reference

from nltk.corpus import wordnet as wn
import spacy
import sys
import math

sys.stdout.write('Loading SpaCy model...\n')
lg = spacy.load('en_core_web_lg')
sys.stdout.write('Loaded.\n')

Loading SpaCy model...
Loaded.


In [132]:
def get_pickle_data(filepath):
    
    import pickle
    ent_clusters = None
    with open(filepath, 'rb') as f:
        ent_clusters = pickle.load(f)
    return ent_clusters

def save_pickle_data(filepath, data):
    
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)

def merge_case_sensitive_keys(d):
    
    d2 = dict()
    for key in d:
        key_low = key.lower()
        if(key_low in d2):
            d2[key_low] += d[key]        
        else:
            d2[key_low] = d[key]
            
    return d2

def process_dict(d):

    d2 = dict()
    for key in d:
        d[key]['adjectives'] = merge_case_sensitive_keys(d[key]['adjectives'])
        
    return d

def get_candidate_adj_counts(ent_adj_media_1, ent_adj_media_2, entity, MAX_FREQUENCY, MIN_FREQUENCY):
    
    media_1_adj_freq = ent_adj_media_1[entity]['adjectives']
    media_2_adj_freq = ent_adj_media_2[entity]['adjectives']
    
    combined_adj_freq = media_1_adj_freq.copy()
    for word in media_2_adj_freq:
        if word in combined_adj_freq:
            combined_adj_freq[word] += media_2_adj_freq[word]
        else:
            combined_adj_freq[word] = media_2_adj_freq[word]

    # keep words based on frequency
    candidate_adj_freq_dict = {}
    for word in combined_adj_freq:
        if combined_adj_freq[word] >= MIN_FREQUENCY and combined_adj_freq[word] <= MAX_FREQUENCY:
            candidate_adj_freq_dict[word] = combined_adj_freq[word]
    
    return candidate_adj_freq_dict, media_1_adj_freq, media_2_adj_freq

def get_word_synonym_pairs(word_list):
    word_pairs = []
    for word in word_list:
        for syn in wn.synsets(word):
            for l in syn.lemmas():
                if l.name() == word:
                    continue
                if l.name() in word_list:
                    word_pairs.append(sorted([word, l.name()]))

    word_pairs = [t for t in (set(tuple(wp) for wp in word_pairs))] 

    return word_pairs

def get_disparity(word_pairs, positive_counts, negative_counts, disparity_threshold):
    
    tool = []
    for word1, word2 in word_pairs:
        f_pos_1 = float(positive_counts[word1]) if word1 in positive_counts else 0.1
        f_pos_2 = float(positive_counts[word2]) if word2 in positive_counts else 0.1
        f_neg_1 = float(negative_counts[word1]) if word1 in negative_counts else 0.1
        f_neg_2 = float(negative_counts[word2]) if word2 in negative_counts else 0.1

        disparity1 = f_pos_1 / f_neg_1
        disparity2 = f_neg_2 / f_pos_2
        disparity = f_pos_1 * f_neg_2 / f_neg_1 / f_pos_2

        if disparity < 1.0:
            # word 1 is negative
            tool.append((word2, 1.0/disparity2, word1, 1.0/disparity1))
        else:
            tool.append((word1, disparity1, word2, disparity2))

    tool.sort(key=lambda x: x[1]*x[3], reverse=True)

    final_pairs = []
    for tp in tool:
        if tp[1]>=disparity_threshold and tp[3]>=disparity_threshold:
            final_pairs.append(tp)
            
    return final_pairs
    
def get_result_pairs(media_1_file, media_2_file, max_freq, min_freq, disparity_threshold):

    entity_adj_pairs_media_1 = get_pickle_data(media_1_file)
    entity_adj_pairs_media_2 = get_pickle_data(media_2_file)
    
    entity_adj_pairs_media_1 = process_dict(entity_adj_pairs_media_1)
    entity_adj_pairs_media_2 = process_dict(entity_adj_pairs_media_2)
    
    entities1 = list(entity_adj_pairs_media_1.keys())
    entities2 = list(entity_adj_pairs_media_2.keys())
    combined_entities = list(set(entities1) & set(entities2)) 

    print("Number of entities in Media 1 ", len(entities1))
    print("Number of entities in Media 2 ", len(entities2))
    print("Number of entities common in both ", len(combined_entities))

    ent_word_pairs = dict()
    for entity in combined_entities:
        cluster_ent = entity_adj_pairs_media_1[entity]['person_entities']
        candidate_adj_freq_dict, media_1_adj_freq, media_2_adj_freq = get_candidate_adj_counts(entity_adj_pairs_media_1, entity_adj_pairs_media_2, entity, MAX_FREQUENCY, MIN_FREQUENCY)
        if(len(candidate_adj_freq_dict) > 25):
            word_list = list(candidate_adj_freq_dict.keys())
            word_pairs = get_word_synonym_pairs(word_list)
            final_pairs = get_disparity(word_pairs, media_1_adj_freq, media_2_adj_freq, disparity_threshold)
            ent_word_pairs[entity] = {'adj_pairs':final_pairs, 'entity_cluster':cluster_ent}
            
    return ent_word_pairs

def get_entity_results(word_pair_dict, entity):
    
    ent_adj_pair = dict()    
    for key in word_pair_dict:
        entity_cls = word_pair_dict[key]['entity_cluster']
        if(any(True for ent in entity_cls if entity in ent)):
            ent_adj_pair[key] = word_pair_dict[key]
        
    return ent_adj_pair

# l = {2:{'per':"amanul", 'adjectives': {"A":1, "a":3, "Ca": 3, "ca":1}}, 3:{'per':"aamanul", 'adjectives': {"A":4, "a":3, "Ca": 9, "ca":1}}}
# print(process_dict(l))

In [134]:
max_freq = 500 # out of 10000
min_freq = 2 # out of 10000
disparity_threshold = 1.0

media_1_file = "Nammed_entities/ent_adj_pairs_Breitbart.pickle"
media_2_file = "Nammed_entities/ent_adj_pairs_NYT.pickle"
media_3_file = "Nammed_entities/ent_adj_pairs_BI.pickle"
media_4_file = "Nammed_entities/ent_adj_pairs_CNN.pickle"

ent_word_pairs = get_result_pairs(media_1_file, media_2_file, max_freq, min_freq, disparity_threshold)

ent_adj_p = get_entity_results(ent_word_pairs, "Trump")
print(len(ent_adj_p[1045]['adj_pairs']))
ent_adj_p

Number of entities in Media 1  1015
Number of entities in Media 2  1393
Number of entities common in both  693
546


{146: {'adj_pairs': [], 'entity_cluster': ['Eric Trump']},
 519: {'adj_pairs': [('young', 2.0, 'new', 1.0)],
  'entity_cluster': ['Melania Trump']},
 1045: {'adj_pairs': [('upcoming', 340.0, 'coming', 30.0),
   ('deplorable', 110.0, 'reprehensible', 40.0),
   ('alien', 110.0, 'alienated', 40.0),
   ('alien', 110.0, 'exotic', 20.0),
   ('exact', 109.99999999999999, 'accurate', 20.0),
   ('cowardly', 40.0, 'fearful', 50.0),
   ('despicable', 90.0, 'worthless', 20.0),
   ('greedy', 30.0, 'avid', 40.0),
   ('wild', 2.0, 'risky', 589.9999999999999),
   ('speculative', 2.0, 'risky', 589.9999999999999),
   ('southeast', 20.0, 'southeastern', 50.0),
   ('sublime', 20.0, 'lofty', 40.0),
   ('sorry', 40.0, 'regretful', 20.0),
   ('tantamount', 20.0, 'equivalent', 40.0),
   ('eminent', 20.0, 'lofty', 40.0),
   ('bad', 1.024390243902439, 'risky', 590.0),
   ('working', 30.0, 'functional', 20.0),
   ('fictional', 30.0, 'fabricated', 20.0),
   ('slippery', 20.0, 'tricky', 30.0),
   ('transnational',