In [1]:
import pandas as pd
import json
import pickle
from scipy.stats import spearmanr

In [2]:
def clean_json_emoji(string):
    #
    string = ASCII_to_emoji('\\U' + (string.lstrip('\\\\U')).lower())
    return string
        
def extract_pairs(emosim_json):
    # Converts EmoSim508 json file string to list of tuples: (emoji1, emoji2, similarity)
    # Returns 2 dictionaries
    # A dictionary 1 entry looks like this: 
    # emoji1 : [(emoji2, similarity),(other pairs),...]
    # A dictionary 2 entry looks like this:
    # (emoji1,emoji2) : EmoSim508_score
    pairs = {}
    pair_score_dict = {}
    for pair in emosim_json:
        emoji1 = pair['emojiPair']['emojiOne']['unicodelong']
        emoji2 = pair['emojiPair']['emojiTwo']['unicodelong']
        emoji1 = clean_json_emoji(emoji1)
        emoji2 = clean_json_emoji(emoji2)
        similarity = pair['emojiPairSimilarity']['Human_Annotator_Agreement']
        pair_score_dict[(emoji1,emoji2)] = similarity
        if emoji1 not in pairs: 
            pairs[emoji1] = [(emoji2,similarity)]
        else: 
            pairs[emoji1].append((emoji2,similarity))
        if emoji2 not in pairs: 
            pairs[emoji2] = [(emoji1,similarity)]
        else: 
            pairs[emoji2].append((emoji1,similarity))

    #Sort the rankings in the dictionary
    for emoji in pairs.keys():
        similar_emojis = pairs[emoji]
        sorted_emojis = sorted(similar_emojis, reverse=True,key=lambda tup: tup[1])
        pairs[emoji] =  sorted_emojis 

    return pairs, pair_score_dict    

def emoji_to_ASCII(string):
    # example input: '😀hi'
    # example output: '\\U0001f600hi'
    return string.encode('unicode-escape').decode('ASCII')

def ASCII_to_emoji(string):
    # example input: '\\U0001f600hi'
    # example output: '😀hi'
    try: 
        return string.encode('ASCII').decode('unicode-escape')
    except:
        return string

<h1><b>Data preparation</b></h1>

In [5]:
emosim_data = json.load(open('emoji_datasets/EmoSim508.json'))

In [6]:
#Dictionaries returned by extract_pairs()
emosim_rankings, emosim_pairs = extract_pairs(emosim_data)

#Emojis that occur in >10 pairs
common_emojis = [emoji for emoji,rankings  in emosim_rankings.items() if len(rankings) > 10] 
common_emojis_pretty = [ASCII_to_emoji(emoji) for emoji,rankings  in emosim_rankings.items() if len(rankings) > 10]

#>10 pairs emojis, but with their corresponding lists of paired emojis
# top_rankings = {ASCII_to_emoji(emoji):[(ASCII_to_emoji(emoji),value) for emoji,value in emosim_rankings[emoji]] for emoji in common_emojis}

# The PMI cosine similarity rankings provided in emoji_project.ipynb
PMI_rankings = pickle.load(open('./pickle_files/PMI_rankings.p','rb'))

#pair version of the PMI matrix (same format as emosim_pairs)
PMI_pairs = pickle.load(open('./pickle_files/PMI_tuples.p','rb'))
PMI_rankings = {emoji:[((emoji),value) for emoji,value in PMI_rankings[emoji]] for emoji in PMI_rankings.keys()}

In [7]:
def filter_rankings(all_emosim_rankings, all_PMI_rankings, target_emojis, truncate=None,include_ratings=False):
    '''
    input format for emosim_rankings and PMI_rankings:
    (target_emoji, [(emoji1,score1),(emoji2,score2),...]) Emojis are in unicode (example: '\\U0001F3B6')
    emosim_rankings will have variable length lists.
    PMI_rankings will have fixed 365 length lists (rank all emoji pairs)
    Idea 1: Filter the PMI list for a given emoji to be equal to the emosim list for that emoji (We did this one)
    Idea 2: Idea 1, but only for the top 10 emojis of the emosim list.
    Idea 3: Find a general set of emojis to use across all aligned lists (hard)
    Return: dictionary of the filtered emosim_rankings and PMI_rankings such that only the overlapping emojis are included.
    '''

    filtered_PMI = {emoji:[] for emoji in target_emojis}
    filtered_emosim = {emoji:[] for emoji in target_emojis}
    for target_emoji in target_emojis:
        if target_emoji in all_PMI_rankings:
            emosim_emojis = set(list(zip(*all_emosim_rankings[target_emoji]))[0])
            PMI_emojis = set(list(zip(*all_PMI_rankings[target_emoji]))[0])
            overlap = emosim_emojis.intersection(PMI_emojis)
            
            PMI_rankings = all_PMI_rankings[target_emoji]
            emosim_rankings = all_emosim_rankings[target_emoji]

            filtered_PMI[target_emoji] = sorted([(emoji,ranking) for emoji,ranking in PMI_rankings if emoji in overlap],
                                                reverse=True,key=lambda tup: tup[1])
            filtered_emosim[target_emoji] = sorted([(emoji,ranking) for emoji,ranking in emosim_rankings if emoji in overlap],
                                                   reverse=True,key=lambda tup: tup[1])
            
            if truncate != None: 
                filtered_PMI = {emoji : filtered_PMI[emoji][:truncate] for emoji in filtered_PMI.keys()}
                filtered_emosim = {emoji : filered_emosim[emoji][:truncate] for emoji in filered_emosim.keys()}
    
    if not include_ratings:
        filtered_PMI = {ASCII_to_emoji(emoji):[(ASCII_to_emoji(emoji)) for emoji,value in filtered_PMI[emoji]] for emoji in filtered_PMI.keys()}
        filtered_emosim = {ASCII_to_emoji(emoji):[(ASCII_to_emoji(emoji)) for emoji,value in filtered_emosim[emoji]] for emoji in filtered_emosim.keys()}
        return filtered_PMI, filtered_emosim
    else:
        return filtered_emosim,filtered_PMI

def emosim_spearman(filtered_emosim_rankings, filtered_PMI_rankings):
    # Input must be dictionaries with the ranked lists of emoji chars (without the ratings).
    # Returns dictionary of the following format 
    # emoji : spearman_score 
    # Where emoji is an emoji and spearman score is the spearmanr score of emosim and PMI matrix ranking lists
    # which rank similarity of the emoji with the other emojis.
    target_emojis = filtered_emosim_rankings.keys()
    spearman_scores = {target_emoji : None for target_emoji in target_emojis}
    for emoji in target_emojis:
        emosim_ranking = filtered_emosim_rankings[emoji]
        PMI_ranking = filtered_PMI_rankings[emoji]
        spearman_scores[emoji] = spearmanr(emosim_ranking,PMI_ranking)
    return spearman_scores

def sorted_pair_rankings(emosim_tuples_dict, PMI_tuples_dict, include_score=False):
    # Inputs are the 2 dictionaries of emoji pair tuples entries and similarity score values
    # Outputs 2 respective lists of emoji pair tuples sorted in descending order of similarity
    # include_score set to True will include the similarity score as a third index of each pair's tuple. 
    PMI_tuples_dict = {pair : value for pair,value in PMI_tuples_dict.items() if pair in emosim_tuples_dict}
    
    if include_score:
        sorted_emosim_pairs = sorted([((pair),ranking) for pair,ranking in emosim_tuples_dict.items() if pair in PMI_tuples_dict],
                                                   reverse=True,key=lambda tup: tup[1])
        sorted_PMI_pairs = sorted([((pair),ranking) for pair,ranking in PMI_tuples_dict.items()],
                                                       reverse=True,key=lambda tup: tup[1])
        return sorted_emosim_pairs,sorted_PMI_pairs

    sorted_emosim_pairs = list(zip(*sorted([(str(pair),ranking) for pair,ranking in emosim_tuples_dict.items() if pair in PMI_tuples_dict],
                                                   reverse=True,key=lambda tup: tup[1])))[0]
    sorted_PMI_pairs = list(zip(*sorted([(str(pair),ranking) for pair,ranking in PMI_tuples_dict.items()],
                                                   reverse=True,key=lambda tup: tup[1])))[0]
    
    return sorted_emosim_pairs,sorted_PMI_pairs
    

<b><h1>Spearman Ranking</h1></b>

In [8]:
#emosim and pmi dictionary lists filtered so that only overlapping emojis are included
filtered_emosim,filtered_PMI = filter_rankings(emosim_rankings,PMI_rankings,common_emojis, include_ratings=False)

#emosim and pmi ranked lists of emoji pair tuples (no scores). 
sorted_emosim_pairs,sorted_PMI_pairs = sorted_pair_rankings(emosim_pairs,PMI_pairs)
#emosim and pmi ranked lists of emoji pair tuples (scores included in third index). 
sorted_emosim_scores,sorted_PMI_scores = sorted_pair_rankings(emosim_pairs,PMI_pairs,include_score=True) 

In [32]:
for emoji,value in filtered_PMI.items():
    print(emoji,len(value))

🎶 10
🎉 19
😊 24
❤ 47
💕 30
😍 53
💙 16
💖 16
😂 62
😘 32
😭 23
😩 11
👍 23
👌 12
✨ 24
👉 27
👏 14
🙈 12
💯 29
🙌 14
🙏 16
🔥 25
👇 11


In [39]:
#Top 10 most similar pairs in EmoSim508 that overlap between the two sets of pairs.
sorted_emosim_scores

[(('🎵', '🎶'), 4.0),
 (('🎊', '🎉'), 3.95),
 (('☺', '😊'), 3.9),
 (('❤', '💞'), 3.85),
 (('💕', '❤'), 3.85),
 (('💞', '💕'), 3.85),
 (('💘', '💕'), 3.85),
 (('💕', '😍'), 3.8),
 (('💜', '💙'), 3.8),
 (('💗', '❤'), 3.8),
 (('💗', '💕'), 3.8),
 (('❤', '💓'), 3.8),
 (('🎤', '🎶'), 3.75),
 (('💖', '💕'), 3.75),
 (('💕', '💓'), 3.75),
 (('❤', '💙'), 3.7),
 (('💙', '💚'), 3.7),
 (('❤', '💜'), 3.7),
 (('💛', '💚'), 3.7),
 (('❤', '💖'), 3.7),
 (('❤', '💚'), 3.7),
 (('💛', '❤'), 3.7),
 (('💜', '💚'), 3.7),
 (('💛', '💜'), 3.7),
 (('💞', '💖'), 3.7),
 (('💗', '💖'), 3.65),
 (('🎧', '🎶'), 3.6),
 (('😍', '💖'), 3.55),
 (('💕', '💜'), 3.55),
 (('🎼', '🎶'), 3.55),
 (('❤', '😍'), 3.5),
 (('😍', '💞'), 3.5),
 (('🎧', '🎤'), 3.45),
 (('💘', '❤'), 3.45),
 (('🎈', '🎉'), 3.4),
 (('😅', '😂'), 3.4),
 (('💙', '💕'), 3.4),
 (('😍', '😘'), 3.4),
 (('😁', '😂'), 3.4),
 (('😭', '😩'), 3.35),
 (('💖', '😘'), 3.35),
 (('👍', '👌'), 3.35),
 (('😫', '😭'), 3.35),
 (('💙', '💛'), 3.35),
 (('😢', '😭'), 3.3),
 (('😘', '💋'), 3.3),
 (('💕', '😘'), 3.25),
 (('🎉', '🎁'), 3.2),
 (('💗', '😍'), 3.2),


In [36]:
#Top 10 most similar pairs in the PMI matrix that overlap between the two sets of pairs.
sorted_PMI_scores

[(('👉', '👈'), 0.036137509067646276),
 (('🙉', '👣'), 0.03581294829518604),
 (('👅', '💦'), 0.02584627642652371),
 (('🎧', '🎤'), 0.023046151039656567),
 (('🎂', '🎈'), 0.020556442745047393),
 (('🎈', '🎊'), 0.01955404145093862),
 (('🎁', '🎂'), 0.01932779322456689),
 (('🎈', '🎁'), 0.018330965842919433),
 (('🏆', '⚽'), 0.01761134677625526),
 (('👉', '🎥'), 0.011225910794570502),
 (('💛', '💚'), 0.010026107669210263),
 (('👋', '👊'), 0.00848841249968659),
 (('📷', '©'), 0.006619109356580569),
 (('💸', '💃'), 0.005719465672955916),
 (('🙊', '🙉'), 0.005476327813462278),
 (('🎈', '🎉'), 0.005154347722247269),
 (('💦', '🍆'), 0.004536749521573066),
 (('👉', '💞'), 0.003878208745781895),
 (('💫', '✨'), 0.003701701662619322),
 (('🎥', '👍'), 0.003616929279733717),
 (('💦', '👉'), 0.0033866611946148776),
 (('🎥', '‼'), 0.003210639005089327),
 (('💖', '👉'), 0.0031185077532060743),
 (('📢', '👣'), 0.0030511740627372722),
 (('🎊', '🎉'), 0.0029218929342181405),
 (('👍', '👉'), 0.0026798968273709237),
 (('☝', '😳'), 0.002679561648786506),
 (

In [11]:
emoji_spearman_scores = emosim_spearman(filtered_emosim,filtered_PMI)
tuple_spearman_scores = spearmanr(sorted_emosim_pairs,sorted_PMI_pairs)



<b><h2>Spearman Rank-Order Correlation between the pair rankings </h></b>

In [12]:
tuple_spearman_scores

SpearmanrResult(correlation=0.051822307788007414, pvalue=0.28138769732961466)

<h2><b>Spearman Rank-Order Correlation for individual emojis</b></h2>

In [40]:
emoji_spearman_scores

{'✨': SpearmanrResult(correlation=0.18608695652173912, pvalue=0.38396703397174836),
 '❤': SpearmanrResult(correlation=-0.10649861239592968, pvalue=0.47616292574023333),
 '🎉': SpearmanrResult(correlation=0.5614035087719299, pvalue=0.012380476816162053),
 '🎶': SpearmanrResult(correlation=-0.406060606060606, pvalue=0.24428229408662638),
 '👇': SpearmanrResult(correlation=-0.5, pvalue=0.11730680301423815),
 '👉': SpearmanrResult(correlation=-0.007326007326007325, pvalue=0.9710701433802329),
 '👌': SpearmanrResult(correlation=0.5734265734265735, pvalue=0.05126558978649144),
 '👍': SpearmanrResult(correlation=-0.025691699604743084, pvalue=0.9073666997030373),
 '👏': SpearmanrResult(correlation=-0.01978021978021978, pvalue=0.9464892274080114),
 '💕': SpearmanrResult(correlation=0.19555061179087874, pvalue=0.3003868597964004),
 '💖': SpearmanrResult(correlation=-0.2676470588235294, pvalue=0.3162544185483094),
 '💙': SpearmanrResult(correlation=-0.2676470588235294, pvalue=0.3162544185483094),
 '💯': Spe