In [48]:
import pandas as pd
import numpy as np
import emoji
import string
from nltk.tokenize import TweetTokenizer
from collections import defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse import csc_matrix
from collections import Counter
from math import log
from pprint import pformat
from scipy.sparse.linalg import svds
from sklearn.preprocessing import normalize

In [2]:
# Read the pre-formatted, partially preprocessed tweet dataset.
# header=None: no header row is consumed, so columns are numbered 0, 1, ...
tweets_and_labels_RAW = pd.read_csv(
    'emoji_datasets/all_data.csv',
    header=None,
    encoding='utf-8',
)

In [3]:
def separate_emojis(example):
    '''
        Split a token into alternating text / emoji pieces.

        example : a single token (string)

        Returns a list of pieces in their original order, where every
        character found in emoji.UNICODE_EMOJI becomes its own element.
        Returns [] when the token contains no such character, so the caller
        can fall back to the untouched token.

        The check is per character, so an emoji made of several code points
        is split into its individual characters.
        NOTE(review): assumes emoji.UNICODE_EMOJI is keyed directly by emoji
        character -- confirm against the installed `emoji` version.
    '''
    result = []
    ptr = 0  # start of the text run that has not been emitted yet
    for i, c in enumerate(example):
        if c in emoji.UNICODE_EMOJI:
            split = example[ptr:i]
            if split != '':
                result.append(split)
            result.append(c)
            ptr = i + 1
    # Keep the text that follows the last emoji (previously dropped).
    # Only done when at least one emoji was found, which preserves the
    # "no emoji -> []" contract.
    if result and ptr < len(example):
        result.append(example[ptr:])
    return result

def preprocess(data):
    '''
        Tokenise every tweet and split emoji out of the tokens.

        data : 2-D array indexed as data[:, 0] (labels) and data[:, 1] (tweets)

        Returns a list of (label, tokens) tuples, one per row. Tokens are
        lower-cased; a token for which separate_emojis() returns pieces is
        replaced by those pieces, otherwise it is kept as is.
    '''
    labels = list(data[:, 0])
    tweets = list(data[:, 1])
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

    processed = []
    for label, tweet in zip(labels, tweets):
        tokens_out = []
        for raw_token in tokenizer.tokenize(tweet):
            token = raw_token.lower()
            pieces = separate_emojis(token)
            if pieces != []:
                tokens_out.extend(pieces)
            else:
                tokens_out.append(token)
        processed.append((label, tokens_out))
    return processed

def find_all_emojis(data):
    '''
        Count how often each emoji token occurs across all tweets.

        data : iterable of entries whose element [1] is the token list

        Returns a defaultdict(int) mapping emoji -> occurrence count, keyed
        in order of first appearance. A token counts as an emoji when it is
        found in emoji.UNICODE_EMOJI.
    '''
    counts = defaultdict(int)
    for entry in data:
        tokens = entry[1]
        for token in tokens:
            if token not in emoji.UNICODE_EMOJI:
                continue
            counts[token] += 1
    return counts

def term_context_matrix(targets, data):
    '''
        Build tweet-level co-occurrence counts between targets and all tokens.

        targets : re-iterable collection of target words (hashable)
        data    : iterable of entries whose element [1] is the token list

        Returns a nested defaultdict: result[target][token] is the number of
        times `token` occurs in tweets that contain `target`. A tweet
        contributes once per target no matter how often the target repeats,
        and the target itself is counted as one of its own context tokens.
    '''
    tc_matrix = defaultdict(lambda: defaultdict(lambda: 0))
    for twt in data:
        tokens = twt[1]
        # Build the membership set once per tweet instead of scanning the
        # token list again for every target.
        present = set(tokens)
        for w1 in targets:
            if w1 in present:
                row = tc_matrix[w1]
                for w2 in tokens:
                    row[w2] += 1
    return tc_matrix

def vocab_map(dd):
    '''
        Assign a dense integer id to every context word of a nested mapping.

        dd : mapping of target -> mapping of context word -> count

        Returns a dict word -> id, with ids handed out from 0 in order of
        first appearance while walking the targets and their context words.
    '''
    index_of = {}
    for contexts in dd.values():
        for word in contexts.keys():
            if word in index_of:
                continue
            index_of[word] = len(index_of)
    return index_of

def term_to_int_dd(dd):
    '''
        Re-key a nested target -> context -> count mapping by integer
        coordinates.

        Row index is the position of the target in dd; column index comes
        from vocab_map(dd).

        Returns a defaultdict(int) mapping (row, col) -> count.
    '''
    col_index = vocab_map(dd)
    cells = defaultdict(int)
    for row_idx, (_, contexts) in enumerate(dd.items()):
        for ctx, count in contexts.items():
            cells[row_idx, col_index[ctx]] = count
    return cells

def term_to_sparse(dd):
    '''
        Convert a nested target -> context -> count mapping to a sparse
        matrix.

        Coordinates come from term_to_int_dd(dd). No shape is passed, so the
        matrix dimensions are inferred by coo_matrix from the indices.

        Returns a scipy.sparse.coo_matrix of counts.
    '''
    cells = term_to_int_dd(dd)
    values, row_ids, col_ids = [], [], []
    for (row, col), value in cells.items():
        values.append(value)
        row_ids.append(row)
        col_ids.append(col)
    return coo_matrix((values, (row_ids, col_ids)))



In [4]:
# Tokenise the raw tweets, tally the emoji tokens, and use the distinct
# emoji as the target terms for the co-occurrence statistics.
tweets_and_labels = preprocess(data=tweets_and_labels_RAW.values)
emoji_counts = find_all_emojis(data=tweets_and_labels)
emoji_targets = [*emoji_counts]

In [5]:
# Tweet-level co-occurrence counts: emoji target -> token -> count.
term_matrix = term_context_matrix(
    targets=emoji_targets,
    data=tweets_and_labels,
)

In [6]:
# Sparse (COO) view of the raw co-occurrence counts.
emoji_coo_matrix = term_to_sparse(dd=term_matrix)

In [7]:
def unigram_bigram_counts(targets, tweets, term_matrix):
    '''
        Collect the unigram and bigram counts used for the PMI computation.

        targets     : collection of target words (hashable)
        tweets      : iterable of entries whose element [1] is the token list
        term_matrix : mapping target -> mapping context word -> count

        Returns (target_counter, vocab_counter, bigram_counter):
            target_counter : occurrences of each target token in the tweets
            vocab_counter  : occurrences of every token in the tweets
            bigram_counter : (target, context) -> count, copied from term_matrix
    '''
    # Set lookup instead of scanning the targets collection for every token.
    target_set = set(targets)

    target_counter = Counter()
    vocab_counter = Counter()
    bigram_counter = Counter()
    for twt in tweets:
        for word in twt[1]:
            if word in target_set:
                target_counter[word] += 1
            vocab_counter[word] += 1

    # Loop variable is named `target`, not `emoji`, so it does not shadow the
    # imported `emoji` module inside this function.
    for target, contexts in term_matrix.items():
        for word, count in contexts.items():
            bigram_counter[(target, word)] = count

    return target_counter, vocab_counter, bigram_counter


def unigram_index_maps(counter):
    '''
        Build the two lookup tables between keys and dense integer indices.

        counter : mapping whose keys are enumerated in iteration order

        Returns (val_to_index, index_to_val).
    '''
    ordered = list(counter.keys())
    val_to_index = {value: idx for idx, value in enumerate(ordered)}
    index_to_val = dict(enumerate(ordered))
    return val_to_index, index_to_val


def pmi_matrix(params):
    '''
        Build a sparse matrix of PMI-style scores for (target, context) pairs.

        params is a dict with the keys:
            tc   : target_counts
            vc   : vocab_counts
            bc   : bigram_counts, keyed by (target, context)

            tcs  : target_count_sum
            vcs  : vocab_count_sum
            bcs  : bigram_count_sum

            tv2i : target_v2i  (target value to index)
            vv2i : vocab_v2i   (vocab value to index)

            ti2v, vi2v : accepted for compatibility but not used here

        Score per pair:
            log( (n / bcs) / ( (tc[x] / tcs) * (vc[y] / vcs)**0.75 ) )

        Returns (matrix, samples):
            matrix  : csc_matrix with rows indexed by tv2i, columns by vv2i
            samples : Counter mapping (target, context) -> score
    '''
    tc   = params['tc']
    vc   = params['vc']
    bc   = params['bc']
    tcs  = params['tcs']
    vcs  = params['vcs']
    bcs  = params['bcs']
    tv2i = params['tv2i']
    vv2i = params['vv2i']

    # Exponent applied to the context probability only. It is applied to the
    # already-normalised probability, with no re-normalisation afterwards.
    context_exponent = 0.75

    pmi_samples = Counter()
    data = []
    rows = []
    cols = []
    for (x, y), n in bc.items():
        rows.append(tv2i[x]) # target index
        cols.append(vv2i[y]) # context index
        data.append(log((n / bcs) / (((tc[x] / tcs)) * ((vc[y] / vcs)**context_exponent))))
        pmi_samples[(x, y)] = data[-1]

    # Size the matrix from the index maps rather than from the largest index
    # present in the bigrams, so every index in tv2i / vv2i is a valid
    # row / column even when that word has no bigram entry.
    n_rows = max(tv2i.values(), default=-1) + 1
    n_cols = max(vv2i.values(), default=-1) + 1
    matrix = csc_matrix((data, (rows, cols)), shape=(n_rows, n_cols))

    return matrix, pmi_samples


def _fold_rare_into_unk(counts, min_occurrence, unk):
    '''Move, in place, every entry of `counts` below `min_occurrence` into counts[unk].'''
    for word in list(counts.keys()):
        if counts[word] < min_occurrence:
            # Remove first, then add: keeps the total right even when the
            # rare word is the unk token itself.
            count = counts.pop(word)
            counts[unk] += count


def prune_counts(tc_RAW, vc_RAW, bc_RAW, min_occurrence=5, unk='UNK'):
    '''
        Remove target and vocab words which occur less than `min_occurrence`
        times and replace them with the `unk` word.

        tc_RAW : Counter of target counts
        vc_RAW : Counter of vocab counts
        bc_RAW : Counter of bigram counts keyed by (target, context)
        min_occurrence : words with a count below this are folded into `unk`
                         (default 5)
        unk    : replacement token (default 'UNK')

        The inputs are copied, not modified. In the bigram counter each side
        of a pair that is no longer present in the pruned target / vocab
        counter is replaced by `unk`, and counts of pairs that collapse onto
        the same key are summed.

        Returns (tc, vc, bc), the pruned copies.
    '''
    tc = tc_RAW.copy()
    vc = vc_RAW.copy()
    bc = bc_RAW.copy()

    _fold_rare_into_unk(tc, min_occurrence, unk)
    _fold_rare_into_unk(vc, min_occurrence, unk)

    # Iterate over a snapshot: the counter is modified inside the loop.
    for x, y in list(bc.keys()):
        if x in tc and y in vc:
            continue
        new_key = (x if x in tc else unk, y if y in vc else unk)
        count = bc.pop((x, y))
        bc[new_key] += count

    return tc, vc, bc
        


In [8]:

# Raw (unpruned) target, vocabulary and (target, context) counts.
(target_counts_RAW,
 vocab_counts_RAW,
 bigram_counts_RAW) = unigram_bigram_counts(
    targets=emoji_targets,
    tweets=tweets_and_labels,
    term_matrix=term_matrix,
)


In [9]:

# Fold rare targets / vocabulary words (and their bigrams) into the UNK token.
(target_counts,
 vocab_counts,
 bigram_counts) = prune_counts(
    tc_RAW=target_counts_RAW,
    vc_RAW=vocab_counts_RAW,
    bc_RAW=bigram_counts_RAW,
)


In [10]:
# Counts stored under the UNK token: targets, vocabulary, (UNK, UNK) bigram.
print(
    target_counts['UNK'],
    vocab_counts['UNK'],
    bigram_counts[('UNK', 'UNK')],
)

505 27523 2473


In [11]:

# Normalisation totals for the three pruned counters.
target_count_sum, vocab_count_sum, bigram_count_sum = (
    sum(counts.values())
    for counts in (target_counts, vocab_counts, bigram_counts)
)


In [12]:

# value <-> dense index lookups for matrix rows (targets) and columns (vocab).
target_v2i, target_i2v = unigram_index_maps(counter=target_counts)
vocab_v2i, vocab_i2v = unigram_index_maps(counter=vocab_counts)


In [13]:

# Bundle everything pmi_matrix() reads from its `params` dict.
pmi_params = dict(
    tc=target_counts,
    vc=vocab_counts,
    bc=bigram_counts,
    tcs=target_count_sum,
    vcs=vocab_count_sum,
    bcs=bigram_count_sum,
    tv2i=target_v2i,
    ti2v=target_i2v,
    vv2i=vocab_v2i,
    vi2v=vocab_i2v,
)


In [14]:
# `pmi_matrix` names both the builder function and, once this cell has run,
# its sparse result. Capture the function under a private alias while the
# name is still callable, so that re-running this cell does not fail with
# "TypeError: ... object is not callable". Re-running the defining cell
# makes the name callable again and refreshes the alias.
if callable(pmi_matrix):
    _build_pmi_matrix = pmi_matrix
pmi_matrix, pmi_samples = _build_pmi_matrix(pmi_params)

In [376]:
def factorize_pmi_matrix(pmi, t_i2v, t_v2i, v_i2v, v_v2i,
                         n_components=150, top_k=5, t_counts=None):
    '''
        Factorize the PMI matrix and print each target's nearest neighbours.

        pmi   : the PMI matrix (rows indexed by target index)
        t_i2v : target_i2v  (index to target value)
        t_v2i : target_v2i  (target value to index)
        v_i2v : vocab_i2v   (unused here)
        v_v2i : vocab_v2i   (unused here)
        n_components : number of singular values and vectors to compute;
                       clamped to min(pmi.shape) - 1, the largest value
                       svds accepts
        top_k : each printed list holds the top_k + 1 most similar rows
                (the target itself is normally among them)
        t_counts : mapping target -> count shown in the printout; falls back
                   to the notebook-level `target_counts` when omitted

        Computes U, sigma, Vh using SVD
        Computes W_svd = U dot sigma^0.5
        Normalizes the rows of W_svd:
            rendering cosine similarity equivalent to dot product

        Prints the results; returns None.
    '''
    if t_counts is None:
        # Backward-compatible fallback to the global used by the original code.
        t_counts = target_counts

    max_rank = min(pmi.shape) - 1
    if max_rank < 1:
        raise ValueError('PMI matrix of shape %s is too small to factorize' % (pmi.shape,))
    n_components = min(n_components, max_rank)

    # Factorize the PMI matrix
    U, sigma, Vh = svds(pmi, k=n_components)
    sigma_p = sigma**0.5

    # compute W_svd
    W_svd = U*sigma_p

    # Normalize the rows of W_svd itself. Dividing U by these norms, as the
    # code did before, does not produce unit-length rows, so the dot
    # products were not cosine similarities.
    norms = np.sqrt(np.sum(np.square(W_svd), axis=1, keepdims=True))
    W_svd_n = W_svd / np.maximum(norms, 1e-7)

    for x in list(t_v2i.keys()):
        # Cosine similarity for this unigram against all others
        dd = np.dot(W_svd_n, W_svd_n[t_v2i[x]])

        s = ''
        # Get the list of nearest neighbor descriptions (most similar last).
        for i in np.argsort(dd)[-(top_k+1):]:
            s += '(%s, %f) ' % (t_i2v[i], dd[i])
        print('%s, %d\n %s' % (x, t_counts[x], s))
        print()
        print('-' * 20)
        print()
    
    

In [None]:
# Print the nearest neighbours of every target in the factorized PMI space.
factorize_pmi_matrix(
    pmi=pmi_matrix,
    t_i2v=target_i2v,
    t_v2i=target_v2i,
    v_i2v=vocab_i2v,
    v_v2i=vocab_v2i,
)

In [None]:
# Summary of the PMI matrix: stored non-zeros, shape, and the 1000
# highest-scoring (target, context) pairs.
print(pmi_matrix.count_nonzero(), end='\n\n')
print(pmi_matrix.shape, end='\n\n')
print(pformat(pmi_samples.most_common(1000)))


In [375]:
'''
EXAMPLE GOOD RESULTS

Positive correlations:

(('🙏', '😥'), 3.6183627184387683)
(('😯', '😮'), 3.066208954490881)
(('🍜', '🍱'), 3.463074333065022)
(('😜', '😝'), 3.619660577154368)
(('🔥', '💵'), 3.6269598796359794)
(('🍺', '😝'), 3.5326492001647383)
(('🍺', '🍷'), 5.924267709047791)

Negative correlations:

(('😍', '💔'), -2.038606008309401)
(('😘', 'hate'), -3.398034123371635)
(('🎶', '😩'), -3.481518571565408)

'''

"\nEXAMPLE GOOD RESULTS\n\nPositive correlations:\n\n(('🙏', '😥'), 3.6183627184387683)\n('😯', '😮'), 3.066208954490881)\n(('🍜', '🍱'), 3.463074333065022)\n(('😜', '😝'), 3.619660577154368)\n(('🔥', '💵'), 3.6269598796359794)\n(('🍺', '😝'), 3.5326492001647383)\n(('🍺', '🍷'), 5.924267709047791)\n\nNegative correlations:\n\n(('😍', '💔'), -2.038606008309401)\n(('😘', 'hate'), -3.398034123371635))\n(('🎶', '😩'), -3.481518571565408)\n\n"