### Install Libraries 

In [1]:
pip install emoji --upgrade

Requirement already up-to-date: emoji in /opt/anaconda3/envs/nlu/lib/python3.8/site-packages (0.6.0)
Note: you may need to restart the kernel to use updated packages.


### Import Libraries 

In [2]:
import emoji 
import os
import pandas as pd
import re
from typing import List

# Lift every pandas display limit (rows, columns, total width, cell width)
# so DataFrames are rendered without truncation.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [3]:
# Minimal demo of the "emoji" library: emojize() replaces a ":name:" alias
# in the text with the corresponding emoji character.
demo_text = '"Im so sorry about Rick Ross"  :face_with_tears_of_joy:'
print(emoji.emojize(demo_text))

"Im so sorry about Rick Ross"  😂


### Build Dataset 

* The raw dataset can be downloaded from Kaggle: https://www.kaggle.com/rexhaif/emojifydata-en

In [4]:
# Show which raw files are present in the local "data" directory
data_dir_entries = os.listdir('./data')
print(data_dir_entries)

['tweet_emoji_train_200000.csv', '.DS_Store', 'dev.txt', 'train.txt', 'emojitweets-01-04-2018.txt', 'test.txt']


In [5]:
# Explore the format of the raw data.
# Each split is read completely into a list of lines. The files are opened in
# context managers so the handles are closed right after reading, and the
# encoding is given explicitly because the data contains non-ASCII characters
# (emoji, arrows) that a platform-default codec may fail to decode.
with open('./data/dev.txt', 'r', encoding='utf-8') as dev_data_raw:
    dev_data_lines = dev_data_raw.readlines()

with open('./data/train.txt', 'r', encoding='utf-8') as train_data_raw:
    train_data_lines = train_data_raw.readlines()

with open('./data/test.txt', 'r', encoding='utf-8') as test_data_raw:
    test_data_lines = test_data_raw.readlines()

# Preview the first 40 lines of the dev split, numbered from 0.
for line_no, line in enumerate(dev_data_lines[0:40]):
    print("Line{}: {}".format(line_no, line.strip()))

Line0: <START> O
Line1: No O
Line2: object O
Line3: is O
Line4: so O
Line5: beautiful O
Line6: that O
Line7: under O
Line8: certain O
Line9: conditions O
Line10: it O
Line11: will O
Line12: not O
Line13: look O
Line14: ugly O
Line15: Oscar O
Line16: Wilde O
Line17: ↺ O
Line18: RT :red_heart:
Line19: … O
Line20: <STOP> O
Line21: 
Line22: <START> O
Line23: Cant O
Line24: expect O
Line25: different O
Line26: results O
Line27: doing O
Line28: the O
Line29: same O
Line30: thingdoing O
Line31: stuff O
Line32: different O
Line33: from O
Line34: now O
Line35: on :person_shrugging:
Line36: 🏻 O
Line37: ‍ :female_sign:
Line38: ️ O
Line39: <STOP> O


In [6]:
# A few helper functions to parse the raw data and create dataset for dev, train, and test
def get_emoji(tweet:List[str])->List[str]:
    """Return the emoji names found in a tokenised tweet.

    The tokens are joined with single spaces and every ``:name:`` span is
    extracted, without the surrounding colons, in order of appearance.

    Args:
        tweet: The tokens of one tweet.

    Returns:
        The captured names. An empty list is returned when the tweet contains
        no ``:name:`` span, so the result always matches the declared
        ``List[str]`` type instead of being an implicit ``None``.
    """
    # A single scan is enough: findall already yields [] when nothing matches.
    return re.findall(r':(.*?):', ' '.join(tweet))

def remove_emoji(tweet:List[str])->str:
    """Join the tokens of a tweet and strip every ``:name:`` span from it.

    Args:
        tweet: The tokens of one tweet.

    Returns:
        The space-joined tweet text with all ``:name:`` spans removed. A tweet
        without any such span is returned unchanged (previously the function
        fell through and returned ``None``, discarding the text).
    """
    # re.sub leaves the string untouched when the pattern does not match, so
    # no separate "contains emoji" check is needed.
    return re.sub(r':(.*?):', '', ' '.join(tweet))
    
def creat_dataset(lines:List[str], sample_size:int=1000)->List[dict]:
    """Group raw tagged lines into per-tweet samples.

    A line containing '<START> O' opens a new sample and a line containing
    '<STOP> O' closes it. Every other line is cleaned (the ' O' tag and all
    characters other than letters, digits, '_', ':' and '-' are removed) and,
    if anything is left, appended to the current sample's token list. A
    non-empty token seen while no sample is open starts a new sample itself.

    Args:
        lines: Raw lines of the dataset file.
        sample_size: Maximum number of samples to collect.

    Returns:
        A list of dicts with the keys 'id', 'tweet' and 'emoji', where 'emoji'
        and 'tweet' hold the results of get_emoji() and remove_emoji().
    """
    collected = []
    next_id = 0
    tweet_open = False

    for raw in lines:
        # Once the quota is reached no further line can change the result.
        if next_id >= sample_size:
            break

        if '<START> O' in raw:
            sample = {'id': next_id, 'tweet': []}
            tweet_open = True
        elif '<STOP> O' in raw:
            words = sample['tweet']
            sample['emoji'] = get_emoji(words)
            sample['tweet'] = remove_emoji(words)
            collected.append(sample)
            next_id += 1
            tweet_open = False
        else:
            token = re.sub(r'[^a-zA-Z0-9_:-]+', '', re.sub(r' O', '', raw))
            if token == '':
                continue
            if not tweet_open:
                # Content without a preceding start marker opens a sample.
                sample = {'id': next_id, 'tweet': []}
                tweet_open = True
            sample['tweet'].append(token)

    return collected

In [7]:
# Second version (v2) of the parsing helpers: they work on the joined tweet string and collect the emojis of a tweet as a set
def get_emoji_set_from_tweet(tweet:str)->set:
    """Return the distinct emoji names that occur in a tweet string.

    An emoji is a run of non-whitespace, non-colon characters enclosed in a
    pair of colons, e.g. ':red_heart:'. The name may not contain whitespace,
    so a stray colon in ordinary text ('Note: so fun :smile:') cannot be
    paired with the opening colon of a later emoji and produce a bogus name
    such as ' so fun '.

    Args:
        tweet: The tweet text, possibly containing ':name:' spans.

    Returns:
        A set with the names (without the colons); empty if there are none.
    """
    return set(re.findall(r':([^\s:]+):', tweet))

def remove_emojis_from_tweet(tweet:str)->str:
    """Delete every ':name:' emoji span from a tweet string.

    An emoji is a run of non-whitespace, non-colon characters enclosed in a
    pair of colons. Because the name may not contain whitespace, ordinary
    text between a stray colon and the next emoji is left in place instead
    of being deleted together with it.

    Args:
        tweet: The tweet text, possibly containing ':name:' spans.

    Returns:
        The text with all emoji spans removed. Surrounding whitespace is not
        normalised.
    """
    return re.sub(r':([^\s:]+):', '', tweet)

def creat_dataset_v2(lines:List[str], sample_size:int=1000)->List[dict]:
    """Convert raw tagged lines into a list of dicts, one per tweet.

    Each dict holds the tweet id, the tweet string and the set of emoji names.

    Input:
        <START> O
        tweet
        ...
        :emoji:
        ...
        <STOP> O

        <START> :emoji:
        tweet
        ...
        <STOP> O

    Output:
        [
            {
                id: 0
                tweet: 'the tweet string'
                emoji: {'emoji_0', 'emoji_1'}
            }
        ]

    Args:
        lines: Raw lines of the dataset file.
        sample_size: Maximum number of tweets to convert.

    Returns:
        The converted tweets in file order. Lines after the last '<STOP> O'
        marker do not produce a record.
    """
    SEP = r' O'
    datasets = list()
    # Named tweet_id rather than id so the builtin id() is not shadowed.
    tweet_id = 0
    tweet_words = list()

    for line in lines:
        if tweet_id >= sample_size:
            break
        # Remove the separator and all special characters from the line.
        # Only word characters (\w, which for str patterns also covers
        # non-ASCII letters and digits), ':' and '-' are kept, so the markers
        # '<START>' / '<STOP>' are reduced to 'START' / 'STOP'.
        _line = re.sub(r'[^\w:-]+', '', re.sub(SEP, '', line)).strip()
        if _line:
            tweet_words.append(_line)
        if '<STOP> O' in line:
            # Remove empty characters at the beginning and end.
            tweet = ' '.join(tweet_words).strip()
            # Remove the START and STOP strings at the beginning and end.
            # Raw strings keep '\s' from being an invalid escape sequence.
            tweet = re.sub(r'^START\s*', '', re.sub(r'\s*STOP$', '', tweet))
            datasets.append({
                'id': tweet_id,
                'tweet': remove_emojis_from_tweet(tweet).strip(),
                'emoji': get_emoji_set_from_tweet(tweet),
            })
            # Increase the id and reset the word buffer for the next tweet.
            tweet_id += 1
            tweet_words = list()

    return datasets

In [8]:
# Upper bound on the number of tweets taken from the training split
sample_size = 800_000
train_dataset = creat_dataset_v2(
    train_data_lines,
    sample_size=sample_size,
)

In [9]:
# One row per tweet, indexed by the tweet id
df_train = pd.DataFrame(train_dataset).set_index('id')

In [10]:
# Peek at the first five parsed tweets
df_train.head(n=5)

Unnamed: 0_level_0,tweet,emoji
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CeeC is going to be another Tboss What is 45 million Naira,{face_with_tears_of_joy}
1,This gif kills me Death is literally gushing towards you and you really gon do a whole 3point turn,{weary_face}
2,LOVE TEST Raw Real JaDine,{purple_heart}
3,i swear we dont gotta look it finds us,{face_with_tears_of_joy}
4,We would like to wish everyone a very Happy New Year and all the best in 2018,{party_popper}


In [11]:
# Persist the parsed training set; the sample size is part of the file name
csv_file_name = f"./data/tweet_emoji_train_{sample_size}.csv"
df_train.to_csv(csv_file_name)