## Text2Emoji ([More info](README.md))

# Clean up dataset
1. Remove redundant adjectives from emoji vocabs (E.g. `color` skin tone, flag: `country`)
    - [Vocab list](processed_data/emoji_vocab.txt)
2. Get [train](processed_data/training_data-1000000_entries.csv) - [test](processed_data/testing_data-83280_entries.csv) data
    - Filters out non-English Reddit comments

In [1]:
import pandas as pd
import re

df = pd.read_csv('data/emojis/emoji_df.csv')
emoji_vocab = set(df['name'].to_list())

# Strip qualifiers from emoji names so that variants collapse into one entry:
#   "flag: <country>"   -> "<country>"
#   "<base>: <details>" -> "<base>"   (e.g. skin-tone variants)
# Names without a ": " separator are kept unchanged.
flag_pattern = re.compile(r"flag: (.*)")
detail_pattern = re.compile(r"(.*): (.*)")

cleaned_emoji_vocab = set()
for name in emoji_vocab:
    flag_match = flag_pattern.match(name)
    if flag_match:
        # The flag rule is tried first; otherwise the generic rule would keep "flag".
        cleaned_name = flag_match[1]
    else:
        detail_match = detail_pattern.match(name)
        cleaned_name = detail_match[1] if detail_match else name
    cleaned_emoji_vocab.add(cleaned_name)

print(f"Original no. of emojis: {len(emoji_vocab)}")
print(f"cleaned no. of emojis: {len(cleaned_emoji_vocab)}")
# "Longest" is measured in space-separated words, not characters.
print(f"Longest emoji vocab: {max(cleaned_emoji_vocab, key=lambda t: len(t.split(' ')))}")

# Persist the cleaned vocabulary, one name per line, in sorted order.
with open('processed_data/emoji_vocab.txt', 'w') as vocab_file:
    vocab_file.writelines(f"{name}\n" for name in sorted(cleaned_emoji_vocab))


Original no. of emojis: 3295
cleaned no. of emojis: 1747
Longest emoji vocab: South Georgia & South Sandwich Islands


In [14]:
import sqlite3
import re


# Load a slice of the May 2015 Reddit comment dump (the full dataset has about
# 54,000,000 entries). Forward slashes keep the path valid on every OS; a
# backslash-separated literal is only interpreted as a directory path on Windows.
sql_conn = sqlite3.connect('data/archive/reddit-comments-may-2015/database.sqlite')
try:
    comments = pd.read_sql(
        "SELECT body FROM May2015 LIMIT(1200000)",
        sql_conn)
finally:
    # Release the database handle once the rows are in memory, even if the
    # query fails; the connection is not needed after this point in the cell.
    sql_conn.close()

def filter_fn(row, max_chars=100, min_ascii_ratio=0.7):
    """Return True when a comment looks English-dominant.

    Only the first ``max_chars`` characters of ``row['body']`` are inspected.
    The comment passes when strictly more than ``min_ascii_ratio`` of those
    characters are ASCII (code points U+0000 through U+007F).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``'body'`` entry.
        max_chars: Number of leading characters to inspect.
        min_ascii_ratio: Fraction of inspected characters that must be exceeded
            by the ASCII character count.

    Returns:
        bool: True if the comment should be kept. Empty bodies and non-string
        bodies (e.g. ``None`` or NaN from a NULL column) return False.
    """
    body = row['body']
    # A missing body cannot be English text; drop it instead of raising on len().
    if not isinstance(body, str):
        return False
    text = body[:max_chars]
    length = len(text)
    ascii_count = sum(1 for ch in text if ch <= '\u007f')
    # Strict comparison: an empty body gives 0 > 0, which is False.
    return ascii_count > length * min_ascii_ratio

# Evaluate the filter once per row, keep only the rows it accepts,
# and display how many comments survive.
mask = comments.apply(filter_fn, axis="columns")
comments = comments[mask]
comments.shape[0]

1192414

In [16]:
# Drop exact duplicate rows, then split by position: the first 1,000,000 rows
# form the training set and whatever remains forms the test set.
data = comments.drop_duplicates()
training_data, test_data = data.iloc[:1_000_000], data.iloc[1_000_000:]

# Report the size of each split (training first, then test).
for split in (training_data, test_data):
    print(len(split))

1000000
83280


In [23]:
# Save the train/test splits to CSV, embedding each split's row count in its
# file name. Forward slashes are used because they work on every OS, whereas a
# backslash-separated literal is only treated as a directory path on Windows.
training_data.to_csv(f'processed_data/training_data-{len(training_data)}_entries.csv')
test_data.to_csv(f'processed_data/testing_data-{len(test_data)}_entries.csv')

## Experiments with transformers

In [43]:
from transformers import AutoModel, AutoTokenizer

# Hugging Face model identifiers for the two checkpoints loaded below.
sentence_model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"
distilbert_model_name = 'distilbert-base-uncased'

# Note: sentence_tokenizer is expected to be equivalent to distilbert_tokenizer.
sentence_tokenizer = AutoTokenizer.from_pretrained(sentence_model_name)
sentence_transformer = AutoModel.from_pretrained(sentence_model_name)
distilbert_tokenizer = AutoTokenizer.from_pretrained(distilbert_model_name)
# NOTE(review): the variable name is misspelled ("dsitilbert"); it is kept as-is
# because other cells may refer to it.
dsitilbert = AutoModel.from_pretrained(distilbert_model_name)


### Check for differences between sentence_tokenizer and distilbert tokenizer

In [42]:
# Check for differences between sentence_tokenizer and distilbert tokenizer.
# Each of the first 100 comments (or fewer, if the frame is shorter) is encoded
# with both tokenizers and the resulting token-id sequences must match.
# The comparison iterates over distinct comments; indexing position 0 on every
# pass would only test a single comment repeatedly.
all_same = all(
    sentence_tokenizer.encode(body) == distilbert_tokenizer.encode(body)
    for body in comments['body'].iloc[:100]
)

assert all_same

### testing

In [62]:
import random

# Print five randomly chosen comments, each prefixed with its row position.
# No seed is set, so the selection differs between runs.
sample_positions = random.sample(range(len(comments)), 5)
for position in sample_positions:
    body = comments['body'].iloc[position]
    print(f"{position}. {body}\n")

921. 100000 soldiers managed to kill only 800 Romans, proper embarrassment

1764. I'd rather not say.

4208. Awesome. Any specific reason for the power supply and processor picks?

7583. BOOOOOO THAT FILTHY SWINE

6344. They're/Their/There

