In [1]:
import pandas as pd
import re
import emoji

In [2]:
sentiment140 = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='latin-1', header=None)
sentiment140.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

print(f"Original dataset size: {sentiment140.shape}")

Original dataset size: (1600000, 6)


In [3]:
emoticon_mapping = {
    r"[:=]\s*\)+": "🙂",  # handles :), =), :)), =)), : ), = )), etc.
    r"[:=]\s*\(+": "🙁",  # handles :(, = (, :((, =((, etc.
    r"[:=]D": "😄",         # handles :D, =D
    r"[:=]\s*D": "😄",       # handles : D, = D 
    r"[:=]P": "😛",         # handles :P, =P
    r"[:=]\s*P": "😛",       # handles : P, = P
    r"[:=]O": "😮",         # handles :O, =O
    r"[:=]\s*O": "😮",       # handles : O, = O
    r";\)": "😉",             # handles ;)
    r";\s*\)": "😉",         # handles ; )
}

def replace_emoticons(text):
    for emoticon_pattern, emoji_char in emoticon_mapping.items():
        text = re.sub(emoticon_pattern, emoji_char, text)
    return text

In [4]:
sentiment140['text'] = sentiment140['text'].apply(replace_emoticons)

In [6]:
def contains_emoji(text):
    for char in text:
        if char in emoji.EMOJI_DATA:
            return True
    return False

emoji_rows = sentiment140[sentiment140['text'].apply(contains_emoji)]
non_emoji_rows = sentiment140[~sentiment140['text'].apply(contains_emoji)]

print(f"Rows with emojis after replacement: {len(emoji_rows)}")

Rows with emojis after replacement: 13539


In [14]:
needed_non_emoji_rows = max(0, 50000 - len(emoji_rows))

if needed_non_emoji_rows > 0:
    sampled_non_emoji_rows = non_emoji_rows.sample(needed_non_emoji_rows, random_state=42)
    final_sample = pd.concat([emoji_rows, sampled_non_emoji_rows]).sample(frac=1, random_state=42).reset_index(drop=True)
else:
    final_sample = emoji_rows.sample(50000, random_state=42).reset_index(drop=True)

print(f"Final dataset size: {final_sample.shape}")

Final dataset size: (50000, 6)


In [25]:
output_path = 'data/sentiment140_with_emojis.csv'
final_sample.to_csv(output_path, index=False, encoding='utf-8')

print(f"Saved cleaned dataset to {output_path}")

Saved cleaned dataset to data/sentiment140_with_emojis.csv


In [13]:
print(final_sample[['target', 'text']].head(20))

    target                                               text
0        4        It was a dark and stormy night...  love it!
1        4  @Reynolds_x eeeep i hope we get it  paid for h...
2        0  Feels sad my best friend Skippy is not going t...
3        4  @Majestic76 no no hun. im serious you are beau...
4        4  so iÂ´ll help my dad...spending a little bit t...
5        0  couldnt understand Terminator movie :|  bobo k...
6        4  Now I've made dinner for me and my brother  si...
7        0  My last day on the mountian and then back to c...
8        0          Goodnight tweeperzz haha 😉 Time to study 
9        0  @angelicajw i know just waitin on u to say som...
10       0  haha #PakCricket is out of trending topics again 
11       4                                          @philbee 
12       4  Reading Twilight  Yep sounds good! Sad to not ...
13       0  @GTA_Cop I agree, we wives ARE ridiculous! 😉  ...
14       0                       just woke up sooooooo tired 
15      

In [10]:
num_rows_with_emoji_final = final_sample['text'].apply(contains_emoji).sum()
print(f"Rows containing at least one emoji (final sampled): {num_rows_with_emoji_final} out of {len(final_sample)}")


Rows containing at least one emoji (final sampled): 13539 out of 50000
