In [1]:
import sys
import pandas as pd
import numpy as np
import dask.dataframe as ddf
from tqdm import tqdm
import json
import emoji
from collections import Counter

In [2]:
sys.path.append("..") # adds twitter_sentiment path

In [3]:
from twitter_sentiment.preprocessors.utils import read_jsonlines_lzma
from twitter_sentiment.preprocessors.tokenizer import tokenize

In [4]:
filepath = "../data/output/all_dedup-pt.jsonline.xz"
tweets = read_jsonlines_lzma(filepath)

In [5]:
tokenized_tweets = tokenize(tweets, lang='pt')

In [6]:
emoji_counter = Counter({k: 0 for k, v in emoji.UNICODE_EMOJI.items()})

In [7]:
for tweet in tqdm(tokenized_tweets):
    for char in tweet["treated_text"]:
        if char in emoji.UNICODE_EMOJI:
            emoji_counter[char] += 1

9195192it [15:41, 9768.82it/s] 


In [8]:
emoji_counter

Counter({'🥇': 7695,
         '🥈': 415,
         '🥉': 164,
         '🆎': 1,
         '🏧': 0,
         '🅰': 728,
         '🅰️': 0,
         '🇦🇫': 0,
         '🇦🇽': 0,
         '🇦🇱': 0,
         '🇩🇿': 0,
         '🇦🇸': 0,
         '🇦🇩': 0,
         '🇦🇴': 0,
         '🇦🇮': 0,
         '🇦🇶': 0,
         '🇦🇬': 0,
         '♒': 6,
         '🇦🇷': 0,
         '♈': 7,
         '🇦🇲': 0,
         '🇦🇼': 0,
         '🇦🇨': 0,
         '🇦🇺': 0,
         '🇦🇹': 0,
         '🇦🇿': 0,
         '🔙': 113,
         '🅱': 1021,
         '🅱️': 0,
         '🇧🇸': 0,
         '🇧🇭': 0,
         '🇧🇩': 0,
         '🇧🇧': 0,
         '🇧🇾': 0,
         '🇧🇪': 0,
         '🇧🇿': 0,
         '🇧🇯': 0,
         '🇧🇲': 0,
         '🇧🇹': 0,
         '🇧🇴': 0,
         '🇧🇦': 0,
         '🇧🇼': 0,
         '🇧🇻': 0,
         '🇧🇷': 0,
         '🇮🇴': 0,
         '🇻🇬': 0,
         '🇧🇳': 0,
         '🇧🇬': 0,
         '🇧🇫': 0,
         '🇧🇮': 0,
         '🆑': 6,
         '🆒': 28,
         '🇰🇭': 0,
         '🇨🇲': 0,
         '🇨🇦': 0,
       

In [9]:
import operator

In [10]:
freq = operator.itemgetter(1)

In [13]:
emoji_counter = sorted(emoji_counter.items(), reverse=True, key=freq)

In [16]:
emoji_counter

[('❤', 334474),
 ('😂', 303530),
 ('🖤', 216743),
 ('🔴', 195876),
 ('⚫', 192208),
 ('🤣', 133231),
 ('😍', 118862),
 ('😭', 109306),
 ('🏻', 102499),
 ('👏', 98704),
 ('♂', 76740),
 ('🏆', 72212),
 ('🙅', 62782),
 ('🇷', 57577),
 ('♥', 55871),
 ('🙏', 54481),
 ('🇧', 52193),
 ('🏽', 49831),
 ('🏼', 47612),
 ('🏾', 42661),
 ('👍', 39075),
 ('🤔', 37150),
 ('🤦', 35600),
 ('♀', 34260),
 ('🥰', 31831),
 ('🏿', 30509),
 ('⚽', 28017),
 ('🎶', 27387),
 ('😡', 23987),
 ('🙌', 22870),
 ('💪', 21439),
 ('😎', 20936),
 ('👇', 20422),
 ('🤪', 20353),
 ('🤩', 19671),
 ('🤷', 18106),
 ('👉', 17837),
 ('🥺', 15928),
 ('🙄', 15424),
 ('😔', 15071),
 ('😅', 14940),
 ('🔥', 14817),
 ('👊', 14775),
 ('✅', 14230),
 ('😁', 13620),
 ('⛓', 13215),
 ('🤘', 12277),
 ('✌', 11143),
 ('🤬', 11051),
 ('🤮', 10792),
 ('🤭', 10766),
 ('🇪', 10555),
 ('❌', 10377),
 ('😘', 9935),
 ('😱', 9932),
 ('😩', 9759),
 ('🃏', 9575),
 ('😉', 9445),
 ('😢', 9405),
 ('😠', 9203),
 ('⏰', 8884),
 ('⭐', 8746),
 ('🇺', 8708),
 ('💖', 8424),
 ('⚪', 8248),
 ('🥵', 7987),
 ('🥇', 7695),


In [22]:
df = pd.DataFrame(emoji_counter)
df.head(10)

Unnamed: 0,0,1
0,❤,334474
1,😂,303530
2,🖤,216743
3,🔴,195876
4,⚫,192208
5,🤣,133231
6,😍,118862
7,😭,109306
8,🏻,102499
9,👏,98704


In [19]:
for e, c in emoji_counter:
    print(f"{e}\t\t\t{c}")

❤			334474
😂			303530
🖤			216743
🔴			195876
⚫			192208
🤣			133231
😍			118862
😭			109306
🏻			102499
👏			98704
♂			76740
🏆			72212
🙅			62782
🇷			57577
♥			55871
🙏			54481
🇧			52193
🏽			49831
🏼			47612
🏾			42661
👍			39075
🤔			37150
🤦			35600
♀			34260
🥰			31831
🏿			30509
⚽			28017
🎶			27387
😡			23987
🙌			22870
💪			21439
😎			20936
👇			20422
🤪			20353
🤩			19671
🤷			18106
👉			17837
🥺			15928
🙄			15424
😔			15071
😅			14940
🔥			14817
👊			14775
✅			14230
😁			13620
⛓			13215
🤘			12277
✌			11143
🤬			11051
🤮			10792
🤭			10766
🇪			10555
❌			10377
😘			9935
😱			9932
😩			9759
🃏			9575
😉			9445
😢			9405
😠			9203
⏰			8884
⭐			8746
🇺			8708
💖			8424
⚪			8248
🥵			7987
🥇			7695
💕			7675
😜			7526
🖕			7469
🏴			7389
🇦			7235
🤧			7169
😬			6988
🏟			6907
✨			6891
💩			6588
😆			6516
👀			6512
✊			6502
😒			6434
🥳			6307
🇾			6259
🇹			6191
💙			6156
😝			6093
🎉			6073
👃			6050
👈			6042
👌			5993
🧔			5949
👆			5902
🖥			5806
🎵			5630
😤			5619
😏			5618
💔			5567
💛			5472
😪			5324
🇩			5263
😥			5254
💥			5132
😴			5096
💚			5076
🤢	