In [1]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file as a list, without line endings.

    Records are separated by real newlines only. ``str.splitlines()`` is
    deliberately avoided: besides newlines it also splits on Unicode line
    boundaries such as U+2028, U+2029, U+0085, vertical tab and form feed,
    any of which may occur inside a string and would silently turn one
    record into several.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Iterating the file yields one newline-terminated record at a time.
        return [line.rstrip('\n') for line in f]


# 10,000 strings. 1,580 contain emoji
random_10k = _read_lines('random_10k.txt')

# 10,000 strings. All 10,000 contain emoji
emojis_10k = _read_lines('emojis_10k.txt')

In [2]:
# Extractor provides the count_emoji / count_all_emoji methods used in the cells below.
from extract import Extractor

# One instance is created here and reused by every later cell.
# Note: the variable shares its name with the `extract` module it was imported from.
extract = Extractor()

# Usage 1
## Pass in strings without knowing beforehand if they contain emoji

With `check_first=True`, the extractor first checks whether the string contains any emoji at all before trying to count them.

If there are, then they are counted. If not, an empty `Counter` is returned - which would have been the result anyway.

In [3]:
# Show the per-string emoji counts for the first ten strings of the mixed data set.
for text in random_10k[:10]:
    emoji_counts = extract.count_emoji(text, check_first=True)
    print(emoji_counts)

Counter()
Counter({'👀': 1})
Counter({'🌟': 1, '💕': 1})
Counter()
Counter()
Counter()
Counter()
Counter({'⏰': 1, '🥅': 1, '🆚': 1, '🏆': 1})
Counter()
Counter()


A collections.Counter object is returned. If no emoji were counted, then the Counter will be empty.

# Usage 2

## Pass in strings that you already know contain emoji

With ```check_first=False```, just assume that there will be emoji present.

Perhaps because you've already filtered the data.

In [4]:
# Show the per-string emoji counts for the first ten strings of the all-emoji data set.
for text in emojis_10k[:10]:
    emoji_counts = extract.count_emoji(text, check_first=False)
    print(emoji_counts)

Counter({'😇': 2, '💦': 2})
Counter({'😭': 1})
Counter({'🛫': 1, '🏆': 1})
Counter({'🍎': 1})
Counter({'💁🏻': 1})
Counter({'🔴': 1})
Counter({'☺': 1})
Counter({'😿': 1, '💛': 1})
Counter({'😿': 1, '💛': 1})
Counter({'😿': 1, '💛': 1})


# Usage 3
## Pass an iterable of strings

Counters have a useful ```most_common``` method.

In [5]:
# Aggregate the emoji counts over the first ten strings in a single call.
first_ten = emojis_10k[:10]
count = extract.count_all_emoji(first_ten)

# Every emoji seen, ordered from most to least frequent.
count.most_common()

[('😿', 3),
 ('💛', 3),
 ('😇', 2),
 ('💦', 2),
 ('😭', 1),
 ('🛫', 1),
 ('🏆', 1),
 ('🍎', 1),
 ('💁🏻', 1),
 ('🔴', 1),
 ('☺', 1)]

In [6]:
# Aggregate the emoji counts over the whole all-emoji data set.
count2 = extract.count_all_emoji(emojis_10k)

# Only the ten most frequent emoji.
count2.most_common(10)

[('😂', 2813),
 ('❤', 1150),
 ('😍', 974),
 ('😭', 933),
 ('💕', 552),
 ('🔥', 485),
 ('✨', 430),
 ('♥', 286),
 ('😊', 277),
 ('😘', 236)]

# Speed comparison
When you're not sure if you have emoji in all strings, it's obviously faster to check first before trying to count since counting involves a lot of searches.

Example of how much slower it is without checking first:

In [7]:
%%timeit
# Mixed data set, counting attempted on every string.
for text in random_10k:
    extract.count_emoji(text, check_first=False)

1 loop, best of 3: 1.72 s per loop


And with checking, where the counter only runs for the roughly 16% of strings (1,580 of 10,000) that actually contain emoji:

In [8]:
%%timeit
# Mixed data set, with check_first=True.
for text in random_10k:
    extract.count_emoji(text, check_first=True)

1 loop, best of 3: 357 ms per loop


If you already know that every string has emoji, then disabling checking gives a very minor speedup:

In [9]:
%%timeit
# All-emoji data set, with check_first=False.
for text in emojis_10k:
    extract.count_emoji(text, check_first=False)

1 loop, best of 3: 1.8 s per loop


In [10]:
%%timeit
# All-emoji data set, with check_first=True.
for text in emojis_10k:
    extract.count_emoji(text, check_first=True)

1 loop, best of 3: 1.85 s per loop
