In [1]:
import pandas as pd
import csv


In [2]:
# delimeter is comma, but tweets have commas
# bring everything in as one column then split on first comma

xmas_tweets = pd.read_csv('HolidayTweets.csv', sep='\t', header=None)

xmas_tweets = xmas_tweets[0].str.split(',', n=1, expand=True)

xmas_tweets.columns = ['ID', 'Metadata']

xmas_tweets

Unnamed: 0,ID,Metadata
0,ID,Metadata
1,812192389460881408,"<p class=""TweetTextSize js-tweet-text tweet-t..."
2,812192389443915776,"<p class=""TweetTextSize js-tweet-text tweet-t..."
3,812192388710105089,"<p class=""TweetTextSize js-tweet-text tweet-t..."
4,812192386583592960,"<p class=""TweetTextSize js-tweet-text tweet-t..."
...,...,...
50087,813084336203452426,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50088,813084336006144001,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50089,813084330687918080,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50090,813084330608230400,"<p class=""TweetTextSize js-tweet-text tweet-t..."


In [3]:
xmas_tweets['hashtags'] = xmas_tweets['Metadata'].str.extractall(r'(#\w+)').groupby(level=0).agg(','.join)
xmas_tweets['hashtags']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
50087    NaN
50088    NaN
50089    NaN
50090    NaN
50091    NaN
Name: hashtags, Length: 50092, dtype: object

In [4]:
xmas_tweets[xmas_tweets['hashtags'].notna()]

Unnamed: 0,ID,Metadata,hashtags
318,812193835925340160,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2017
326,812193947241979904,"<p class=""TweetTextSize js-tweet-text tweet-t...","#wedding,#weddingdress"
342,812194056415612928,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2017
359,812194134257565697,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2017
447,812194642489995264,"<p class=""TweetTextSize js-tweet-text tweet-t...",#クリスマスマーケット
...,...,...,...
49842,813083137060184065,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2016
49857,813083118253080578,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2016
49903,813083287967191041,"<p class=""TweetTextSize js-tweet-text tweet-t...",#2017
49984,813083603311554560,"<p class=""TweetTextSize js-tweet-text tweet-t...",#78


In [5]:
xmas_tweets

Unnamed: 0,ID,Metadata,hashtags
0,ID,Metadata,
1,812192389460881408,"<p class=""TweetTextSize js-tweet-text tweet-t...",
2,812192389443915776,"<p class=""TweetTextSize js-tweet-text tweet-t...",
3,812192388710105089,"<p class=""TweetTextSize js-tweet-text tweet-t...",
4,812192386583592960,"<p class=""TweetTextSize js-tweet-text tweet-t...",
...,...,...,...
50087,813084336203452426,"<p class=""TweetTextSize js-tweet-text tweet-t...",
50088,813084336006144001,"<p class=""TweetTextSize js-tweet-text tweet-t...",
50089,813084330687918080,"<p class=""TweetTextSize js-tweet-text tweet-t...",
50090,813084330608230400,"<p class=""TweetTextSize js-tweet-text tweet-t...",


In [6]:
# Extract language
xmas_tweets['language'] = xmas_tweets['Metadata'].str.extract(r'lang="(\w+)"')

In [7]:
# Extract country from hashtags
xmas_tweets['country'] = xmas_tweets['hashtags'].str.extract(r'#(\w+)')

In [8]:
# Extract tweet content
xmas_tweets['tweet_content'] = xmas_tweets['Metadata'].str.extract(r'<p.*?>(.*?)</p>')

In [9]:
# Extract emojis
xmas_tweets['emojis'] = xmas_tweets['tweet_content'].str.extractall(r'(<img .*?alt="([^"]+)"[^>]*>)').groupby(level=0)[1].agg(','.join)

In [10]:
xmas_tweets

Unnamed: 0,ID,Metadata,hashtags,language,country,tweet_content,emojis
0,ID,Metadata,,,,,
1,812192389460881408,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,"A <a href=""/hashtag/ShoutOut?src=hash"" data-qu...",
2,812192389443915776,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,Oh good lord! 2016 lays another boot in - conj...,
3,812192388710105089,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,Lookin' for the purrfect one for me <img class...,"😻,🎄,❤️,☃,❄,🌟,🎅🏻,🎁,🍭,🔔,🎄,☃,❄,💝"
4,812192386583592960,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,"<a href=""/hashtag/ff?src=hash"" data-query-sour...",
...,...,...,...,...,...,...,...
50087,813084336203452426,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,"<span class=""twitter-hashflag-container""><a hr...",
50088,813084336006144001,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,Hope everyone is having a wonderful <span clas...,
50089,813084330687918080,"<p class=""TweetTextSize js-tweet-text tweet-t...",,en,,Wearing 3 layers of highlighter today because ...,
50090,813084330608230400,"<p class=""TweetTextSize js-tweet-text tweet-t...",,et,,Merry Christmas &amp; Happy Chanukah!! <img cl...,"🎄,🎁"


In [15]:
# Extract unique emojis
unique_emojis = set()
for emojis_str in xmas_tweets['emojis'].dropna():
    emojis = emojis_str.split(',')
    unique_emojis.update(emojis)
    
len(unique_emojis)

1014

In [17]:
from collections import defaultdict


# Create a dictionary to store emojis grouped by emotion
emojis_by_emotion = defaultdict(list)

# Extract and group emojis by emotion
for emojis_str in xmas_tweets['emojis'].dropna():
    emojis = emojis_str.split(',')
    for emoji in emojis:
        # Assuming the emojis are associated with emotions, modify this logic based on your data
        # You might need to use a more sophisticated approach or an external library for emotion analysis
        # For simplicity, let's assume the emotion is the first character of the emoji
        emotion = emoji[0]
        emojis_by_emotion[emotion].append(emoji)

# Print emojis grouped by emotion
for emotion, emojis in emojis_by_emotion.items():
    print(f"{emotion} Emotion: {emojis}")

😻 Emotion: ['😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻']
🎄 Emotion: ['🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '

In [20]:
emoji_counts = {emotion: len(emojis) for emotion, emojis in emojis_by_emotion.items()}

# a list of emojis and their counts in descending order
emoji_counts_desc = sorted(emoji_counts.items(), key=lambda x: x[1], reverse=True)
emoji_counts_desc

[('🎄', 6700),
 ('🎅', 3061),
 ('🎁', 2850),
 ('❤', 1829),
 ('😂', 815),
 ('❄', 787),
 ('😍', 727),
 ('🎉', 701),
 ('✨', 637),
 ('😊', 467),
 ('☃', 463),
 ('🌲', 399),
 ('🌟', 389),
 ('⛄', 370),
 ('😘', 366),
 ('💕', 354),
 ('💩', 241),
 ('🙏', 210),
 ('🍾', 209),
 ('💚', 198),
 ('🍷', 184),
 ('🎶', 182),
 ('😁', 180),
 ('🎊', 167),
 ('💖', 167),
 ('🔥', 163),
 ('👍', 154),
 ('🙌', 140),
 ('😉', 140),
 ('😀', 134),
 ('👌', 133),
 ('☺', 129),
 ('😄', 129),
 ('💙', 127),
 ('💋', 125),
 ('♥', 122),
 ('⭐', 118),
 ('🐶', 117),
 ('😎', 116),
 ('😃', 113),
 ('😋', 112),
 ('💜', 112),
 ('🎂', 110),
 ('🤶', 107),
 ('✌', 107),
 ('🤗', 105),
 ('💝', 104),
 ('🎀', 101),
 ('💗', 97),
 ('😆', 94),
 ('😜', 94),
 ('😻', 92),
 ('🎆', 90),
 ('😇', 88),
 ('🍻', 87),
 ('👏', 87),
 ('🔔', 86),
 ('🍪', 83),
 ('💫', 81),
 ('😭', 74),
 ('💞', 74),
 ('💓', 72),
 ('🥂', 70),
 ('💛', 68),
 ('🎈', 68),
 ('💃', 66),
 ('🍭', 65),
 ('🎵', 61),
 ('🤔', 56),
 ('🍸', 56),
 ('💪', 56),
 ('🍺', 55),
 ('📷', 54),
 ('🛍', 53),
 ('😙', 53),
 ('✔', 52),
 ('🙈', 51),
 ('🇺', 51),
 ('😳', 51),


In [22]:
print("Top 5 Emojis:")
for emoji, count in emoji_counts_desc[:5]:
    print(f"{emoji}: {count}")

# Print the bottom 5 emojis
print("\nBottom 5 Emojis:")
for emoji, count in emoji_counts_desc[-5:]:
    print(f"{emoji}: {count}")

Top 5 Emojis:
🎄: 6700
🎅: 3061
🎁: 2850
❤: 1829
😂: 815

Bottom 5 Emojis:
⚱: 1
🏴: 1
➖: 1
⏩: 1
⏪: 1
