In [1]:
import pandas as pd
import csv
from bs4 import BeautifulSoup
import re
import emoji

In [2]:
# The file's delimiter is a comma, but the tweets themselves contain commas.
# Reading with a tab separator is meant to bring each line in as a single
# column, which is then split once, on the first comma only.
raw_lines = pd.read_csv('HolidayTweets.csv', sep='\t', header=None, skiprows=1)

# n=1 keeps everything after the first comma together in the second column
xmas_tweets = raw_lines[0].str.split(',', n=1, expand=True)
xmas_tweets.columns = ['ID', 'Metadata']

xmas_tweets

Unnamed: 0,ID,Metadata
0,812192389460881408,"<p class=""TweetTextSize js-tweet-text tweet-t..."
1,812192389443915776,"<p class=""TweetTextSize js-tweet-text tweet-t..."
2,812192388710105089,"<p class=""TweetTextSize js-tweet-text tweet-t..."
3,812192386583592960,"<p class=""TweetTextSize js-tweet-text tweet-t..."
4,812192386352877568,"<p class=""TweetTextSize js-tweet-text tweet-t..."
...,...,...
50086,813084336203452426,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50087,813084336006144001,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50088,813084330687918080,"<p class=""TweetTextSize js-tweet-text tweet-t..."
50089,813084330608230400,"<p class=""TweetTextSize js-tweet-text tweet-t..."


In [3]:
def extract_text(html_content):
    """Return the text content of an HTML fragment, or '' for a missing value.

    Parameters
    ----------
    html_content : str or None
        Markup to parse. ``None`` and float NaN (the two ways a missing cell
        can show up in an object-dtype pandas column) are both treated as
        "no content".

    Returns
    -------
    str
        The result of ``BeautifulSoup(html_content, 'html.parser').get_text()``,
        or an empty string when the input is missing.
    """
    # NaN is the only float that compares unequal to itself; checking it this
    # way avoids handing a float to BeautifulSoup, which expects markup.
    is_nan = isinstance(html_content, float) and html_content != html_content
    if html_content is None or is_nan:
        return ''

    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text()

# Tweet text: strip the HTML markup from every metadata cell
tweet_text = xmas_tweets['Metadata'].apply(extract_text)
xmas_tweets['tweet'] = tweet_text

# Hashtags: every '#' followed by word characters, as a list per tweet
hashtag_pattern = re.compile(r'\#\w+')
xmas_tweets['hashtag'] = xmas_tweets['tweet'].apply(lambda text: hashtag_pattern.findall(text))

# Language: the first lang="..." attribute value found in the metadata
lang_codes = xmas_tweets['Metadata'].str.extract(r'lang="(\w+)"')
xmas_tweets['language'] = lang_codes

# Lookup table: language abbreviation found in the metadata -> full language name
language_mapping = {
    'en': 'English', 'it': 'Italian', 'und': 'Undefined', 'ja': 'Japanese',
    'fr': 'French', 'et': 'Estonian', 'ro': 'Romanian', 'pl': 'Polish',
    'hi': 'Hindi', 'ru': 'Russian', 'de': 'German', 'es': 'Spanish',
    'nl': 'Dutch', 'lt': 'Lithuanian', 'el': 'Greek', 'pt': 'Portuguese',
    'ko': 'Korean', 'ta': 'Tamil', 'in': 'Indonesian', 'no': 'Norwegian',
    'tr': 'Turkish', 'tl': 'Tagalog', 'sv': 'Swedish', 'ht': 'Haitian Creole',
    'fi': 'Finnish', 'cs': 'Czech', 'cy': 'Welsh', 'uk': 'Ukrainian',
    'zh': 'Chinese', 'hu': 'Hungarian', 'da': 'Danish', 'th': 'Thai',
    'sl': 'Slovenian', 'eu': 'Basque', 'ar': 'Arabic', 'lv': 'Latvian',
    'is': 'Icelandic', 'ml': 'Malayalam', 'vi': 'Vietnamese', 'bg': 'Bulgarian',
    'te': 'Telugu', 'ne': 'Nepali', 'kn': 'Kannada', 'si': 'Sinhala',
}

# Swap each abbreviation in the language column for its full name
full_language_names = xmas_tweets['language'].replace(to_replace=language_mapping)
xmas_tweets['language'] = full_language_names

# Emojis: find every <img ... alt="..."> tag in the metadata, one result row per match.
# Capture group 0 is the whole tag; group 1 is its alt text
# (presumably the emoji character itself - the displayed column supports this).
img_matches = xmas_tweets['Metadata'].str.extractall(r'(<img .*?alt="([^"]+)"[^>]*>)')

# Collapse the matches of each tweet (index level 0) into one comma-separated string
alt_text_per_tweet = img_matches.groupby(level=0)[1].agg(','.join)

xmas_tweets['emoji'] = alt_text_per_tweet

In [4]:
# Show the frame with the derived tweet, hashtag, language and emoji columns
xmas_tweets

Unnamed: 0,ID,Metadata,tweet,hashtag,language,emoji
0,812192389460881408,"<p class=""TweetTextSize js-tweet-text tweet-t...",A #ShoutOut for all the Customer Service staff...,"[#ShoutOut, #Christmas, #WhoIsYourSanta]",English,
1,812192389443915776,"<p class=""TweetTextSize js-tweet-text tweet-t...",Oh good lord! 2016 lays another boot in - conj...,"[#conjunctivitis, #Christmas]",English,
2,812192388710105089,"<p class=""TweetTextSize js-tweet-text tweet-t...",Lookin' for the purrfect one for me #Christma...,"[#Christmas, #Elverojaguar]",English,"😻,🎄,❤️,☃,❄,🌟,🎅🏻,🎁,🍭,🔔,🎄,☃,❄,💝"
3,812192386583592960,"<p class=""TweetTextSize js-tweet-text tweet-t...",#ff #FollowFriday @Worcester_MINI @Warwick_BMW...,"[#ff, #FollowFriday, #Christmas, #BMW]",English,
4,812192386352877568,"<p class=""TweetTextSize js-tweet-text tweet-t...","I will honor #Christmas in my heart, and try t...","[#Christmas, #Dickens]",English,
...,...,...,...,...,...,...
50086,813084336203452426,"<p class=""TweetTextSize js-tweet-text tweet-t...",#Christmas so far has been nice. Good food. Mo...,[#Christmas],English,
50087,813084336006144001,"<p class=""TweetTextSize js-tweet-text tweet-t...",Hope everyone is having a wonderful #Christmas...,[#Christmas],English,
50088,813084330687918080,"<p class=""TweetTextSize js-tweet-text tweet-t...",Wearing 3 layers of highlighter today because ...,"[#Christmas, #extra]",English,
50089,813084330608230400,"<p class=""TweetTextSize js-tweet-text tweet-t...",Merry Christmas & Happy Chanukah!! #Christmas...,"[#Christmas, #merrychristmas, #christmascheer,...",Estonian,"🎄,🎁"


In [5]:
# Collect the distinct emojis across all tweets.
# The emoji column holds comma-separated strings; rows without a value are skipped.
unique_emojis = {
    symbol
    for joined in xmas_tweets['emoji'].dropna()
    for symbol in joined.split(',')
}

print(len(unique_emojis))
print(unique_emojis)

1014
{'👩🏿', '🇦🇪', '⏰', '🐫', '🏁', '⛪', '🐨', '🏙', '🏛', '💗', '🐒', '🐻', '🐎', '📹', '👰🏽', '👵🏼', '♣️', '⚪', '🚁', '🤗', '🏀', '🍂', '🐊', '#️⃣', '📣', '☃️', '📽️', '🎸', '✝️', '👨\u200d👧', '🌹', '🏳️\u200d🌈', '👋🏾', '🔵', '🛍', '😍', '👏🏼', '🐺', '😚', '⭐️', '🥛', '🏫', '✋', '💐', '💵', '🖨', '🤘🏽', '🔛', '🎅🏻', '🍴', '😝', '🐓', '🙇🏼\u200d♀️', '👍🏾', '🐸', '🤘🏻', '🚊', '▶️', '🚺', '🌏', '🌩', '🤛🏻', '🌲', '🍇', '💽', '💎', '🎁', '😳', '🙁', '💓', '🐿', '😆', '🐭', '🐥', '🇨', '🎚', '💨', '🎹', '📞', '🏃🏿\u200d♀️', '👚', '😒', '😣', '🙎', '🍹', '🚂', '🌱', '👨\u200d👩\u200d👧\u200d👦', '🚴🏻\u200d♀️', '👊🏻', '🐟', '🦌', '👒', '👌', '🌇', '🍨', '🐉', '🎽', '🇻🇪', '🌜', '🐱', '👋🏿', '🌤', '🐑', '🔪', '🔑', '🤶🏿', '💩', '😊', '🎷', '👪', '💷', '🍏', '💘', '🥐', '🇨🇽', '👍🏻', '🗾', '🏔', '🇺🇦', '🙄', '🥘', '🍘', '💪', '👜', '🌂', '👧🏻', '💮', '😮', '🍿', '🍉', '🐇', '🙅', '🇵🇸', '⬆️', '🍒', '🚍', '💒', '🏆', '🌸', '♑️', '🇸🇾', '👬', '🏄🏽', '👱🏻', '✂️', '🇰🇷', '🕒', '🥓', '🛏', '✴', '✉️', '🇧🇷', '🅾️', '👞', '✋🏽', '🐖', '🎡', '👩\u200d👩\u200d👧', '🌥', '⚡️', '🇵🇫', '🗜', '👩🏾', '🔐', '👌🏾', '🌦', '😓', '🎶', '🕺🏻', '👨\u200d👩\u200d👦\u200d👦

In [6]:
from collections import defaultdict


# Dictionary of emojis grouped by "emotion": key -> list of every occurrence
emojis_by_emotion = defaultdict(list)

# Extract and group emojis by emotion.
# For simplicity the "emotion" key is just the first character (code point) of
# the emoji string - may come back to this. Consequences visible in the outputs:
# skin-tone variants such as '🎅🏻' are counted under the base '🎅', and flag
# sequences are keyed by their first regional-indicator letter (e.g. '🇺').
for emojis_str in xmas_tweets['emoji'].dropna():
    # The loop variable is deliberately NOT called `emoji`: that name is the
    # module imported at the top of the notebook and would be overwritten.
    for symbol in emojis_str.split(','):
        emotion = symbol[0]
        emojis_by_emotion[emotion].append(symbol)

# Print emojis grouped by emotion
for emotion, emojis in emojis_by_emotion.items():
    print(f"{emotion} Emotion: {emojis}")

😻 Emotion: ['😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻', '😻']
🎄 Emotion: ['🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '🎄', '

In [7]:
# Number of emoji occurrences collected under each key of emojis_by_emotion
emoji_counts = {}
for emotion, occurrences in emojis_by_emotion.items():
    emoji_counts[emotion] = len(occurrences)

# (key, count) pairs ordered from most to least frequent
emoji_counts_desc = sorted(
    emoji_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
emoji_counts_desc

[('🎄', 6702),
 ('🎅', 3062),
 ('🎁', 2851),
 ('❤', 1829),
 ('😂', 815),
 ('❄', 787),
 ('😍', 727),
 ('🎉', 702),
 ('✨', 637),
 ('😊', 467),
 ('☃', 464),
 ('🌲', 399),
 ('🌟', 389),
 ('⛄', 370),
 ('😘', 366),
 ('💕', 354),
 ('💩', 241),
 ('🙏', 210),
 ('🍾', 209),
 ('💚', 198),
 ('🍷', 184),
 ('🎶', 182),
 ('😁', 180),
 ('🎊', 167),
 ('💖', 167),
 ('🔥', 163),
 ('👍', 154),
 ('🙌', 140),
 ('😉', 140),
 ('😀', 134),
 ('👌', 133),
 ('☺', 129),
 ('😄', 129),
 ('💙', 127),
 ('💋', 125),
 ('♥', 122),
 ('⭐', 118),
 ('🐶', 117),
 ('😎', 116),
 ('😃', 113),
 ('😋', 112),
 ('💜', 112),
 ('🎂', 110),
 ('🤶', 107),
 ('✌', 107),
 ('🤗', 105),
 ('💝', 104),
 ('🎀', 101),
 ('💗', 97),
 ('😆', 94),
 ('😜', 94),
 ('😻', 92),
 ('🎆', 90),
 ('😇', 88),
 ('🍻', 87),
 ('👏', 87),
 ('🔔', 86),
 ('🍪', 83),
 ('💫', 81),
 ('😭', 74),
 ('💞', 74),
 ('💓', 72),
 ('🥂', 70),
 ('💛', 68),
 ('🎈', 68),
 ('💃', 66),
 ('🍭', 65),
 ('🎵', 61),
 ('🤔', 56),
 ('🍸', 56),
 ('💪', 56),
 ('🍺', 55),
 ('📷', 54),
 ('🛍', 53),
 ('😙', 53),
 ('✔', 52),
 ('🙈', 51),
 ('🇺', 51),
 ('😳', 51),


In [8]:
# Print the top 5 emojis (emoji_counts_desc is sorted by count, descending).
# The loop variable is named `symbol`, not `emoji`, so the `emoji` module
# imported at the top of the notebook is not overwritten at notebook scope.
print("Top 5 Emojis:")
for symbol, count in emoji_counts_desc[:5]:
    print(f"{symbol}: {count}")

# Print the bottom 5 emojis
print("\nBottom 5 Emojis:")
for symbol, count in emoji_counts_desc[-5:]:
    print(f"{symbol}: {count}")

Top 5 Emojis:
🎄: 6702
🎅: 3062
🎁: 2851
❤: 1829
😂: 815

Bottom 5 Emojis:
⚱: 1
🏴: 1
➖: 1
⏩: 1
⏪: 1


In [9]:
# Playing with a language processor (TextBlob) on a sample sentence

from textblob import TextBlob

sentence = "I love using emojis!"
blob = TextBlob(sentence)

# Polarity: positive values indicate positive sentiment, negative values negative sentiment
sentiment_polarity = blob.sentiment.polarity
# Subjectivity: 0 is objective, 1 is subjective
sentiment_subjectivity = blob.sentiment.subjectivity

print(f"Sentiment Polarity: {sentiment_polarity}")
print(f"Sentiment Subjectivity: {sentiment_subjectivity}")

Sentiment Polarity: 0.625
Sentiment Subjectivity: 0.6
