# Data Crawler for Video Games

In [15]:
# get libraries
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
import time
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from cleantext import clean
from tqdm.notebook import tqdm

In [3]:
# Python Dictionary of Countries mapped to their Continents
countriesDict = {
    "India":"Asia","Singapore":"Asia","South Korea":"Asia",
    "United States":"North America","Mexico":"North America", "Canada":"North America", 
    "Brazil":"South America", "Argentina":"South Amercia", "Uruguay": "South America",
    "Ireland":"Europe","Argentina":"Europe","Germany":"Europe","United Kingdom":"Europe",
    "Italy":"Europe","Spain":"Europe","Switzerland":"Europe", "France":"Europe","Norway": "Europe",
    "Sweden": "Europe","Denmark": "Europe","Belgium":"Europe","Portugal":"Europe",
    "Netherlands": "Europe", "Croatia": "Europe",
    "South Africa":"Africa","Egypt":"Africa","Nigeria": "Africa",
    "Australia":"Australia","New Zealand":"Australia"
}

num_tweets_per_tag = 1000

In [52]:
# Keywords
keywords = [ 
    # hashtags
    '#HogwartsLegacy','#hogwartslegacy','#HogwartsLegacyGAME','#hogwartslegacygame',
    '#HogwartLegacy','#harrypottergame','#HogwartsLegacyGame','#hogwartslegacydrops',
    '#HOGWARTSLEGACY'
]

users = {
    # users
    'HogwartsLegacy', 'Hogwarts_Legacy', 'HogwartsLegacy_', 'HogLegNews'
}

In [68]:
def scrape_data(num_tweets_per_tag=100):
    start = time.time()
    df = pd.DataFrame()
    for word in keywords:
        try:
            print(f"Scraping for keyword: {word}")
            df = df.append(pd.DataFrame(itertools.islice(tqdm(sntwitter.TwitterSearchScraper(
                f'{word} lang:en since:2023-02-10 until:2023-02-19').get_items()), num_tweets_per_tag)))
        except Exception as e:
            print(f"An error occured: :(\n")
            continue
    
    for user in users:
        try:
            print(f"Scraping for user: {user}")
            df = df.append(pd.DataFrame(itertools.islice(tqdm(sntwitter.TwitterSearchScraper(
                f'from:{user} lang:en since:2023-02-10 until:2023-02-19').get_items()), num_tweets_per_tag)))
        except Exception as e:
            print(f"An error occured: :(\n")
            continue
    # if len(df) < 150:
    #     print(f"Number of tweets for {countryName} is lower than expected! df shape: {df.shape}")
    df['username'] =  df['user'].apply(lambda x: x['username'])
    df['verified'] = df['user'].apply(lambda x:x['verified'])
    df['followersCount'] = df['user'].apply(lambda x:x['followersCount'])
    df_ = df[["username","verified","followersCount", "rawContent", "date", "replyCount", "retweetCount", "likeCount", "url", "hashtags"]]
    print(f"Shape of df {df_.shape}, Time taken: {((time.time() - start)/60):.1f} mins")
    return df_

In [92]:
df = scrape_data(10000)

Scraping for keyword: #HogwartsLegacy


0it [00:00, ?it/s]

Scraping for keyword: #hogwartslegacy


0it [00:00, ?it/s]

Scraping for keyword: #HogwartsLegacyGAME


0it [00:00, ?it/s]

Scraping for keyword: #hogwartslegacygame


0it [00:00, ?it/s]

Scraping for keyword: #HogwartLegacy


0it [00:00, ?it/s]

Scraping for keyword: #harrypottergame


0it [00:00, ?it/s]

Scraping for keyword: #HogwartsLegacyGame


0it [00:00, ?it/s]

Scraping for keyword: #hogwartslegacydrops


0it [00:00, ?it/s]

Scraping for keyword: #HOGWARTSLEGACY


0it [00:00, ?it/s]

Scraping for user: HogLegNews


0it [00:00, ?it/s]

Scraping for user: HogwartsLegacy_


0it [00:00, ?it/s]

Scraping for user: Hogwarts_Legacy


0it [00:00, ?it/s]

Scraping for user: HogwartsLegacy


0it [00:00, ?it/s]

Shape of df (48407, 10), Time taken: 41.0 mins


In [93]:
df

Unnamed: 0,username,verified,followersCount,rawContent,date,replyCount,retweetCount,likeCount,url,hashtags
0,Slev_86,False,1971,Will be #streaming tomorrow around 10 AM EST o...,2023-02-18 23:58:28+00:00,1,1,16,https://twitter.com/Slev_86/status/16270952344...,"[streaming, HogwartsLegacy]"
1,RasmusVesik,False,27,I really love this piece of art from #Hogwarts...,2023-02-18 23:57:27+00:00,0,0,0,https://twitter.com/RasmusVesik/status/1627094...,[HogwartsLegacy]
2,ETalkUK,False,768,#HogwartsLegacy And #TheWorldCup Vs The World ...,2023-02-18 23:56:29+00:00,0,0,0,https://twitter.com/ETalkUK/status/16270947360...,"[HogwartsLegacy, TheWorldCup, VideoGames, Podc..."
3,KaleLikeTheLeaf,False,263,#HogwartsLegacy Ravenclaw Playthrough is Now L...,2023-02-18 23:55:54+00:00,0,0,6,https://twitter.com/KaleLikeTheLeaf/status/162...,[HogwartsLegacy]
4,BeamtupG,False,22,Continuing my (evil) exploration through the #...,2023-02-18 23:54:26+00:00,0,0,0,https://twitter.com/BeamtupG/status/1627094219...,"[WizardingWorld, HogwartsLegacy, pc, streaming..."
...,...,...,...,...,...,...,...,...,...,...
44,HogwartsLegacy,True,490274,Be the envy of all your classmates with these ...,2023-02-10 20:01:36+00:00,228,222,2721,https://twitter.com/HogwartsLegacy/status/1624...,[HogwartsLegacy]
45,HogwartsLegacy,True,490274,The magic is finally here. What kind of Legacy...,2023-02-10 18:00:45+00:00,545,1036,11478,https://twitter.com/HogwartsLegacy/status/1624...,[HogwartsLegacy]
46,HogwartsLegacy,True,490274,Don't miss your last chance to don Merlin's Cl...,2023-02-10 16:15:29+00:00,560,405,3830,https://twitter.com/HogwartsLegacy/status/1624...,[HogwartsLegacy]
47,HogwartsLegacy,True,490274,Don't want to watch on Twitch? You can also ca...,2023-02-10 15:55:59+00:00,17,34,439,https://twitter.com/HogwartsLegacy/status/1624...,


In [94]:
hashtags = []
for i in df['hashtags']:
    try:
        for j in i:
            if j not in hashtags:
                hashtags.append(j)
    except:
        continue

usernames = []
for i in df['username']:
    try:
        if i not in usernames:
            usernames.append(i)
    except:
        continue
# print(usernames)

In [95]:
print([x for x in hashtags if 'hogwarts' in x.lower()])
print([x for x in usernames if 'hogwart' in x.lower()])
# hashtags = ['HogwartsLegacy','hogwartslegacy','HogwartsLegacyGAME','hogwartslegacygame',
#             'HogwartLegacy','harrypottergame','HogwartsLegacyGame','hogwartslegacydrops',
#             'HOGWARTSLEGACY']

['HogwartsLegacy', 'HOGWARTSLEGACY', 'HogwartsLegacyGAME', 'Hogwarts', 'hogwartslegacy', 'hogwartsismyhome', 'Hogwartslegacy', 'Hogwartschallange', 'ClownPrinceOfHogwarts', 'hogwarts', 'hogwartsexpress', 'hogwartslegacygame', 'hogwartslegacygameplay', 'HogwartsLegacyboycott', 'HogwartsLegacyPs5', 'HogwartsLegacydrops', 'HOGWARTS', 'HogwartsLegacyPatrocinado', 'hogwartsLegacy', 'HogwartsLegacyGame', 'HogwartsLegacypc', 'HogwartsGame', 'HogwartsCastle', 'HogwartsLegacyVP', 'HogwartsGameplay', 'HogwartsLegacyPatrocinad', 'Hogwartslegacygame', 'HogwartsLegacyDemo', 'HogwartsLegacyLHéritagedePoudlard', 'HogwartsLegacyPC', 'HogwartsHouse', 'HogwartsHouses', 'hogwartslegacymeme', 'hogwartshouses', 'hogwartsprofessors', 'HogwartsLegacyphotography', 'HogwartsMystery', 'HogwartsLegacyfr', 'hogwartsgame', 'HogwartsLegacyLive', 'BoycottHogwartsLegacy', 'HogwartsLegacyBoycott', 'hogwartslegacythegame', 'hogwartslegacycommunity', 'hogwartslegacyfan', 'hogwartsgamer', 'hogwartsschool', 'BackToHogwart

In [96]:
# Cleaning Data
df_indexes_v2 = []
user_dict = {}
for i in range(len(df)):
    tweet = df["rawContent"].iloc[i]
    
    # To remove tweets that have more hashtags than normal text
    word_list = tweet.lower().split()
    num_normal = 0
    num_tags = 0
    for j in range(len(word_list)):
        temp = word_list[j]
        if temp[0] == '#':
            num_tags += 1
        else:
            num_normal += 1
    if num_tags > num_normal:
        continue
print(f'Shape of df after cleaning: {df.shape}')

Shape of df after cleaning: (48407, 10)


In [97]:
df_v2 = df

In [90]:
print(df_v2)

           username  verified  followersCount  \
0           Slev_86     False            1971   
1       RasmusVesik     False              27   
2           ETalkUK     False             768   
3   KaleLikeTheLeaf     False             263   
4          BeamtupG     False              22   
..              ...       ...             ...   
44   HogwartsLegacy      True          490235   
45   HogwartsLegacy      True          490235   
46   HogwartsLegacy      True          490235   
47   HogwartsLegacy      True          490235   
48   HogwartsLegacy      True          490235   

                                           rawContent  \
0   Will be #streaming tomorrow around 10 AM EST o...   
1   I really love this piece of art from #Hogwarts...   
2   #HogwartsLegacy And #TheWorldCup Vs The World ...   
3   #HogwartsLegacy Ravenclaw Playthrough is Now L...   
4   Continuing my (evil) exploration through the #...   
..                                                ...   
44  Be the e

In [100]:
# Shuffling tweets in version 2 of the dataframe, and saving to a CSV file
df_v2 = df_v2.drop_duplicates(subset='rawContent')
df_v2 = df_v2.sample(frac=1).reset_index(drop=True)
df_v2['date'] = df_v2['date'].dt.tz_localize(None)
print(df_v2.shape)
# df_v2.to_csv("HGL_crawled.csv", index=False)

(21686, 10)


## Data Preprocessing

# download wordlists for nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'  

# text = re.sub(emoji.get_emoji_regexp(), r"", text)

def remove_links(tweet):
    """Takes a string and removes web links from it"""
    tweet = re.sub(r'http\S+', '', tweet)   # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet)  # remove bitly links
    tweet = tweet.strip('[link]')   # remove [links]
    tweet = re.sub(r'pic.twitter\S+','', tweet)
    return tweet

def remove_users(tweet):
    """Takes a string and removes retweet and @user information"""
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove re-tweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove tweeted at
    return tweet

def remove_hashtags(tweet):
    """Takes a string and removes any hash tags"""
    tweet = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet)  # remove hash tags
    return tweet

def remove_av(tweet):
    """Takes a string and removes AUDIO/VIDEO tags or labels"""
    tweet = re.sub('VIDEO:', '', tweet)  # remove 'VIDEO:' from start of tweet
    tweet = re.sub('AUDIO:', '', tweet)  # remove 'AUDIO:' from start of tweet
    return tweet

def tokenize(tweet):
    """Returns tokenized representation of words in lemma form excluding stopwords"""
    result = []
    
    tweet = clean(tweet, no_emoji=True)
    for token in word_tokenize(tweet):
        if token not in stopwords.words('english') \
                and len(token) > 2:  # drops words with less than 3 characters
            result.append(lemmatize(token))
    return result

def lemmatize(token):
    """Returns lemmatization of a token"""
    return WordNetLemmatizer().lemmatize(token, pos='v')

def preprocess_tweet(tweet):
    """Main master function to clean tweets, stripping noisy characters, and tokenizing use lemmatization"""
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = remove_hashtags(tweet)
    tweet = remove_av(tweet)
    tweet = tweet.lower()  # lower case
    tweet = re.sub('[' + punctuation + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub('\s+', ' ', tweet)  # remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet)  # remove numbers
    tweet_token_list = tokenize(tweet)  # apply lemmatization and tokenization
    tweet = ' '.join(tweet_token_list)
    return tweet

In [None]:
df['cleaned_text'] = df['content'].apply(preprocess_tweet)