# EPL_Tweets_Scraper
- A script that runs multiple search queries against Twitter and collects the matching tweets.
- We use this script to collect tweets mentioning Arsenal and Tottenham for 380 games spread across 5 different seasons.

In [1]:
## Import Required Packages.
import snscrape.modules.twitter as sntwitter
import pandas as pd

### Load the Team_Stats DataFrame: it records which day each team played and the result.
team_stats = pd.read_csv(r'..\Data\Team_Stats.csv')


## Normalise 'Date' to a plain 'YYYY-MM-DD' string, the form used after "until:" in the queries.
## strftime states the target format explicitly instead of slicing a fixed number of
## characters off str(Timestamp), which silently breaks if the rendering ever changes.
team_stats['Date'] = pd.to_datetime(team_stats['Date']).dt.strftime('%Y-%m-%d')

### Pair every team with its match date; each [Team, Date] pair becomes one search query.
team_stats['combined'] = team_stats[['Team','Date']].values.tolist()
query_list = team_stats['combined'].tolist()

## Defining get_queries
def get_queries(pairs=None):
    '''Return the list of search queries to run in the Twitter scraper.

    Every query has the form "<team> until:<date>".

    Args:
        pairs: Optional iterable of indexable (team, date) pairs. When omitted,
            the module-level ``query_list`` is used, so a bare ``get_queries()``
            call behaves exactly as before.

    Returns:
        A list of query strings, one per pair, in input order.
    '''
    if pairs is None:
        pairs = query_list
    return ["{} until:{}".format(pair[0], pair[1]) for pair in pairs]

## Defining our get_tweets function
def get_tweets(query, limit=100):
    '''Scrape tweets matching ``query`` and return them as a list of rows.

    Args:
        query: Search string handed to ``sntwitter.TwitterSearchScraper``.
        limit: Number of rows at which collection stops (default 100).

    Returns:
        A list of rows, each of the form
        [date, username, content, likeCount, replyCount, retweetCount],
        in the order the scraper yields them.
    '''
    scraper = sntwitter.TwitterSearchScraper(query)
    rows = []
    for item in scraper.get_items():
        # Stop as soon as the requested number of rows has been gathered.
        if len(rows) == limit:
            break
        row = [
            item.date,
            item.user.username,
            item.content,
            item.likeCount,
            item.replyCount,
            item.retweetCount,
        ]
        rows.append(row)
    return rows


## Build every search query we want to run.
all_queries = get_queries()

## Scrape the queries one after another, gathering all tweet rows into one flat
## list that follows the order of all_queries.
records = []
for query in all_queries:
    records += get_tweets(query)

In [2]:
## Wrap the scraped rows in a DataFrame, one column per scraped tweet field.
df_tweets = pd.DataFrame(
    data=records,
    columns=['Date', 'User', 'Tweet', 'TweetLikes', 'TweetReplies', 'RetweetCount'],
)
df_tweets.shape

(37000, 6)

In [3]:
## Repeat every query once per expected tweet row
def queries_columns(queries=None, limit=100):
    '''Return each query repeated ``limit`` times, in query order.

    The result is meant to label tweet rows with the query that produced
    them, so ``limit`` has to match the limit used in get_tweets.

    NOTE(review): this labelling is only correct when every query returned
    exactly ``limit`` tweets; a query that yields fewer shifts the labels
    of all rows after it.

    Args:
        queries: Optional iterable of query strings. When omitted, the
            module-level ``all_queries`` is used, so a bare
            ``queries_columns()`` call behaves exactly as before.
        limit: Number of times each query is repeated (default 100).

    Returns:
        A flat list containing ``limit`` copies of each query.
    '''
    if queries is None:
        queries = all_queries
    return [query for query in queries for _ in range(limit)]

## One query label per expected tweet row: every query repeated once per tweet slot.
large_list = queries_columns()

In [4]:
## Single-column DataFrame holding the query label for each tweet row.
df_queries = pd.DataFrame(data=large_list, columns=['Query'])

In [5]:
## Place each tweet next to the query used to fetch it by joining the two frames column-wise.
## NOTE(review): rows are paired purely by position, so this assumes df_queries and
## df_tweets have the same length and order -- confirm every query returned a full batch.
combined = pd.concat(objs=[df_queries, df_tweets], axis='columns')
combined.head()

Unnamed: 0,Query,Date,User,Tweet,TweetLikes,TweetReplies,RetweetCount
0,Arsenal until:2022-04-23,2022-04-22 23:59:56+00:00,RoweThings,Can’t seem to find any serious buyers tonight ...,0,3,0
1,Arsenal until:2022-04-23,2022-04-22 23:59:54+00:00,Ancurancuran01,Gabsus solusi masalah striker Arsenal kah?? ht...,0,0,0
2,Arsenal until:2022-04-23,2022-04-22 23:59:54+00:00,celsky07,@el3estuvodemas @m_delprado_7 @agus____1974 @j...,1,1,0
3,Arsenal until:2022-04-23,2022-04-22 23:59:53+00:00,fachry4,Payah nih yang ngancem bom. Jadi ga maen dah d...,0,0,0
4,Arsenal until:2022-04-23,2022-04-22 23:59:51+00:00,GunnersDXB,If this happens the arsenal are acc back. http...,1,0,0


In [6]:
## Attach Result and Team to every tweet by joining on the query string, so we can
## see whether the team won on that day or not.
## No cell in this notebook creates team_stats['Query']; when the CSV does not
## already provide it, derive it from Team and Date in the same
## "<Team> until:<Date>" form that get_queries produces.
if 'Query' in team_stats.columns:
    match_info = team_stats
else:
    match_info = team_stats.assign(
        Query=team_stats['Team'].astype(str) + ' until:' + team_stats['Date'].astype(str)
    )
df = pd.merge(combined, match_info[['Query','Result','Team']], on='Query', how='inner')

In [7]:
## Encode Result as a binary label: 1 for a win ('W'), 0 for a loss ('L') or a draw ('D').
result_codes = {'W': 1, 'L': 0, 'D': 0}
df.loc[:, 'Result'] = df['Result'].replace(result_codes)
df.head()

Unnamed: 0,Query,Date,User,Tweet,TweetLikes,TweetReplies,RetweetCount,Result,Team
0,Arsenal until:2022-04-23,2022-04-22 23:59:56+00:00,RoweThings,Can’t seem to find any serious buyers tonight ...,0,3,0,1,Arsenal
1,Arsenal until:2022-04-23,2022-04-22 23:59:54+00:00,Ancurancuran01,Gabsus solusi masalah striker Arsenal kah?? ht...,0,0,0,1,Arsenal
2,Arsenal until:2022-04-23,2022-04-22 23:59:54+00:00,celsky07,@el3estuvodemas @m_delprado_7 @agus____1974 @j...,1,1,0,1,Arsenal
3,Arsenal until:2022-04-23,2022-04-22 23:59:53+00:00,fachry4,Payah nih yang ngancem bom. Jadi ga maen dah d...,0,0,0,1,Arsenal
4,Arsenal until:2022-04-23,2022-04-22 23:59:51+00:00,GunnersDXB,If this happens the arsenal are acc back. http...,1,0,0,1,Arsenal


In [8]:
## Save the DataFrame
## The path is a raw string so its backslashes are taken literally; a plain literal
## here contains invalid escape sequences, which newer Python versions warn about.
## The resulting path is unchanged.
df.to_csv(r'.\..\Data\Finalized_DataFrame.csv')

In [9]:
## Check the shape of the final DataFrame: (number of rows, number of columns).
df.shape

(37000, 9)