# EPL_Tweets_Scraper
- A script that runs multiple search queries against Twitter and collects the matching tweets.
- We use this script to collect tweets about Arsenal and Tottenham for 380 games spread across 5 different seasons.

In [1]:
## Import Required Packages.
from itertools import islice

import pandas as pd
import snscrape.modules.twitter as sntwitter

### Load our Team_Stats DataFrame containing information on which day a team played and if they won.
# Forward slashes resolve on Windows as well as POSIX systems, so the notebook
# is not tied to one operating system.
team_stats = pd.read_csv('../Data/Team_Stats2.csv')


## Formatting Date
# Parse once, then render as YYYY-MM-DD (the form placed after `until:` in the
# search queries). strftime replaces the previous str()/slice approach, which
# only worked when every timestamp rendered with exactly a 9-character
# " HH:MM:SS" suffix. The earlier second to_datetime call used the invalid
# format string '%yyyy-%mm-%dd' on already-parsed values and has been dropped.
# NOTE(review): unparseable/missing dates become NaN here (previously an empty
# string) — either way they produce an unusable query; confirm the CSV has none.
team_stats['Next Day'] = pd.to_datetime(team_stats['Next Day']).dt.strftime('%Y-%m-%d')

### Creating combined query column and converting to list.
# Each element of query_list is a [team, date-string] pair consumed by get_queries().
team_stats['combined'] = team_stats[['Team', 'Next Day']].values.tolist()
query_list = list(team_stats['combined'])

## Defining get_queries
def get_queries(pairs=None):
    '''Returns a list of queries we want to run in the Twitter Scraper.

    Parameters
    ----------
    pairs : iterable of indexable, optional
        Items whose first element is the search term (team) and whose second
        element is the date string placed after `until:`. When omitted, the
        module-level `query_list` is used, matching the original behaviour.

    Returns
    -------
    list of str
        One "<term> until:<date>" query string per item, in input order.
    '''
    if pairs is None:
        # Fall back to the notebook-level list so existing zero-argument calls still work.
        pairs = query_list
    # Index access (rather than unpacking) keeps tolerating items longer than two elements.
    return ["{} until:{}".format(pair[0], pair[1]) for pair in pairs]

## Defining our get_tweets function
def get_tweets(query, limit=1000):
    '''Function returns a list of tweets for a single search query.

    Parameters
    ----------
    query : str
        Search string handed to `sntwitter.TwitterSearchScraper`.
    limit : int or None, optional
        Maximum number of tweets to collect (default 1000). `None` or a
        negative value means "no cap", which is how the original equality
        check behaved for those inputs.

    Returns
    -------
    list of list
        One row per tweet: [date, username, content, likeCount, replyCount,
        retweetCount]. May contain fewer than `limit` rows when the search
        yields fewer results.
    '''
    cap = limit if limit is not None and limit >= 0 else None
    scraper = sntwitter.TwitterSearchScraper(query)
    # islice stops after `cap` items without pulling the next one from the
    # generator; the previous loop always fetched one extra tweet before
    # noticing the limit had been reached.
    # NOTE(review): newer snscrape releases appear to rename `content` to
    # `rawContent` — confirm against the installed version.
    return [
        [tweet.date,
         tweet.user.username,
         tweet.content,
         tweet.likeCount,
         tweet.replyCount,
         tweet.retweetCount]
        for tweet in islice(scraper.get_items(), cap)
    ]


## Build every search query up front, then scrape them one by one.
all_queries = get_queries()

# Rows from all queries are pooled into a single flat list, in query order.
# A plain loop (not a comprehension) keeps whatever was gathered so far in
# `records` if a scrape raises part-way through.
records = []
for search in all_queries:
    records += get_tweets(search)

In [27]:
## Turn the scraped rows into a DataFrame, one column per collected tweet attribute.
tweet_columns = ['Date', 'User', 'Tweet', 'TweetLikes', 'TweetReplies', 'RetweetCount']
df_tweets = pd.DataFrame(data=records, columns=tweet_columns)
df_tweets.shape

(370000, 6)

In [28]:
## Get all the queries
def queries_columns(queries=None, limit=1000):
    '''Repeats every query `limit` times so it can sit next to the tweets column.

    Parameters
    ----------
    queries : iterable of str, optional
        Queries to repeat. Defaults to the module-level `all_queries`.
    limit : int, optional
        Number of copies per query (default 1000). This has to match the
        `limit` used in get_tweets, and the result only lines up row-for-row
        with the scraped tweets if every query returned exactly `limit` rows.

    Returns
    -------
    list of str
        Each query repeated `limit` times consecutively, in input order.
    '''
    if queries is None:
        # Preserve the original zero-argument behaviour.
        queries = all_queries
    return [query for query in queries for _ in range(limit)]

# One entry per expected tweet row: each query repeated as many times as the function's limit.
large_list = queries_columns()

In [29]:
## Wrap the repeated queries in a single-column DataFrame named 'Query2'.
query_column = ['Query2']
df_queries = pd.DataFrame(data=large_list, columns=query_column)

In [30]:
## We combine our tweets dataframe with the respective queries used to get them.
# The concat below pairs rows purely by position, so it is only correct when
# both frames have the same number of rows. A mismatch means at least one query
# returned a different number of tweets than assumed, and every later row would
# be labelled with the wrong query — fail loudly instead of producing a
# silently misaligned (NaN-padded) frame.
if len(df_queries) != len(df_tweets):
    raise ValueError(
        "Row count mismatch: {} query labels vs {} tweets; "
        "queries and tweets cannot be aligned by position.".format(len(df_queries), len(df_tweets))
    )
combined = pd.concat([df_queries, df_tweets], axis=1)
combined.head()

Unnamed: 0,Query2,Date,User,Tweet,TweetLikes,TweetReplies,RetweetCount
0,Arsenal until:2022-04-24,2022-04-23 23:59:47+00:00,ArendseRiyaaz,@JackAFC01 @LUHG450 @1Thegameis Because you're...,1,1,0
1,Arsenal until:2022-04-24,2022-04-23 23:59:41+00:00,meieraberehok,@arsenal_lady bei ihm werde ich einfach immer ...,1,0,0
2,Arsenal until:2022-04-24,2022-04-23 23:59:39+00:00,NeilDenAFC,5 games to go\n\n5 cup finals \n\n5 games to U...,0,0,0
3,Arsenal until:2022-04-24,2022-04-23 23:59:37+00:00,lomekian,@Arsenal @HectorBellerin VAMOS @HectorBellerin...,18,0,0
4,Arsenal until:2022-04-24,2022-04-23 23:59:32+00:00,FirstOfficerMax,@Cristiano Come to @Arsenal 🐐.. so many assist...,0,0,0


In [32]:
## We merge the df_queries with our team_stats to see if a team won on that day or not.
# NOTE(review): 'Query2' is not created on team_stats anywhere in the visible
# cells — presumably it already exists in the CSV; confirm.
# validate='many_to_one' makes pandas raise if a Query2 value appears more than
# once in team_stats, which would otherwise silently duplicate tweet rows.
df = pd.merge(
    combined,
    team_stats[['Query2', 'Result', 'Team']],
    on='Query2',
    how='inner',
    validate='many_to_one',
)

In [36]:
## Encode the Result column as a binary flag: wins become 1, losses and draws become 0.
result_codes = {'W': 1, 'L': 0, 'D': 0}
df.loc[:, 'Result'] = df['Result'].replace(result_codes)
df.sample(5)

Unnamed: 0,Query2,Date,User,Tweet,TweetLikes,TweetReplies,RetweetCount,Result,Team
257152,Tottenham until:2018-12-27,2018-12-26 23:23:38+00:00,STORMTIGER5,@HKane @premierleague @SpursOfficial Can't sta...,0,0,0,1,Tottenham
243691,Arsenal until:2019-02-10,2019-02-09 22:39:51+00:00,BassiGooner,@reggie_10_ @Arsenal @premierleague I’m glad t...,0,1,0,1,Arsenal
187289,Tottenham until:2019-12-08,2019-12-07 23:16:58+00:00,3boyz2feed,I don’t think Buzz likes Fonzi too much! #elf...,0,0,0,1,Tottenham
229396,Tottenham until:2019-04-14,2019-04-13 21:48:44+00:00,Z_PearsonTHFC,Piece of trivia to remember - first hattrick s...,39,0,1,1,Tottenham
74987,Arsenal until:2021-05-03,2021-05-02 21:49:57+00:00,levingonzalo,desde el empate con arsenal se sabía que iba a...,0,0,0,1,Arsenal


In [37]:
## Save the DataFrame
# Forward slashes point at the same ../Data location but avoid the invalid
# escape sequences (`\.`, `\D`, `\F`) the old non-raw backslash string relied
# on, and work on non-Windows systems too. The index is still written, as before.
df.to_csv('../Data/Finalized_DataFrame_All_Data_2.csv')

In [34]:
## Check the Shape
# Displays (rows, columns) of the merged frame as the cell output.
df.shape

(370000, 9)