# Script to scrape old twitter posts older than 1-week
<b>Date Created:</b> 5 July 2022<br>
<b>Modifications:</b>
- 6 August 2022: Cleaning and tokenization of tweets

<b>Components:</b>
- Scraping
- Cleaning
- Tokenization

<b>Output:</b> CSV<br>

In [1]:
# Uncomment for installation of tweet-preprocessor package
#import sys
#!{sys.executable} -m pip install nltk

In [2]:
# Uncomment to install snscrape
#!{sys.executable} -m pip install snscrape

In [3]:
# Uncomment to install tweepy
#!{sys.executable} -m pip install tweepy

# Scraping data from tweets based on keywords

<b>Options available include:</b>
- Keyword to search for
- Start date and end date for tweets to be scraped
- Output file name
- Number of tweets to be scraped

In [4]:
# Import scraping libraries
import tweepy
import csv
import pandas as pd

In [5]:
# Provide your own credentials here.
consumer_key = "***"
consumer_secret = "***"
access_token = "***"
access_token_secret = "***"

In [6]:
# Import sns scraper for twitter
import snscrape.modules.twitter as sntwitter

In [7]:
# SETTING: Update your query keyword, start date, end date, file name to be saved, number of tweets to scrape
keyword = "BlueSG"
start_dt = "2022-01-01"
end_dt = "2022-04-07"
file_name = "BlueSG_tweets_cleaned.csv"
num_tweets_to_scrape = 100

In [8]:
# Creates a csv in which you want to store the data.
csvFile = open(file_name, 'a') 
csvWriter = csv.writer(csvFile)

In [9]:
# Iterate twitter and scrape tweets based on keyword
for i,tweet in enumerate(sntwitter.TwitterSearchScraper(keyword + " since:" + start_dt + " until:" + end_dt).get_items()) :
        if i > num_tweets_to_scrape :
            break
        print(tweet.date) # To confirm scraping is performed
        
        csvWriter.writerow([tweet.date, tweet.content.encode("utf-8")]) # Write to csv

2022-04-06 03:27:50+00:00
2022-04-06 03:27:49+00:00
2022-04-06 03:23:26+00:00
2022-04-05 04:57:01+00:00
2022-04-04 08:28:44+00:00
2022-04-04 06:23:26+00:00
2022-04-03 17:46:20+00:00
2022-04-02 13:47:35+00:00
2022-04-02 04:53:18+00:00
2022-03-28 15:37:31+00:00
2022-03-27 10:07:23+00:00
2022-03-27 08:26:16+00:00
2022-03-24 01:28:05+00:00
2022-03-22 09:29:15+00:00
2022-03-21 15:27:15+00:00
2022-03-21 11:35:28+00:00
2022-03-21 04:11:03+00:00
2022-03-21 04:11:03+00:00
2022-03-14 16:22:38+00:00
2022-03-14 04:55:31+00:00
2022-03-13 09:59:47+00:00
2022-03-10 15:56:01+00:00
2022-03-08 10:02:28+00:00
2022-03-07 15:22:15+00:00
2022-03-07 05:05:34+00:00
2022-03-07 04:48:46+00:00
2022-03-07 04:29:43+00:00
2022-03-07 04:27:48+00:00
2022-03-07 04:26:54+00:00
2022-03-07 03:09:56+00:00
2022-03-03 05:19:57+00:00
2022-03-02 11:25:37+00:00
2022-02-28 13:19:29+00:00
2022-02-27 16:28:33+00:00
2022-02-26 12:02:00+00:00
2022-02-26 09:20:55+00:00
2022-02-26 08:57:54+00:00
2022-02-26 08:52:04+00:00
2022-02-26 0

In [10]:
# Close File
csvFile.close()

# Data Cleaning for data output for analysis

<b>Cleaning performed includes:</b>
- Decoding HTML
- Removal of links, hash, username and punctuations
- Removal of stopwords

In [11]:
# Import cleaning libraries
import html
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
# Uncomment if nltk library installed does not contain 'english' resources or function to tokenize
#import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

In [13]:
# SETTING: Update your critical words that could be a common stop word
criticals = ['not']

In [14]:
# Open and rename columns
df = pd.read_csv(file_name, header = None)
df.columns = ["Date", "Tweets"]

In [16]:
# Replacing newline with spaces
for i in range(len(tweets)):
    x = tweets[i].replace("\n"," ")
    
    # Removal of html script
    tweets[i] = html.unescape(x)

In [17]:
# Cleaning tweet with regex - Removing starting letter b, Username, media & website
for i in range(len(tweets)):
    tweets[i] = re.sub(r"^b|(@[A-Za-z0–9_]+)|[^\w\s]|http\S+", "", tweets[i])

In [18]:
# Prearation of stopwords
sw = stopwords.words('english')

# Removing words that could be a common stopword but important in tweet
for critical in criticals:
    sw.remove(critical)

In [19]:
# Tokenizing the tweets
for i in range(len(tweets)):
    tweets[i] = word_tokenize(tweets[i])

In [20]:
# Removal of stopwords from tweets
for i in range(len(tweets)):
    tweets[i] = [word for word in tweets[i] if not word in sw]

In [21]:
# Replacing df tweet columns with tokenized cleaned tweets
df['Tweets'] = tweets
df.head()

Unnamed: 0,Date,Tweets
0,2022-04-06 03:27:50+00:00,"[people, use, blueSG, portable, motel, help, d..."
1,2022-04-06 03:27:49+00:00,"[anyone, WA, wan, na, lend, car, D, 0, acciden..."
2,2022-04-06 03:23:26+00:00,"[BlueSG, car, spotted, Woodlands, Checkpoint, ..."
3,2022-04-05 04:57:01+00:00,"[I, wouldnt, caught, dead, driving, bluesg]"
4,2022-04-04 08:28:44+00:00,"[yoongi, got, drivers, license, car, leh, blue..."


In [22]:
# Overwrite & save file back into csv. for further analysis
df.to_csv(file_name, index = False)