In [1]:
# import libraries
import pandas as pd
import json
from tqdm import tqdm
import snscrape.modules.twitter as twtr
from pymongo import MongoClient

In [2]:
# Parameters shared by the scraping and exchange-rate cells
TWITTER_HANDLE = 'petrogustavo' # Gustavo Petro's Twitter handle (without the leading @)
BEGIN_DATE = '2022-08-07' # First day of Gustavo Petro's presidency, as a 'YYYY-MM-DD' string
END_DATE = '2023-06-24' # End of the studied period ('YYYY-MM-DD'); hard-coded to the day the notebook was written

# Section 1. Scraping Tweets

For this section we'll use the snscrape package, which allows us to extract tweets from Twitter without needing access to the Twitter API. The package is available at https://github.com/JustAnotherArchivist/snscrape.

We start by using the TwitterProfileScraper class to obtain the tweets posted on the profile of the user @petrogustavo. The scraper takes the username as input, and its get_items() method returns an iterable of tweet objects. For each tweet we build a dictionary with the fields we need and save it to data/tweets.json, stopping once we reach tweets older than the first day of the presidency.

In [7]:
import datetime
import sys
import json

# Expected order of magnitude of tweets; it only sizes the progress bar and
# does NOT cap how many tweets are scraped.
NUMBER_OF_TWEETS = int(10_000)

# Iterate over the profile timeline of TWITTER_HANDLE.
profile_scraper = twtr.TwitterProfileScraper(TWITTER_HANDLE)
tweets = profile_scraper.get_items()

# Collect the tweets in memory first and serialise them in a single json.dump
# call afterwards. This avoids hand-assembling the JSON text (manual commas,
# seek/tell arithmetic to drop the trailing comma), which produced an invalid
# file when no tweet was scraped.
saved_tweets = []
for tweet in tqdm(tweets, total=NUMBER_OF_TWEETS, desc='Scraping tweets', unit='tweets'):
    # Stop at the first tweet older than BEGIN_DATE, *before* storing it, so
    # the out-of-range tweet is not saved. Both sides are 'YYYY-MM-DD' strings,
    # so a plain string comparison orders them chronologically.
    # NOTE(review): this assumes the scraper yields tweets newest-first; a
    # pinned old tweet at the top of the profile would stop the loop early - confirm.
    if tweet.date.strftime('%Y-%m-%d') < BEGIN_DATE:
        break

    retweeted = tweet.retweetedTweet
    mentioned = tweet.mentionedUsers
    saved_tweets.append({
        'id': tweet.id,
        'date': tweet.date.strftime('%Y-%m-%d %H:%M:%S'),
        # For retweets, keep the text of the original tweet.
        'content': retweeted.rawContent if retweeted else tweet.rawContent,
        'url': tweet.url,
        # Missing counters are stored as 0.
        'replyCount': tweet.replyCount or 0,
        'retweetCount': tweet.retweetCount or 0,
        'likeCount': tweet.likeCount or 0,
        'quoteCount': tweet.quoteCount or 0,
        'isRetweet': bool(retweeted),
        'isReply': bool(tweet.inReplyToTweetId),
        'isQuote': bool(tweet.quotedTweet),
        'retweetedFromUser': retweeted.user.username if retweeted else None,
        'mentionedUsers': [user.username for user in mentioned] if mentioned else None,
        'hasMedia': bool(tweet.media),
    })

# Write the whole payload at once as {"tweet": [...]}; ensure_ascii=False keeps
# accented characters readable in the UTF-8 file.
with open('data/tweets.json', 'w', encoding='utf-8') as f:
    json.dump({'tweet': saved_tweets}, f, ensure_ascii=False)

print('Number of tweets saved: ', len(saved_tweets))



Scraping tweets:   3%|▎         | 343/10000 [00:16<07:11, 22.38tweets/s]Unavailable user in card on tweet 1666268373640966145
User 897095448242130944 not found in user refs in card on tweet 1666268373640966145
Unavailable user in card on tweet 1666272920006451204
User 897095448242130944 not found in user refs in card on tweet 1666272920006451204
Scraping tweets:   8%|▊         | 758/10000 [00:34<06:36, 23.32tweets/s]Unavailable user in card on tweet 1658563722133098513
User 897095448242130944 not found in user refs in card on tweet 1658563722133098513
Unavailable user in card on tweet 1658585152514301953
User 897095448242130944 not found in user refs in card on tweet 1658585152514301953
Scraping tweets:  14%|█▍        | 1425/10000 [01:03<05:53, 24.24tweets/s]Unavailable user in card on tweet 1646098122676871168
User 897095448242130944 not found in user refs in card on tweet 1646098122676871168
Unavailable user in card on tweet 1646121710456918016
User 897095448242130944 not found in us

Number of tweets saved:  3243





# Section 2. Scraping USD/COP exchange rate historical data

First we need to read the data from the API and save it to a file, so that it can be loaded into the database later.

In [21]:
# Download USD/COP quotes for [BEGIN_DATE, END_DATE] and cache them on disk
import requests
import json

# Load the API key from file. Strip surrounding whitespace: text files usually
# end with a newline, which would otherwise be sent as part of the token.
with open('creds/forex_api_token.txt', 'r', encoding='utf-8') as f:
    api_token = f.read().strip()

# Set up the API request
base_url = "https://api.markets.sh/api/v1/symbols/{symbols}/{method}?".format(symbols='USDCOP', method='quotes')
params = {'api_token': api_token, 'from': BEGIN_DATE, 'to': END_DATE}

# Make the request. Without a timeout, requests waits indefinitely on an
# unresponsive server, which would hang the notebook.
response = requests.get(base_url, params=params, timeout=30)

# Save the response to file only if the request succeeded
if response.status_code == 200:
    print('Success!')
    # 'w' creates the file if it doesn't exist and overwrites it otherwise
    with open('data/forex_data.json', 'w', encoding='utf-8') as f:
        json.dump(response.json(), f)
else:
    print('Error! Status code: {}'.format(response.status_code))


Success!


# Section 3. Database configuration

Once we've pulled all the tweets we need, we'll store them in a database. We will use MongoDB, a NoSQL database that stores data in JSON-like documents, taking advantage of the fact that the tweets are already in JSON format. In addition, NoSQL databases are more flexible than SQL databases: they allow us to store data without a predefined schema, which is useful since many tweets don't have the same fields (for example, retweets or replies don't have the same fields as regular tweets).

Let's begin with the database configuration. If you require the credentials to access the database, please contact me at camilotorresmestra@gmail.com.

In [8]:
# Connect to MongoDB
import json
from urllib.parse import quote_plus

# Credentials live outside the notebook, in creds/mongo.json, with the keys
# 'db', 'user' and 'password'.
with open('creds/mongo.json', encoding='utf-8') as f:
    data = json.load(f)
__db__ = data['db']
__usr__ = data['user']
__pass__ = data['password']

# The user name and password must be percent-escaped before being embedded in
# a MongoDB URI; otherwise characters such as '@', ':' or '/' break URI parsing.
# NOTE(review): this assumes creds/mongo.json stores the raw (un-escaped)
# values - confirm, since already-escaped values would be escaped twice.
# (An alternate cluster host, durge5s.mongodb.net, was previously used here.)
connection_str = "mongodb+srv://{usr}:{passwrd}@{db}.xwk6g0f.mongodb.net/?retryWrites=true&w=majority".format(
    usr=quote_plus(__usr__), passwrd=quote_plus(__pass__), db=__db__
)
client = MongoClient(connection_str)

1. Create a collection called 'tweets' in the database 'mlds3' in MongoDB

In [9]:
# Select the 'mlds3' database and point `collection` at its 'tweets' collection.
# Any existing 'tweets' collection is dropped first, so documents from earlier
# runs are discarded.
db = client['mlds3']
existing_collections = db.list_collection_names()
if "tweets" in existing_collections:
    db.drop_collection("tweets")
collection = db['tweets']

Now, insert the scraped tweets into the database using the following command:

In [13]:
# Load the scraped tweets (stored under the 'tweet' key of data/tweets.json)
# and insert them into `collection`.
# NOTE(review): the original note said to run this cell only "when the tweets
# collection is full" - presumably once data/tweets.json has been completely
# written by the scraping cell; confirm.
with open('data/tweets.json', 'r', encoding='utf-8') as f:
    tweets_payload = json.load(f)
data = tweets_payload['tweet']
collection.insert_many(data)


<pymongo.results.InsertManyResult at 0x1a8450adff0>

Pull a tweet from the collection to verify the structure of the data:

In [14]:
# Fetch a single document from `collection`; as the last expression of the
# cell it is displayed as the cell output.
collection.find_one()

{'_id': ObjectId('64977a0e21cc58945cb8f73d'),
 'id': 1671252387518988288,
 'date': '2023-06-20 20:23:14',
 'content': 'El hundimienho de la reforma laboral es muy grave. Demuestra que la voluntad de paz y de pacto social no existe en el poder económico. Dueños del capital y de los medios lograron cooptar el Congreso en contra de la dignidad del pueblo trabajador.\n\nCreen que las ganancias salen de la esclavitud, las largas jornadas y la completa inestabilidad laboral.\n\nEl gobierno del cambio no abandonará los intereses de la trabajadora y el trabajador.',
 'url': 'https://twitter.com/petrogustavo/status/1671252387518988288',
 'replyCount': 12195,
 'retweetCount': 11802,
 'likeCount': 34732,
 'quoteCount': 1660,
 'isRetweet': False,
 'isReply': False,
 'isQuote': False,
 'retweetedFromUser': None,
 'mentionedUsers': None,
 'hasMedia': False}

Now, let's create a collection called 'forex' in the same database in MongoDB to store the USD/COP exchange rate data.

In [16]:
# Read the exchange-rate records stored under the 'bars' key of the saved
# API response.
with open('data/forex_data.json', 'r', encoding='utf-8') as f:
    forex_payload = json.load(f)
data = forex_payload['bars']

# Point `collection` at the 'forex' collection of the 'mlds3' database,
# dropping any existing 'forex' collection first so earlier documents are discarded.
db = client['mlds3']
existing_collections = db.list_collection_names()
if "forex" in existing_collections:
    db.drop_collection("forex")
collection = db['forex']

# Insert the records into the collection
collection.insert_many(data)


<pymongo.results.InsertManyResult at 0x1a842e33a90>

Pull a document from the collection to verify the structure of the data:

In [17]:
# Fetch a single document from `collection`; as the last expression of the
# cell it is displayed as the cell output.
collection.find_one()

{'_id': ObjectId('64977aa121cc58945cb903e9'),
 'close': 4343.35,
 'date': '2022-08-07',
 'volume': 2}

# Section 4. Data cleaning and preprocessing
