In [1]:
from datetime import datetime, timedelta

# Resolving paths in a platform agnostic way.
from os.path import dirname, join, realpath

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Manipulating the raw data to save it in ``.csv`` files.
from pandas import DataFrame, DatetimeIndex
from pandas import concat as concat_df
from pandas import date_range

In [2]:
def is_interactive():
    """Report whether the code is running in an interactive session.

    The check is whether the ``__main__`` module has a ``__file__``
    attribute; when it does not, the session is treated as interactive.

    Returns:
        bool: ``True`` if ``__main__`` has no ``__file__`` attribute,
        ``False`` otherwise.
    """
    import __main__ as entry_module

    launched_from_file = hasattr(entry_module, "__file__")
    return not launched_from_file


# In an interactive session there is no ``__file__`` global, so the literal
# string "__file__" is passed to ``realpath`` instead; being a relative path it
# is resolved against the current working directory, and ``dirname`` then
# yields that directory. When run as a script the real ``__file__`` is used.
SCRIPT_DIR = dirname(realpath("__file__" if is_interactive() else __file__))

# The "data" directory next to SCRIPT_DIR's parent, i.e. "../data".
DATA_DIR = join(dirname(SCRIPT_DIR), "data")

In [3]:
# Ticker symbols whose raw price files (named ``<ticker>_..._minute.csv``)
# are loaded below.
CRYPTOCURRENCIES = [
    "BTC",
    "ETH",
    "DOGE",
    "SOL",
    "AVAX",
]

# Collect one frame per ticker and concatenate once after the loop. Growing a
# DataFrame with ``pd.concat`` on every iteration re-copies all previously
# loaded rows each time.
price_frames = []

for cryptocurrency in CRYPTOCURRENCIES:
    temp_dataframe = pd.read_csv(
        join(
            DATA_DIR,
            "raw",
            "crypto",
            f"{cryptocurrency.lower()}_2022_03_05-2022_03_11_minute.csv",
        )
    )

    # NOTE(review): ``datetime.fromtimestamp`` without a ``tz`` argument
    # converts to the local timezone of the machine running the notebook and
    # returns naive datetimes, so the resulting "time" values depend on where
    # this is executed -- confirm that this is intended.
    temp_dataframe["time"] = temp_dataframe["time"].transform(
        datetime.fromtimestamp
    )

    # Tag every row with its ticker so rows stay attributable once stacked.
    temp_dataframe["cryptocurrency"] = cryptocurrency
    price_frames.append(temp_dataframe)

# Each file keeps the row index it was read with, so index values repeat
# across tickers in the combined frame.
prices_dataframe = pd.concat(price_frames)

prices_dataframe.head()

Unnamed: 0.1,Unnamed: 0,time,high,low,open,volumefrom,volumeto,close,conversionType,conversionSymbol,cryptocurrency
0,0,2022-03-04 12:50:00,40805.94,40782.69,40787.05,20.21,825274.57,40783.59,direct,,BTC
1,1,2022-03-04 12:51:00,40793.84,40782.67,40783.59,0.7493,30560.29,40793.84,direct,,BTC
2,2,2022-03-04 12:52:00,40798.21,40791.96,40793.84,2.376,96918.76,40798.03,direct,,BTC
3,3,2022-03-04 12:53:00,40816.49,40797.21,40798.03,4.488,183210.08,40816.49,direct,,BTC
4,4,2022-03-04 12:54:00,40853.58,40816.49,40816.49,26.03,1063336.27,40838.92,direct,,BTC


In [4]:
NUM_DAYS = 7
# Decided based on limitations of API at the time of data collection.
LAST_DAY = datetime(2022, 3, 11)
# NUM_DAYS consecutive dates ending on LAST_DAY (``date_range`` defaults to a
# daily frequency when none is given).
DATE_RANGE = date_range(end=LAST_DAY, periods=NUM_DAYS)

# Collect one frame per day and concatenate once after the loop. Growing a
# DataFrame with ``pd.concat`` on every iteration re-copies all previously
# loaded rows each time.
tweet_frames = []

for date in DATE_RANGE:
    # One raw file per day, named ``tweets-YYYY-MM-DD.csv``.
    temp_dataframe = pd.read_csv(
        join(
            DATA_DIR,
            "raw",
            "twitter",
            f"tweets-{date.strftime('%Y-%m-%d')}.csv",
        )
    )

    # Parse the "created_at" column into pandas datetimes.
    temp_dataframe["created_at"] = pd.to_datetime(temp_dataframe["created_at"])

    tweet_frames.append(temp_dataframe)

# Each file keeps the row index it was read with, so index values repeat
# across days in the combined frame.
tweets_dataframe = pd.concat(tweet_frames)

tweets_dataframe.head()

Unnamed: 0.1,Unnamed: 0,text,retweet_count,favorite_count,followers_count,verified,listed_count,created_at,hashtags,name
0,0,"""It is an open ledger, trying to sneak lots of...",151,520,2437101,True,10788,2022-03-05 09:33:07+00:00,"[{'text': 'crypto', 'indices': [61, 68]}]",Bitcoin News
1,1,“The #crypto market today has a market capital...,87,245,2437101,True,10788,2022-03-05 12:03:14+00:00,"[{'text': 'crypto', 'indices': [5, 12]}]",Bitcoin News
2,2,G7 countries and the EU are looking at ways to...,95,245,2437101,True,10788,2022-03-05 13:33:29+00:00,"[{'text': 'cryptocurrency', 'indices': [88, 10...",Bitcoin News
3,3,JUST IN: 🇸🇬 Singapore has introduced sanctions...,149,1043,708057,True,1449,2022-03-05 11:01:13+00:00,"[{'text': 'cryptocurrency', 'indices': [77, 92]}]",Watcher.Guru
4,4,Make sure you check in on your bros in the str...,45,149,7822,True,80,2022-03-05 14:45:25+00:00,[],Ian Heinisch


In [8]:
# Name the combined tweets file after the first and last date in DATE_RANGE,
# each formatted as YYYY_MM_DD: "tweets_<first>-<last>.csv".
start_label = DATE_RANGE[0].strftime("%Y_%m_%d")
end_label = DATE_RANGE[-1].strftime("%Y_%m_%d")

processed_tweets_path = join(
    DATA_DIR,
    "processed",
    "twitter",
    f"tweets_{start_label}-{end_label}.csv",
)

# Write the combined tweets frame (row index included) to the processed folder.
tweets_dataframe.to_csv(processed_tweets_path)