# Import Libraries

In [1]:
# plotting
import matplotlib.pyplot as plt

# manipulate data
import pandas as pd

# more plotting
import seaborn as sns

# custom functions
from gtrends.get_daily_data import og_get_daily_trend as get_daily_trend

# pytrends library
from pytrends.request import TrendReq

pytrend = TrendReq()

# make pandas types compatibile with matplotlibs
pd.plotting.register_matplotlib_converters()

sns_dict = {
    "axes.facecolor": "1",
    "grid.color": "0.9",
    "font.family": ["sans-serif"],
    "font.sans-serif": ["Yanone Kaffeesatz", "DejaVu Sans"],
}

sns.set_style(style="whitegrid", rc=sns_dict)
sns.set_palette(palette="deep")
%matplotlib inline

plt.rcParams["figure.figsize"] = [12, 6]
plt.rcParams["figure.dpi"] = 100

# Define Default Values for Data Retrieval

In [2]:
keywords = ["Mario Draghi", "Christine Lagarde", "Euro", "European Central Bank"]

start_date = "2012-01-01"
end_date = "2021-08-31"

# Monthly Data

## Retrieve Monthly Data

In [3]:
monthly_data = pd.DataFrame(index=pd.date_range(start_date, end_date)).asfreq("MS")

for keyword in keywords:

    if keyword == "Mario Draghi":
        timespan = f"{start_date} 2019-10-31"  # end of his mandate
    elif keyword == "Christine Lagarde":
        timespan = f"2019-11-01 {end_date}"
    else:
        timespan = f"{start_date} {end_date}"

    print(f"Retrieving monthly data for {keyword}, timespan = {timespan}")
    pytrend.build_payload([keyword], timeframe=timespan)

    downloaded_data = (
        # retrieve the data
        pytrend.interest_over_time()
        # drop unneded col
        .drop(columns="isPartial")
        # cast as integers
        .astype("int")
        # rename columns in snakecase
        .rename(columns={keyword: keyword.replace(" ", "_").lower()})
    )

    monthly_data = monthly_data.merge(
        downloaded_data, how="left", left_index=True, right_index=True
    )

Retrieving monthly data for Mario Draghi, timespan = 2012-01-01 2019-10-31
Retrieving monthly data for Christine Lagarde, timespan = 2019-11-01 2021-08-31
Retrieving monthly data for Euro, timespan = 2012-01-01 2021-08-31
Retrieving monthly data for European Central Bank, timespan = 2012-01-01 2021-08-31


In [4]:
monthly_data

Unnamed: 0,mario_draghi,christine_lagarde,euro,european_central_bank
2012-01-01,25.0,,11,89
2012-02-01,25.0,,11,99
2012-03-01,15.0,,10,77
2012-04-01,23.0,,11,74
2012-05-01,22.0,,17,86
...,...,...,...,...
2021-04-01,,,15,38
2021-05-01,,,16,46
2021-06-01,,,100,30
2021-07-01,,,49,31


In [5]:
monthly_data.to_csv("../02-data/python/gtrends-monthly-all")

# Daily Data

## With `pytrends`-like method

`pytrends`' method to retrieve daily data is to retrieve daily data in monthly batches and then scale this data using the monthly observations. In principle, this is better to be reimplemented. The critical part about this task is creating a function that given the month number returns the last day. This will be needed to generate monthly time spans to iterate over. The function is taken from [here](https://stackoverflow.com/questions/42950/get-last-day-of-the-month-in-python).

In [60]:
from calendar import monthrange
from datetime import date, datetime
from time import sleep

from tqdm import tqdm


def _make_timespan(year: int, month: int) -> str:
    start_date = date(year, month, 1)
    end_date = date(year, month, monthrange(year, month)[1])

    return start_date.strftime("%Y-%m-%d") + " " + end_date.strftime("%Y-%m-%d")


def get_daily_data(
    keyword: str, start: str, end: str, sleep_for: int = 2, verbose: bool = True
) -> pd.DataFrame:

    # create date objects
    start_date = datetime.strptime(start, "%Y-%m-%d").date()
    end_date = datetime.strptime(end, "%Y-%m-%d").date()

    # define start and end month/year
    start_month = start_date.month
    start_year = start_date.year

    end_month = end_date.month
    end_year = end_date.year

    # create the df beforehand
    keyword_data = pd.DataFrame(index=pd.date_range(start_date, end_date)).asfreq("D")

    # define list of timespans relative to which we retrieve the data
    timespans = [
        _make_timespan(year, month)
        for year in range(start_year, stop_year + 1)
        for month in (
            range(start_month, stop_month + 1)
            if year == stop_year
            else range(1, 12 + 1)
        )
    ]

    pytrend = TrendReq()

    # use tqdm to display a progress bar
    for period in timespans:

        if verbose:
            print(f"Retrieving SVI for {keyword} for period {period}")

        pytrend.build_payload([keyword], timeframe=period)

        period_data = (
            pytrend.interest_over_time()
            .drop(columns="isPartial")
            .astype("int")
            .rename(columns={f"{keyword}": f"{keyword}".lower().replace(" ", "_")})
        )

        keyword_data = keyword_data.append(period_data)

        sleep(sleep_for)  # else google bans you

    return keyword_data

In [61]:
get_daily_data("Euro", "2012-01-01", "2021-08-31")

Retrieving SVI for Euro for period 2012-01-01 2012-01-31
Retrieving SVI for Euro for period 2012-02-01 2012-02-29
Retrieving SVI for Euro for period 2012-03-01 2012-03-31
Retrieving SVI for Euro for period 2012-04-01 2012-04-30
Retrieving SVI for Euro for period 2012-05-01 2012-05-31
Retrieving SVI for Euro for period 2012-06-01 2012-06-30
Retrieving SVI for Euro for period 2012-07-01 2012-07-31
Retrieving SVI for Euro for period 2012-08-01 2012-08-31
Retrieving SVI for Euro for period 2012-09-01 2012-09-30
Retrieving SVI for Euro for period 2012-10-01 2012-10-31
Retrieving SVI for Euro for period 2012-11-01 2012-11-30
Retrieving SVI for Euro for period 2012-12-01 2012-12-31
Retrieving SVI for Euro for period 2013-01-01 2013-01-31
Retrieving SVI for Euro for period 2013-02-01 2013-02-28
Retrieving SVI for Euro for period 2013-03-01 2013-03-31
Retrieving SVI for Euro for period 2013-04-01 2013-04-30
Retrieving SVI for Euro for period 2013-05-01 2013-05-31
Retrieving SVI for Euro for per

Unnamed: 0,euro
2012-01-01,
2012-01-02,
2012-01-03,
2012-01-04,
2012-01-05,
...,...
2014-08-27,96.0
2014-08-28,95.0
2014-08-29,97.0
2014-08-30,87.0


## With `local_maxima`

We set up the data and the dataframe that will store the variables we need.

In [6]:
keywords = ["Mario Draghi", "Christine Lagarde", "Euro", "European Central Bank"]

start_date = "2012-01-01"
end_date = "2021-08-31"

delta = 269
overlap = [30, 100]

cols = [f"{keyword}_{window}".lower().replace(" ", "_") for keyword in keywords for window in overlap]

daily_data_local_maxima = pd.DataFrame(index=pd.date_range(start_date, end_date), columns=cols).asfreq("D")

daily_data_local_maxima.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3531 entries, 2012-01-01 to 2021-08-31
Freq: D
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   mario_draghi_30            0 non-null      object
 1   mario_draghi_100           0 non-null      object
 2   christine_lagarde_30       0 non-null      object
 3   christine_lagarde_100      0 non-null      object
 4   euro_30                    0 non-null      object
 5   euro_100                   0 non-null      object
 6   european_central_bank_30   0 non-null      object
 7   european_central_bank_100  0 non-null      object
dtypes: object(8)
memory usage: 248.3+ KB


In [7]:
for keyword in keywords:

    start_date = "2012-01-01"
    end_date = "2021-08-31"

    if keyword == "Mario Draghi":
        start = start_date
        end = "2019-10-31"
    elif keyword == "Christine Lagarde":
        start = "2019-11-01"
        end = end_date

        for window in overlap:

            print(
                f"Retrieving monthly data for {keyword}, timespan = {start_date} {end_date}, overlap = {window} days"
            )

            daily_data_local_maxima[
                f"{keyword}_{overlap}".replace(" ", "_").lower()
            ] = get_daily_trend(
                trendreq=pytrend,
                keyword=keyword,
                start=start,
                end=end,
                delta=delta,
                overlap=window,
                verbose=True
            ).drop(
                "isPartial", axis=1
            )

Retrieving monthly data for Christine Lagarde, timespan = 2012-01-01 2021-08-31, overlap = 30 days
Fetching 'Christine Lagarde' for period: 2020-12-05 2021-08-31
Fetching 'Christine Lagarde' for period: 2020-04-10 2021-01-04
Normalize by overlapping period:2020-12-05 2021-01-04
Fetching 'Christine Lagarde' for period: 2019-08-15 2020-05-10
Normalize by overlapping period:2020-04-10 2020-05-10


KeyError: "['isPartial'] not found in axis"