In [87]:
import time
import pandas as pd

from pytrends.request import TrendReq
from tqdm import tqdm

# Local imports
from utils import get_sampled_countries, is_leap_year
from testchecks import is_valid_country_query
from typerhints import CountryList, Countries, SearchTerm, HistoryFrame, HistoryFrameBlocks

from pytrends.request import TrendReq
from matplotlib import pyplot as plt

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Vizu settings
%matplotlib inline
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = (10, 6)

# Local imports
from utils import load_countries, plot_history, histories_to_pandas
from search_engine import SearchEngine

# reproducibility
import random
random.seed(1337)

In [88]:
# Notebook-wide configuration.
# Despite the `_DIR` suffix, both values are paths to text files; they are
# handed to `load_countries` as `filename` and `ignore` below.
COUNTRY_DIR = "docs/countries.txt"
COUNTRY_IGNORE_DIR = "docs/ignore.txt"
# Passed to TrendReq as `hl` and `tz` respectively.
# NOTE(review): pytrends documents `tz` as an offset in minutes (360 = US CST);
# confirm this is the zone the analysis should use.
LANGUAGE = 'en-US'
TIME_ZONE = 360

# Google Trends client used by the request cells further down.
pytrends = TrendReq(hl=LANGUAGE, tz=TIME_ZONE)
supported_countries = load_countries(filename=COUNTRY_DIR, ignore=COUNTRY_IGNORE_DIR)
# Seconds passed to `time.sleep` before each request in the fetch loop.
fetch_interval = 1

In [89]:
def get_head_and_tail_values(hist):
    """Collect the first-row and last-row value of every column in ``hist``.

    Parameters
    ----------
    hist : pandas.DataFrame
        Frame whose columns are read one by one. It needs at least one row
        whenever it has at least one column, otherwise indexing the first
        value raises ``IndexError``.

    Returns
    -------
    tuple of (list, list)
        ``(heads, tails)`` where ``heads[i]`` and ``tails[i]`` are the first
        and last value of ``hist.columns[i]``.
    """
    column_names = hist.columns
    first_row = hist.head(1)
    last_row = hist.tail(1)

    # One entry per column, in column order.
    heads = [first_row[column].values[0] for column in column_names]
    tails = [last_row[column].values[0] for column in column_names]

    assert len(column_names) == len(heads), "length of heads doesnt match number of column names"
    assert len(column_names) == len(tails), "length of tails doesnt match number of column names"

    return heads, tails

In [90]:
# Request parameters for this run
YEAR = 2020
SEARCH_TERM = ["Obama"]
SEARCH_COUNTRIES = ["Germany"]
GEOCODE = "DE"

# Split the year into two half-year windows, each written as "<start> <end>"
first_six_months = f"{YEAR}-1-1 {YEAR}-6-30"
last_six_months = f"{YEAR}-7-1 {YEAR}-12-31"
time_periods = [first_six_months, last_six_months]

# Row count the combined year is later asserted against
expected_days = 366 if is_leap_year(YEAR) else 365

# One frame per window, appended in the order of `time_periods`
tmp_history = []
for time_period in time_periods:
    # pause before every request so we don't fetch too often
    time.sleep(fetch_interval)
    # register the search term, window and geocode, then pull interest over time
    pytrends.build_payload(SEARCH_TERM, timeframe=time_period, geo=GEOCODE)
    payload = pytrends.interest_over_time()
    if payload.empty:
        raise ValueError(f"Dropping request: {SEARCH_TERM} for geocode ({GEOCODE})")
    # drop the 'isPartial' flag column before storing the frame
    history_frame = payload.drop(columns=['isPartial'])
    tmp_history.append(history_frame)

# merge
# Bridge request: a short window running from the last day of the first
# period to the first day of the second period. Its rows are later used to
# relate the two half-year frames to each other.
first_last_day = first_six_months.split(" ")[1]
last_first_day = last_six_months.split(" ")[0]

merge_period = f"{first_last_day} {last_first_day}"

# Throttle this request the same way the per-period loop does; it would
# otherwise fire immediately after the loop's final request.
time.sleep(fetch_interval)
# register the search term, bridge window and geocode
pytrends.build_payload(SEARCH_TERM, timeframe=merge_period, geo=GEOCODE)
# pull interest over time for the bridge window
payload = pytrends.interest_over_time()
if payload.empty:
    raise ValueError(f"Dropping request: {SEARCH_TERM} for geocode ({GEOCODE})")

# drop the 'isPartial' flag column; what remains is one column per search term
hframe = payload.drop(columns=['isPartial'])

# Implementation below accounts for multiple search terms,
# we're only using one (!!!)
#
# Google Trends scores every request relative to that request's own peak, so
# the two half-year frames are not directly comparable. The bridge frame
# (`hframe`) holds two rows -- last day of period 1 (BP0) and first day of
# period 2 (BP1) -- scored within a single request. The factor that moves
# period 2 onto period 1's scale is therefore
#
#     s = (BP1 * last(period 1)) / (BP0 * first(period 2))
merge_lists = []
scale_factors = []
for col_name in hframe.columns:
    tmp_list = hframe[col_name].values.tolist()
    merge_lists.append(tmp_list)

# Junction values, one per search term: the last day of period 1 and the first
# day of period 2. Assumes all three frames list the search terms in the same
# column order (they are all requested with the same SEARCH_TERM list).
p1_last_values = get_head_and_tail_values(tmp_history[0])[1]
p2_first_values = get_head_and_tail_values(tmp_history[1])[0]

for mlist, p1_last, p2_first in zip(merge_lists, p1_last_values, p2_first_values):
    BP0, BP1 = mlist
    if BP0 == 0 or p2_first == 0:
        # A zero at the junction makes the ratio undefined; fail loudly rather
        # than propagate inf/nan into the scaled series.
        raise ValueError(
            f"Cannot compute scale factor for {SEARCH_TERM}: zero interest at the period junction"
        )
    # Algo
    s = (BP1 * p1_last) / (BP0 * p2_first)
    scale_factors.append(s)

# Get scaled value
# Only the first search term's factor is used further down.
scaler = scale_factors[0]

# First and second half-year frames, as fetched.
p1 = tmp_history[0]
p2 = tmp_history[1]

# Rescale the second period so it is comparable with the first one.
# `mul` returns a new frame, so the entry stored in `tmp_history` is untouched.
p2 = p2.mul(scaler, axis=0)

# Full year: first period as fetched, followed by the rescaled second period.
fullblock = pd.concat([p1, p2], axis=0)


            Obama
date             
2020-01-01     18
2020-01-02     20
2020-01-03     30
2020-01-04     31
2020-01-05     33
            Obama
date             
2020-12-27     12
2020-12-28      7
2020-12-29      8
2020-12-30      9
2020-12-31      9
            Obama
date             
2020-01-01   18.0
2020-01-02   20.0
2020-01-03   30.0
2020-01-04   31.0
2020-01-05   33.0
               Obama
date                
2020-12-27  5.680000
2020-12-28  3.313333
2020-12-29  3.786667
2020-12-30  4.260000
2020-12-31  4.260000


In [None]:
# Display the list of scale factors built in the scaling cell.
scale_factors

Unnamed: 0_level_0,Obama
date,Unnamed: 1_level_1
2020-06-30,100
2020-07-01,71


In [65]:
# Short aliases for the first and second fetched period frames.
h1, h2 = tmp_history[0], tmp_history[1]

In [59]:
# Preview the first rows of both period frames (shown as a tuple).
h1.head(), h2.head()

(            Donald Trump  Obama
 date                           
 2020-01-01            23     11
 2020-01-02            15     12
 2020-01-03            48     18
 2020-01-04            57     19
 2020-01-05            75     20,
             Donald Trump  Obama
 date                           
 2020-07-01             4      2
 2020-07-02             5      2
 2020-07-03             5      3
 2020-07-04            11      2
 2020-07-05             7      3)

In [None]:
# Named handles on the first and second fetched period frames.
hist1, hist2 = tmp_history[0], tmp_history[1]

In [None]:
# Show the (heads, tails) pair returned for the first period frame.
get_head_and_tail_values(hist1)

In [None]:
# Preview the bridge frame (`hframe`) produced by the merge-period request.
hframe.head()

In [None]:
# Collect the bridge-frame values of every search term: one list per column,
# in column order. Index by the loop variable so this works for whatever
# terms were requested instead of one hard-coded column name.
merge_values = []
for col_name in hframe.columns:
    merge_vals = hframe[col_name].values.tolist()
    merge_values.append(merge_vals)

In [None]:
# Display the per-column value lists collected from the bridge frame.
merge_values

In [None]:
# First and second row of the bridge frame, selected by position.
# (Selecting a column named "" raised KeyError; row 0 mirrors how m2 is taken.)
m1 = hframe.iloc[0]
m2 = hframe.iloc[1]

print("M1: ", m1)
print("M2: ", m2)

print(hframe.head())
# ULRIKs merge

# Stack the fetched period frames, in order, into one frame for the whole year
combined_history = pd.concat(tmp_history)
# Row count must equal the number of days expected for the year
assert len(combined_history) == expected_days, f"Expected {expected_days} days in a year!"

final_block = combined_history