# Scraping Google Trends data using capstone keywords

In [1]:
# import libraries/modules
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360, timeout=(40,25))
import json

## Load the keyword dictionary 

In [3]:
file1 = open('data/gdp_cat_queries_dict.json')
cat_topics_dict = json.load(file1)
cat_topics_dict.values()

In [4]:
cat_topics_dict

{'569': ['cineplex', 'cinema'],
 '23': ['dance', 'theatre'],
 '47': ['honda', 'ford'],
 '815': ['ford', 'honda'],
 '170': ['license', 'driving'],
 '249': ['blue cross', 'desjardins'],
 '71': ['pizza pizza', 'pizza'],
 '276': ['restaurant', 'restaurants'],
 '634': ['clinic', 'dr'],
 '250': ['hospital', 'hopital'],
 '168': ['fire', 'emergency'],
 '437': ['depression', 'anxiety'],
 '11': ['home depot', 'ikea'],
 '29': ['mls', 'real estate'],
 '96': ['remax', 'real real'],
 '18': ['kijiji', 'walmart'],
 '67': ['hotel', 'air canada'],
 '179': ['hotel', 'hotels'],
 '5': ['google', 'apple'],
 '68': ['shoes', 'boots'],
 '78': ['xbox', 'videotron'],
 '696': ['or', 'bijoux'],
 '329': ['acces d', 'cra'],
 '95': ['staples', 'desk'],
 '423': ['bankruptcy', 'hollywoodpq'],
 '279': ['credit', 'mortgage'],
 '1160': ['business', 'pret'],
 '813': ['student', 'tuition'],
 '466': ['mortgage', 'calculator'],
 '468': ['lease', 'calculator'],
 '46': ['farm', 'plant'],
 '750': ['forest', 'forestry'],
 '747': 

## Get all the keywords and create a list 

In [6]:
# 
keywords = [keyword for sublist in cat_topics_dict.values() for keyword in sublist]
len(keywords)

282

## Remove duplicates from a list

In [37]:
keywords = list(set(keywords))
len(keywords)

243

## Define the chunk size and split the list into chunks

In [42]:
# Define the chunk size
chunk_size = 15

# Split the list into chunks
chunks = [keywords[i:i + chunk_size] for i in range(0, len(keywords), chunk_size)]

len(chunks)

17

## Name each chunk


In [49]:
# Name each chunk as chunk_i starting from 1
for i, chunk in enumerate(chunks, 1):
    globals()[f'chunks{i}'] = chunk

## Define a function fetches the Google Trend data

In [57]:
def fetch_trends_in_batches(chunks,
                            batch_size=1,
                            cat=0, 
                            timeframe='2004-01-01 2024-03-31', 
                            geo='MU', 
                            gprop=''):
    
    pytrends = TrendReq(hl='en-US', tz=360, timeout=(40, 25))
    
    all_data = []
    num_batches = -(-len(chunks) // batch_size)  # Calculate number of batches
    
    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        
        batch_keywords = chunks[start_idx:end_idx]
        
        pytrends.build_payload(batch_keywords, cat=cat, 
                               timeframe=timeframe,
                               geo=geo, 
                               gprop=gprop)
        data = pytrends.interest_over_time()
        
        all_data.append(data)

    combined_data = pd.concat(all_data, axis=1)
    return combined_data

## Store combined dataframes in a dictionary

In [63]:
combined_data_dict = {}

batch_size = 1
for i in range(1, len(chunks) + 1):
    chunks = globals()[f'chunks{i}']
    combined_data_dict[f'combined_data{i}'] = fetch_trends_in_batches(chunks, 
                                                                      batch_size=batch_size, 
                                                                      cat=0, 
                                                                      timeframe='2004-01-01 2024-03-31', 
                                                                      geo='MU', 
                                                                      gprop='')
    len(combined_data_dict)

## Combine and stored the dataframes 

In [72]:
# Determine the number of dataframes stored in combined_data_dict
num_dataframes = len(combined_data_dict)

# Initialize an empty list to store dataframes
dataframes = []

# Loop through the keys to access each dataframe
for i in range(1, num_dataframes + 1):
    key = f'combined_data{i}'
    if key in combined_data_dict:
        dataframes.append(combined_data_dict[key])

# Concatenate the dataframes along the date index
combined_df = pd.concat(dataframes, axis=1)

combined_df = combined_df.loc[:, ~combined_df.columns.str.endswith('isPartial')]

# Reset index to make date index
combined_df.reset_index(inplace=True)

# Drop duplicate date columns
combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

# Set date column as index
combined_df.set_index('date', inplace=True)

combined_df

Unnamed: 0_level_0,depression,construction,fashion,jobs,bbc,domain,avg,forest,pizza pizza,hotels,...,college,covid,developer,distribution,real estate,portail,dashboard,election results,appliances,architecture
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01,0,0,0,100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2004-02-01,0,0,0,0,0,0,0,0,0,0,...,0,70,0,0,0,0,0,0,0,0
2004-03-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2004-04-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2004-05-01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-01,3,16,17,16,48,23,0,44,73,12,...,17,3,8,8,25,6,74,0,45,24
2023-12-01,3,15,18,16,48,18,0,80,87,10,...,27,6,14,6,33,6,75,2,0,15
2024-01-01,4,15,12,19,45,14,0,54,70,9,...,30,3,18,7,33,7,70,0,42,25
2024-02-01,3,20,12,21,36,22,0,76,73,10,...,27,2,19,7,30,6,84,1,0,30


## Export the dataframe


In [73]:
# Export the data to a CSV file
combined_df.to_csv('data/gt_MU_capstone.csv')