# Scraping Google Trends data using Institute of Business Administration (IBA), Pakistan keywords

 

In [1]:
# import libraries/modules
import pandas as pd
# Opt in to the pandas "future.no_silent_downcasting" behaviour for the whole notebook.
pd.set_option('future.no_silent_downcasting', True)
from pytrends.request import TrendReq
# Module-level Google Trends client.
# NOTE(review): fetch_trends_in_batches builds its own TrendReq, so this instance
# is not referenced by any later cell shown here.
# NOTE(review): tz is presumably the timezone offset in minutes and timeout a
# (connect, read) pair in seconds — confirm against the pytrends documentation.
pytrends = TrendReq(hl='en-US', tz=360, timeout=(40,25))
# NOTE(review): json is not used by any cell shown here.
import json

## Load the keyword dictionary 

In [2]:
# Read the IBA keyword categories from the Excel workbook into a DataFrame.
df_cat_IBA = pd.read_excel(io='data/catagory_name_IBA.xlsx')
# Leave the frame as the cell's last expression so the notebook renders it.
df_cat_IBA

Unnamed: 0,category_name
0,Economic crisis
1,Crisis
2,Recession
3,Financial crisis
4,Inflation
5,Unemployment
6,BISP
7,ehsaas program
8,USAID
9,Credit


## Get all the keywords and create a list 

In [3]:
# Pull the "category_name" column out of the keyword table as a plain Python list.
category_column = df_cat_IBA["category_name"]
iba_category_name = category_column.to_numpy().tolist()
iba_category_name

['Economic crisis',
 'Crisis',
 'Recession',
 'Financial crisis',
 'Inflation',
 'Unemployment',
 'BISP',
 'ehsaas program',
 'USAID',
 'Credit',
 'Loan',
 'Interest',
 'House Loan',
 'Car Loan',
 'Food',
 'Cinema',
 'Cars',
 'Birthday',
 'Travel',
 'Weddings',
 'Fitness',
 'Cigarette',
 'Tourism',
 'Hotels',
 'Fast Food',
 'House for sale',
 'Construction',
 'Investment',
 'Jobs',
 'Agriculture',
 'FMCG',
 'Aviation',
 'Manufacturing',
 'Textile',
 'Economy News',
 'Business News',
 'World News',
 'Politics',
 'Newspapers',
 'mehngai',
 'Real estate',
 'deficit',
 'elections',
 'parliament',
 'taxes',
 'government',
 'budget',
 'economic growth',
 'subsidy',
 'current account',
 'trade',
 'protest',
 'stock market',
 'revenue',
 'LSM',
 'M0',
 'PSB',
 'CPI']

## Remove duplicates from the keyword list

In [4]:
# Deduplicate while keeping the first-seen order of the keywords.
# list(set(...)) yields an arbitrary order that can change between kernel
# sessions (string hashing is randomised), which would change which keywords
# land in which chunk and the column order of the final table from run to run.
keywords = list(dict.fromkeys(iba_category_name))
len(keywords)

58

## Define the chunk size and split the list into chunks

In [5]:
# Maximum number of keywords per chunk
chunk_size = 15

# Start offset of every chunk: 0, chunk_size, 2 * chunk_size, ...
chunk_starts = range(0, len(keywords), chunk_size)

# Slice the keyword list at each offset; the last chunk may be shorter
chunks = [keywords[start:start + chunk_size] for start in chunk_starts]

len(chunks)

4

## Name each chunk


In [6]:
# Expose each chunk as a module-level variable named chunks1, chunks2, ...
# (numbering starts at 1), so later cells can look them up by name.
for i, chunk in enumerate(chunks, start=1):
    globals().update({f'chunks{i}': chunk})

## Define a function that fetches the Google Trends data

In [9]:
def fetch_trends_in_batches(chunks,
                            batch_size=1,
                            cat=0, 
                            timeframe='2004-01-01 2024-03-31', 
                            geo='ZA', 
                            gprop='',
                            max_retries=3,
                            retry_wait=60):
    """Fetch Google Trends interest-over-time data for a list of keywords.

    The keywords are requested ``batch_size`` at a time and the per-request
    frames are joined column-wise on their shared index.

    Parameters
    ----------
    chunks : list of str
        Keywords to query (despite the name, a flat list of keywords).
    batch_size : int, default 1
        Number of keywords sent per request. NOTE(review): the pytrends
        documentation limits a payload to five terms — confirm before raising it.
    cat, timeframe, geo, gprop
        Passed through unchanged to ``TrendReq.build_payload``.
    max_retries : int, default 3
        How many times a request is retried after Google answers with
        HTTP 429 (too many requests) before the error is re-raised.
    retry_wait : float, default 60
        Base wait in seconds between retries; the wait grows linearly with
        the attempt number (``retry_wait``, ``2 * retry_wait``, ...).

    Returns
    -------
    pandas.DataFrame
        One column per keyword that returned data (plus the ``isPartial``
        columns pytrends adds). An empty DataFrame is returned when
        ``chunks`` is empty or no keyword returned any data.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    pytrends.exceptions.TooManyRequestsError
        If a request is still rate-limited after ``max_retries`` retries.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Nothing to request: pd.concat would raise on an empty list of frames.
    if len(chunks) == 0:
        return pd.DataFrame()

    # Imported here so the retry logic does not depend on the notebook's
    # import cell having been changed.
    import time
    from pytrends.exceptions import TooManyRequestsError

    pytrends = TrendReq(hl='en-US', tz=360, timeout=(30, 25))

    attempts = max(max_retries, 0) + 1
    all_data = []

    for start_idx in range(0, len(chunks), batch_size):
        batch_keywords = chunks[start_idx:start_idx + batch_size]

        for attempt in range(attempts):
            try:
                pytrends.build_payload(batch_keywords, cat=cat,
                                       timeframe=timeframe,
                                       geo=geo,
                                       gprop=gprop)
                data = pytrends.interest_over_time()
                break
            except TooManyRequestsError:
                # Out of retries: surface the original rate-limit error.
                if attempt == attempts - 1:
                    raise
                # Back off a little longer after every failed attempt.
                time.sleep(retry_wait * (attempt + 1))

        # A keyword without data yields an empty frame that contributes no
        # columns; skipping it keeps the 'date' index of the other frames
        # intact when they are concatenated.
        if not data.empty:
            all_data.append(data)

    if not all_data:
        return pd.DataFrame()

    combined_data = pd.concat(all_data, axis=1)
    return combined_data

## Store combined dataframes in a dictionary

In [10]:
# Maps 'combined_data1', 'combined_data2', ... to the frame fetched for each chunk.
combined_data_dict = {}

batch_size = 1
# Iterate over the chunk lists directly. Rebinding the global `chunks` to a
# single chunk inside the loop (as an earlier version did) leaves `chunks`
# holding one flat keyword list, so re-running this cell, or any cell that
# expects the list of chunks, fails.
for i, chunk_keywords in enumerate(chunks, start=1):
    combined_data_dict[f'combined_data{i}'] = fetch_trends_in_batches(chunk_keywords, 
                                                                      batch_size=batch_size, 
                                                                      cat=0, 
                                                                      timeframe='2004-01-01 2024-03-31', 
                                                                      geo='ZA', 
                                                                      gprop='')
len(combined_data_dict)

TooManyRequestsError: The request failed: Google returned a response with code 429

## Combine and store the dataframes

In [None]:
# Number of per-chunk frames collected so far
num_dataframes = len(combined_data_dict)

# Gather the frames in key order: combined_data1, combined_data2, ...
expected_keys = [f'combined_data{n}' for n in range(1, num_dataframes + 1)]
dataframes = [combined_data_dict[name] for name in expected_keys
              if name in combined_data_dict]

# Join all frames side by side on their shared date index
merged = pd.concat(dataframes, axis=1)

# Drop the 'isPartial' flag columns, then turn the index into a regular column
not_partial = ~merged.columns.str.endswith('isPartial')
merged = merged.loc[:, not_partial].reset_index()

# Keep the first occurrence of every column name and restore 'date' as the index
combined_df = merged.loc[:, ~merged.columns.duplicated()].set_index('date')

combined_df

## Export the dataframe


In [None]:
# Write the combined trends table (date index included) to a CSV file
combined_df.to_csv(path_or_buf='data/gt_ZA_IBA.csv')