# Keyword selection for Google Trends Data

1) University of British Columbia (UBC), Canada: Capstone Project
   - Title: Nowcasting Macroeconomic Indicators using Google Trends, 2022 
   - Keywords:
      - GDP category Keywords (141)
      - GDP keywords data from related queries (243)
      - GDP keywords data from related topics  (213)

2) Organisation for Economic Co-operation and Development (OECD): Working Papers, 2020

   - Title: OECD Economics Department Working Papers

   - Keywords:
     - GDP OECD Keywords (134)
    
3) Institute of Business Administration (IBA), Karachi, Pakistan:  Project Report
   - Title: Now casting GDP Growth and Forecasting Inflation of Pakistan
   - Keywords:
     - GDP IBA Keywords (58)


In [1]:
# import libraries/modules
import pandas as pd
from pytrends.request import TrendReq
import time
import json
startTime = time.time()

## Keywords from University of British Columbia (UBC): Capstone

### GDP category Keywords

In [83]:
capstone_cat = pd.read_csv('data/gdp_category_ts.csv', index_col=0)
capstone_cat.head()

Unnamed: 0_level_0,569,23,47,815,170,249,71,276,634,250,...,53,342,1159,1214,670,12,566,672,673,49
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004-01-01,74,89,90,94,90,87,76,57,60,91,...,100,92,95,77,68,100,87,74,89,88
2004-02-01,70,86,88,93,88,87,71,58,51,98,...,96,99,95,90,78,99,96,78,81,100
2004-03-01,74,83,95,100,87,87,73,61,53,99,...,99,100,99,93,100,95,100,82,100,97
2004-04-01,72,82,92,96,89,82,70,55,53,95,...,90,95,89,98,97,94,87,78,81,89
2004-05-01,74,82,94,96,97,92,66,53,49,94,...,89,90,83,89,87,87,94,76,63,80


### Get column names

In [84]:
capstone_cat_col = capstone_cat.columns
capstone_cat_col

Index(['569', '23', '47', '815', '170', '249', '71', '276', '634', '250',
       ...
       '53', '342', '1159', '1214', '670', '12', '566', '672', '673', '49'],
      dtype='object', length=141)

### Convert to Dataframe 

In [17]:
# convert cat_name to dataframe
df_capstone_cat = pd.DataFrame(capstone_cat_col, columns=['category'])
df_capstone_cat

Unnamed: 0,category
0,569
1,23
2,47
3,815
4,170
...,...
136,12
137,566
138,672
139,673


### Rename the column name

In [20]:
# rename the column
df_capstone_cat = df_capstone_cat.rename(columns={'category':'category_id'})
df_capstone_cat

Unnamed: 0,category_id
0,569
1,23
2,47
3,815
4,170
...,...
136,12
137,566
138,672
139,673


### Load Google Trends Category name from serpapi

- From: https://serpapi.com/google-trends-categories

In [86]:
gt_cat = pd.read_excel('data/GT_cat1.xlsx')
gt_cat.head()
gt_cat.shape

(1133, 2)

### Rename column name 

In [22]:
# rename the column 
gt_cat = gt_cat.rename(columns={'cat':'category_id', 'Category':'category_name'})
gt_cat

Unnamed: 0,category_id,category_name
0,0,All categories
1,3,Arts & Entertainment
2,184,Celebrities & Entertainment News
3,316,Comics & Animation
4,1104,Animated Films
...,...,...
1128,1008,Theme Parks
1129,1010,Travel Agencies & Services
1130,1392,Tourist Boards & Visitor Centers
1131,1019,Vacation Offers


### Get the category name

In [23]:
# Convert 'category_id' to int in both dataframes
df_capstone_cat['category_id'] = df_capstone_cat['category_id'].astype(int)
gt_cat['category_id'] = gt_cat['category_id'].astype(int)

# merge the dataframes
merged_df = df_capstone_cat.merge(gt_cat, on='category_id', how='left')

# Replace the 'category_name' in  dataframe
df_capstone_cat['category_name'] = merged_df['category_name']

df_capstone_cat

Unnamed: 0,category_id,category_name
0,569,Events & Listings
1,23,Performing Arts
2,47,Autos & Vehicles
3,815,Vehicle Brands
4,170,Vehicle Licensing & Registration
...,...,...
136,12,Business & Industrial
137,566,Textiles & Nonwovens
138,672,Coatings & Adhesives
139,673,Dyes & Pigments


### Check the missing values

In [24]:
missing_values_sum = df_capstone_cat.isnull().sum().sum()
print(missing_values_sum)

0


### Export the data to csv

In [15]:
# export the data to csv
df_capstone_cat.to_csv('data/category_name_capstone.csv')

### GDP keywords data from related queries 

In [96]:

file1 = open('data/gdp_cat_queries_dict.json')
cat_queries_dict = json.load(file1)
cat_queries_dict.values()

dict_values([['cineplex', 'cinema'], ['dance', 'theatre'], ['honda', 'ford'], ['ford', 'honda'], ['license', 'driving'], ['blue cross', 'desjardins'], ['pizza pizza', 'pizza'], ['restaurant', 'restaurants'], ['clinic', 'dr'], ['hospital', 'hopital'], ['fire', 'emergency'], ['depression', 'anxiety'], ['home depot', 'ikea'], ['mls', 'real estate'], ['remax', 'real real'], ['kijiji', 'walmart'], ['hotel', 'air canada'], ['hotel', 'hotels'], ['google', 'apple'], ['shoes', 'boots'], ['xbox', 'videotron'], ['or', 'bijoux'], ['acces d', 'cra'], ['staples', 'desk'], ['bankruptcy', 'hollywoodpq'], ['credit', 'mortgage'], ['business', 'pret'], ['student', 'tuition'], ['mortgage', 'calculator'], ['lease', 'calculator'], ['farm', 'plant'], ['forest', 'forestry'], ['salmon', 'fish'], ['circulaire', 'iga'], ['cigarettes', 'smoke'], ['shoes', 'boots'], ['printing', 'vistaprint'], ['gas', 'gas prices'], ['bottle depot', 'plastic'], ['pharmacy', 'side effects'], ['hp', 'kingston'], ['generator', 'valve

In [97]:
cat_queries_dict

{'569': ['cineplex', 'cinema'],
 '23': ['dance', 'theatre'],
 '47': ['honda', 'ford'],
 '815': ['ford', 'honda'],
 '170': ['license', 'driving'],
 '249': ['blue cross', 'desjardins'],
 '71': ['pizza pizza', 'pizza'],
 '276': ['restaurant', 'restaurants'],
 '634': ['clinic', 'dr'],
 '250': ['hospital', 'hopital'],
 '168': ['fire', 'emergency'],
 '437': ['depression', 'anxiety'],
 '11': ['home depot', 'ikea'],
 '29': ['mls', 'real estate'],
 '96': ['remax', 'real real'],
 '18': ['kijiji', 'walmart'],
 '67': ['hotel', 'air canada'],
 '179': ['hotel', 'hotels'],
 '5': ['google', 'apple'],
 '68': ['shoes', 'boots'],
 '78': ['xbox', 'videotron'],
 '696': ['or', 'bijoux'],
 '329': ['acces d', 'cra'],
 '95': ['staples', 'desk'],
 '423': ['bankruptcy', 'hollywoodpq'],
 '279': ['credit', 'mortgage'],
 '1160': ['business', 'pret'],
 '813': ['student', 'tuition'],
 '466': ['mortgage', 'calculator'],
 '468': ['lease', 'calculator'],
 '46': ['farm', 'plant'],
 '750': ['forest', 'forestry'],
 '747': 

### Get all the keywords and create a list 

In [98]:
keywords_queries = [keyword for sublist in cat_queries_dict.values() for keyword in sublist]
len(keywords_queries)

282

### Remove duplicates from a list

In [99]:
keywords_queries = list(set(keywords_queries))
len(keywords_queries)

243

In [100]:
keywords_queries

['ship',
 'cover letter',
 'air canada',
 'godaddy',
 'project',
 'pharmacy',
 'gas',
 'logistics',
 'insurance',
 'homemade',
 'architects',
 'election results',
 'lawyer',
 'boat',
 'desjardins',
 'hotels',
 'hair removal',
 'finance',
 'netflix',
 'mortgage',
 'security',
 'rail',
 'cruise',
 'piscine',
 'covid',
 'staples',
 'glue',
 'or',
 'theatre',
 'job',
 'librairie',
 'potash',
 'cegep',
 'fashion',
 'domain',
 'birthday',
 'vet',
 'customs',
 'hospital',
 'fitness',
 'dashboard',
 'plastic',
 'plant',
 'ford',
 'doors',
 'oscar',
 'crm',
 'hockey',
 'dog',
 'couleur',
 'beer',
 'fabric',
 'trucks',
 'smoke',
 'architecture',
 'gift',
 'apple',
 'ordre',
 'air france',
 'hp',
 'dr',
 'vacations',
 'furniture',
 'yoga',
 'forest',
 'ttc',
 'anxiety',
 'mail',
 'madden',
 'fire',
 'trailers',
 'fish',
 'railway',
 'cra login',
 'hydro',
 'clinic',
 'toronto',
 'election',
 'tax',
 'cra',
 'windows',
 'home depot',
 'tractor',
 'car wash',
 'vacuum',
 'stock',
 'fifa',
 'chomage

### GDP keywords data from related topics 

In [101]:
file2 = open('data/gdp_cat_topics_dict.json')
cat_topics_dict = json.load(file2)
cat_topics_dict.values()

dict_values([['Cineplex Entertainment', 'Film'], ['Dance', 'Theater'], ['Car', 'Canada'], ['Canada', 'Honda'], ['Driving', 'License'], ['Health', 'Insurance'], ['Recipe', 'Restaurant'], ['Restaurant', 'Menu'], ['Clinic', 'Walk-in clinic'], ['Hospital', 'Clinic'], ['Fire', 'Building insulation'], ['Disease', 'Anxiety'], ['Canada', 'The Home Depot'], ['Sales', 'House'], ['RE/MAX', 'Real Estate'], ['Canada', 'Kijiji'], ['Hotel', 'Flight'], ['Hotel', 'Toronto'], ['Download', 'Canada'], ['Canada', 'Shoes'], ['Xbox', 'Canada'], ['Jewellery', 'Volkswagen Group'], ['Canada Revenue Agency', 'Canada Customs and Revenue Agency'], ['Staples Canada', 'Staples'], ['Bankruptcy', 'HollywoodPQ'], ['Credit', 'Mortgage loan'], ['Business', 'Account'], ['Student', 'Scholarship'], ['Mortgage loan', 'Calculator'], ['Car', 'Lease'], ['Plant', 'Tree'], ['Forest', 'Tree'], ['Fish', 'Salmon'], ['Circulaire', 'Grocery store'], ['Cigarette', 'Cigars & Cigarillos'], ['Shoes', 'Boot'], ['Printing', 'Cimpress'], ['P

In [103]:
len(cat_topics_dict)

141

In [102]:
cat_topics_dict

{'569': ['Cineplex Entertainment', 'Film'],
 '23': ['Dance', 'Theater'],
 '47': ['Car', 'Canada'],
 '815': ['Canada', 'Honda'],
 '170': ['Driving', 'License'],
 '249': ['Health', 'Insurance'],
 '71': ['Recipe', 'Restaurant'],
 '276': ['Restaurant', 'Menu'],
 '634': ['Clinic', 'Walk-in clinic'],
 '250': ['Hospital', 'Clinic'],
 '168': ['Fire', 'Building insulation'],
 '437': ['Disease', 'Anxiety'],
 '11': ['Canada', 'The Home Depot'],
 '29': ['Sales', 'House'],
 '96': ['RE/MAX', 'Real Estate'],
 '18': ['Canada', 'Kijiji'],
 '67': ['Hotel', 'Flight'],
 '179': ['Hotel', 'Toronto'],
 '5': ['Download', 'Canada'],
 '68': ['Canada', 'Shoes'],
 '78': ['Xbox', 'Canada'],
 '696': ['Jewellery', 'Volkswagen Group'],
 '329': ['Canada Revenue Agency', 'Canada Customs and Revenue Agency'],
 '95': ['Staples Canada', 'Staples'],
 '423': ['Bankruptcy', 'HollywoodPQ'],
 '279': ['Credit', 'Mortgage loan'],
 '1160': ['Business', 'Account'],
 '813': ['Student', 'Scholarship'],
 '466': ['Mortgage loan', 'Cal

### Get all the keywords and create a list 

In [104]:
keywords_topics = [keyword for sublist in cat_topics_dict.values() for keyword in sublist]
len(keywords_topics)

282

### Remove duplicates from a list

In [106]:
keywords_topics = list(set(keywords_topics))
len(keywords_topics)

213

In [107]:
keywords_topics

['Canada Revenue Agency',
 'Niagara Falls',
 'Plant',
 'Unemployment',
 'Paint',
 'Book',
 'The Globe and Mail',
 'Mortgage loan',
 'License',
 'Theater',
 'Election',
 'Furniture',
 'Gasoline',
 'Lawyer',
 'Boat',
 'Shopping mall',
 'Forest',
 'Tax',
 'Fertilizer',
 'Anxiety',
 'Advertising',
 'Cigars & Cigarillos',
 'Kijiji',
 'FIFA',
 'Hospital',
 'Textile',
 'Walmart',
 'Carpet',
 'Facebook',
 'General contractor',
 'Ferry',
 'GitHub',
 'Payment',
 'Trailer',
 'Java',
 'Coronavirus disease 2019',
 'Bank',
 'Microsoft Windows',
 'Scholarship',
 'Bidding',
 'Newspaper',
 'Business',
 'Global Positioning System',
 'Toronto Transit Commission',
 'ttc',
 'Liquor Control Board of Ontario',
 'Domain name',
 'Credit',
 'SQL',
 'Toronto',
 'Adhesive',
 'Recipe',
 'Insurance',
 'Expedia',
 'Shaving',
 'Air conditioning',
 'Clinic',
 'Food',
 'Ship',
 'IKEA',
 'Flooring',
 'Jean Coutu Group',
 'Trade fair',
 'Real Estate',
 'Theatre',
 'Train',
 'Valve',
 'Employment',
 'Dashboard',
 'Walmart

## Keywords from Organisation for Economic Co-operation and Development (OECD)

In [108]:
oecd_key = pd.read_csv('data/OECD_keyword.csv')
oecd_key.head()

Unnamed: 0,category_name
0,Economic crisis
1,Crisis
2,Recession
3,Financial crisis
4,Krach


In [109]:
oecd_key.shape

(134, 1)

In [110]:
oecd_keywords = oecd_key["category_name"].values.tolist()
oecd_keywords

['Economic crisis',
 'Crisis',
 'Recession',
 'Financial crisis',
 'Krach',
 'Unemployment',
 'Unemployment benefits',
 'Welfare & Unemployment',
 'Food & Drink',
 'GPS & Navigation ',
 'Performing Arts             ',
 'Luggage topic',
 'Vehicle',
 'Brands',
 'Birthday',
 'Travel',
 'Energy & Utilities',
 'Vehicle Shopping',
 'Tobacco Products',
 'Health',
 'Pharmacy',
 'Carpooling & Ridesharing',
 'Sports',
 'Animal Products & Services',
 'Fitness',
 'Weddings',
 'Car',
 'Rental & Taxi Services',
 'Autos & Vehicles',
 'Tourist Destinations',
 'Home & Garden',
 'Events & Listings',
 'Grocery & Food Retailers',
 'Vehicle Licensing & Registration',
 'Timeshares & Vacation Properties',
 'Home',
 'Appliances',
 'Mass Merchants & Department Stores',
 'Car Electronics',
 'Fashion & Style',
 'Trucks & SUVs',
 'Home Furnishings',
 'Footwear',
 'Cruises & Charters',
 'Hotels & Accommodations',
 'Luggage & Travel',
 'Accessories',
 'Fast Food',
 'Book Retailers',
 'Veterinarians',
 'Spas & Beaut

In [111]:
len(oecd_keywords)

134

## Keywords from Institute of Business Administration (IBA), Karachi

In [87]:
df_cat_IBA = pd.read_excel('data/catagory_name_IBA.xlsx')
df_cat_IBA

Unnamed: 0,category_name
0,Economic crisis
1,Crisis
2,Recession
3,Financial crisis
4,Inflation
5,Unemployment
6,BISP
7,ehsaas program
8,USAID
9,Credit


In [88]:
iba_category_name = df_cat_IBA["category_name"].values.tolist()
iba_category_name

['Economic crisis',
 'Crisis',
 'Recession',
 'Financial crisis',
 'Inflation',
 'Unemployment',
 'BISP',
 'ehsaas program',
 'USAID',
 'Credit',
 'Loan',
 'Interest',
 'House Loan',
 'Car Loan',
 'Food',
 'Cinema',
 'Cars',
 'Birthday',
 'Travel',
 'Weddings',
 'Fitness',
 'Cigarette',
 'Tourism',
 'Hotels',
 'Fast Food',
 'House for sale',
 'Construction',
 'Investment',
 'Jobs',
 'Agriculture',
 'FMCG',
 'Aviation',
 'Manufacturing',
 'Textile',
 'Economy News',
 'Business News',
 'World News',
 'Politics',
 'Newspapers',
 'mehngai',
 'Real estate',
 'deficit',
 'elections',
 'parliament',
 'taxes',
 'government',
 'budget',
 'economic growth',
 'subsidy',
 'current account',
 'trade',
 'protest',
 'stock market',
 'revenue',
 'LSM',
 'M0',
 'PSB',
 'CPI']

In [89]:
keywords = iba_category_name
len(keywords)

58