In [6]:
import os
import pickle
import pandas as pd
import time
import random
import gtrend
from pytrends.request import TrendReq
from sklearn.decomposition import PCA

# --- Google Trends client -------------------------------------------------
pytrend = TrendReq(hl='en-US')

# --- What to search for ---------------------------------------------------
# `name` labels this keyword group; it is embedded in every output filename.
name = 'misc'
keywords = [
    "CBDC",
    "Central Bank Digital Currency",
]

# --- Date window for the trends data (YYYY-MM-DD) -------------------------
start, end = '2020-01-01', '2021-01-01'

# --- Geographic filter ----------------------------------------------------
# An empty string means worldwide. Otherwise pass a country code, optionally
# followed by a region code:
#   'US'      United States (entire country)
#   'US-CA'   California, United States (specific state)
#   'GB'      Great Britain (entire country)
#   'GB-ENG'  England, Great Britain (specific region)
#   'IN'      India (entire country)
#   'RU'      Russia (entire country)
geo = ''

# --- Category filter ------------------------------------------------------
# Integer category id that narrows results to a topic; 0 applies no filter.
#   71   Food & Drink
#   7    Business & Industrial
#   174  Travel
#   23   News
# The full list can be found by inspecting Google Trends URLs or online lists.
cat = 0

# --- Google property ------------------------------------------------------
# One of 'news', 'images', 'froogle', 'youtube'; empty string for web search.
gprop = ''


# Output folders: one for the pickle checkpoint, one for CSV exports.
pickle_dir, csv_dir = 'pickles', 'csv'
for out_dir in (pickle_dir, csv_dir):
    os.makedirs(out_dir, exist_ok=True)

# File names encode the query parameters (group name, geo, category, dates).
pkl_filename = f'{pickle_dir}/{name}_{geo}_{cat}_{start}_{end}.pkl'
csv_filename = f'{csv_dir}/{name}_{geo}_{cat}_{start}_{end}_google_trends.csv'

# Resume from a previous run's checkpoint when there is one; otherwise begin
# with no collected trends.
try:
    cached = open(pkl_filename, 'rb')
except FileNotFoundError:
    trends = []
else:
    with cached:
        trends = pickle.load(cached)

# Resume point: `trends` holds one entry per keyword already fetched, so its
# length is the index of the first keyword still to query.
# NOTE(review): this assumes `keywords` has not been reordered or edited since
# the checkpoint was written -- confirm before reusing an old pickle.
start_idx = len(trends)
last_idx = len(keywords) - 1

# Query each remaining keyword and checkpoint after every successful fetch.
for i, keyword in enumerate(keywords[start_idx:], start=start_idx):
    print(f"Fetching trend data for: {keyword}")

    # Fetch the daily trend data for this keyword over the configured window.
    overlapping = gtrend.get_daily_trend(
        pytrend, keyword, start, end,
        geo=geo, cat=cat, gprop=gprop, verbose=True, tz=0,
    )

    # Keep everything except the helper 'overlap' column.
    trends.append(overlapping.drop(columns='overlap'))

    # Checkpoint atomically: write to a temporary file and then swap it into
    # place, so an interruption mid-write cannot leave a truncated pickle that
    # would break the resume logic on the next run.
    tmp_filename = f'{pkl_filename}.tmp'
    with open(tmp_filename, 'wb') as file:
        pickle.dump(trends, file)
    os.replace(tmp_filename, pkl_filename)

    print(f"Saved updated trends to {pkl_filename}")

    # Throttle between keywords to avoid being blocked by Google Trends.
    # There is nothing left to throttle after the final keyword, so skip the
    # wait there instead of stalling the rest of the script.
    if i < last_idx:
        time.sleep(random.gammavariate(2.99, 3.99) + 50)

# Place the per-keyword results side by side in a single table.
data = pd.concat(trends, axis='columns')

# Export the combined table (index included, which is to_csv's default).
data.to_csv(csv_filename)
print(f"Data saved to CSV: {csv_filename}")

# Reduce the combined trend table to its first principal component.
# NOTE(review): `data` is passed to PCA as-is; presumably it must contain no
# missing values for fit_transform to succeed -- confirm for new keyword sets.
print("Performing PCA on trend data...")
pca = PCA(n_components=1)  # Only the first principal component is kept
pca_result = pca.fit_transform(data)

# Wrap the component scores in a DataFrame that shares the trend data's index.
pca_df = pd.DataFrame(pca_result, index=data.index, columns=[f'{name}_pca_component_1'])

# Save the PCA results to a CSV file. The name uses the same underscore-
# separated parameter layout as the pickle and trends CSV filenames, so the
# group name, geo, category and dates stay distinguishable.
pca_csv_filename = f'{csv_dir}/{name}_{geo}_{cat}_{start}_{end}_pca_component_1.csv'
pca_df.to_csv(pca_csv_filename)

print(f"PCA first component saved to CSV: {pca_csv_filename}")

# Summary of the run.
print(f"Total number of trends collected: {len(trends)}")
print(f"PCA explained variance ratio: {pca.explained_variance_ratio_[0]}")


Fetching trend data for: CBDC
Fetching 'CBDC' for period:2020-04-07 2021-01-01


  ol_temp.iloc[:,:] = None


Fetching 'CBDC' for period:2019-10-21 2020-07-16


  ol_temp.iloc[:,:] = None


Normalize by overlapping period:2020-04-07 2020-07-16
Fetching 'CBDC' for period:2019-05-05 2020-01-29


  ol_temp.iloc[:,:] = None


Normalize by overlapping period:2019-10-21 2020-01-29
Saved updated trends to pickles/misc__0_2020-01-01_2021-01-01.pkl
Fetching trend data for: Central Bank Digital Currency
Fetching 'Central Bank Digital Currency' for period:2020-04-07 2021-01-01


  ol_temp.iloc[:,:] = None


Fetching 'Central Bank Digital Currency' for period:2019-10-21 2020-07-16
The request failed: Google returned a response with code 429
Trying again in 65 seconds.


  ol_temp.iloc[:,:] = None


Normalize by overlapping period:2020-04-07 2020-07-16
Fetching 'Central Bank Digital Currency' for period:2019-05-05 2020-01-29


  ol_temp.iloc[:,:] = None


Normalize by overlapping period:2019-10-21 2020-01-29
Saved updated trends to pickles/misc__0_2020-01-01_2021-01-01.pkl
Data saved to CSV: csv/misc__0_2020-01-01_2021-01-01_google_trends.csv
Performing PCA on trend data...
PCA first component saved to CSV: csv/misc02020-01-012021-01-01_pca_component_1.csv
Total number of trends collected: 2
PCA explained variance ratio: 0.6252808525317468
