In [1]:
# This notebook is meant to be run in the Great Lakes computing environment
# in order to maximize the amount of data we can extract for our modeling task.

In [2]:
import pandas as pd   
import numpy as np
import pytrends
from pytrends.request import TrendReq
import gtab
import matplotlib.pyplot as plt
import os

In [3]:
def get_single_keyword_trend_data_gtab(keyword, region='US', time_period='2020-01-01 2024-10-11',
                                       gtab_client=None):
    """
    Query Google Trends data using GTAB for a single keyword, region, and time period.

    Args:
        keyword (str): The keyword to search. Must be a non-blank string.
        region (str): Region code (default is 'US').
        time_period (str): Timeframe for the data (default is '2020-01-01 2024-10-11').
        gtab_client: Optional, already-initialised GTAB object to reuse. When
            omitted, a fresh ``gtab.GTAB()`` is created for this call, which is
            what the function always did before this parameter existed.

    Returns:
        pd.DataFrame | None: A frame with the former index as a regular column
        (callers in this notebook merge on it as 'date') followed by one column,
        named after the keyword, holding GTAB's 'max_ratio' values. None is
        returned when the keyword is invalid, when no data comes back, or when
        the query raises; a message is printed in each of those cases.
    """
    # Validate before constructing GTAB so that a bad keyword costs nothing.
    if not isinstance(keyword, str) or not keyword.strip():
        print("An error occurred: Invalid keyword provided.")
        return None

    # Constructed outside the try block on purpose: a failure to initialise
    # GTAB is an environment problem and should surface to the caller.
    client = gtab.GTAB() if gtab_client is None else gtab_client

    try:
        print(f"Querying keyword: {keyword} in region: {region} for the period {time_period}")

        # NOTE(review): the notebook's captured output shows the active anchorbank
        # as 'google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv', which
        # does not match the region/time_period configured here -- confirm that
        # calibrating against that anchorbank is intended.
        client.set_options(pytrends_config={"timeframe": time_period, 'geo': region})
        trend_data = client.new_query(keyword)

        # new_query may hand back something that is not a DataFrame instead of
        # raising (GTAB is understood to return -1 for keywords it cannot
        # calibrate -- confirm against the installed version). Treat that the
        # same as "no data" rather than tripping over a missing `.empty`.
        if not isinstance(trend_data, pd.DataFrame) or trend_data.empty:
            print(f"No data found for keyword: {keyword}")
            return None

        # Keep only the point estimate, label it with the keyword, and move the
        # index into a column so frames for several keywords can be merged.
        return (
            trend_data[['max_ratio']]
            .rename(columns={'max_ratio': keyword})
            .reset_index()
        )

    except Exception as e:
        # Best-effort by design: one failing keyword must not abort a batch run.
        print(f"An error occurred: {e}")
        return None

In [4]:
def load_keywords_from_file(file_path):
    """
    Load keywords from a text file holding comma-separated values on one line.

    The keyword line is the first line that contains any non-whitespace
    characters, so leading blank lines no longer cause an empty result.
    Any lines after the keyword line are ignored.

    Args:
        file_path (str): Path to the file containing keywords.

    Returns:
        list: Keywords in file order, stripped of surrounding whitespace, with
        empty entries (e.g. from ",," or a trailing comma) removed. An empty
        list is returned when the file has no non-blank line.
    """
    with open(file_path, 'r') as file:
        # Iterating the handle stops at the first match, so the rest of the
        # file is never read.
        keyword_line = next((line for line in file if line.strip()), '')

    stripped_parts = (part.strip() for part in keyword_line.split(','))
    return [part for part in stripped_parts if part]


In [5]:
def process_keywords_for_file(file_path, region, time_period):
    """
    Fetch Google Trends data for every keyword in one file and combine it.

    Each keyword is queried individually through
    get_single_keyword_trend_data_gtab; keywords for which that helper returns
    None are skipped. The per-keyword frames are outer-merged on 'date', so
    the result keeps every date seen for any keyword.

    Args:
        file_path (str): Path to the file containing keywords.
        region (str): Region code.
        time_period (str): Timeframe for the data.

    Returns:
        pd.DataFrame: Combined trend data for all keywords in the file, or an
        empty DataFrame when no keyword produced any data.
    """
    # dict.fromkeys removes exact duplicates while keeping file order. A
    # repeated keyword would otherwise be queried twice and collide on merge,
    # leaving '<keyword>_x' / '<keyword>_y' columns in the output.
    keywords = list(dict.fromkeys(load_keywords_from_file(file_path)))

    combined_trend_data = None  # None means "no keyword has produced data yet"

    for keyword in keywords:
        trend_data = get_single_keyword_trend_data_gtab(keyword, region, time_period)
        if trend_data is None:
            continue

        if combined_trend_data is None:
            combined_trend_data = trend_data
        else:
            combined_trend_data = pd.merge(combined_trend_data, trend_data, on='date', how='outer')

    # Callers expect a DataFrame even when nothing was retrieved.
    return pd.DataFrame() if combined_trend_data is None else combined_trend_data

In [None]:
if __name__ == "__main__":
    # Absolute locations for the environment this notebook targets; change
    # them when running elsewhere (e.g. os.path.join(os.getcwd(), 'text_files')).
    folder_path = '/home/jshumway/text_files'
    output_folder = '/home/jshumway/csv_files'
    region = 'US'
    time_period = '2020-01-01 2024-10-11'

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed in the same order on every run
    # (os.listdir returns entries in arbitrary order).
    for filename in sorted(os.listdir(folder_path)):
        # NOTE(review): .rtf files are accepted here, but load_keywords_from_file
        # reads the file as plain text, so any RTF markup would end up inside the
        # keywords -- confirm these files really contain plain text.
        if not filename.endswith((".txt", ".rtf")):
            continue

        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # A directory that happens to be named like a keyword file.
            continue

        # Process the file and get the combined trend data
        combined_trend_data = process_keywords_for_file(file_path, region, time_period)

        if combined_trend_data.empty:
            # Writing an empty frame would leave a CSV that cannot be read back.
            print(f"No trend data retrieved for {filename}; skipping CSV output.")
            continue

        # One CSV per keyword file: <original stem>_trend_data.csv
        output_csv = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}_trend_data.csv")
        combined_trend_data.to_csv(output_csv, index=False)

Using directory '/Users/Jenny/MADS/Capstone/myenv/lib/python3.12/site-packages/gtab'
Active anchorbank changed to: google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv

Querying keyword: Basketballs in region: US for the period 2020-01-01 2024-10-11
Using /Users/Jenny/MADS/Capstone/myenv/lib/python3.12/site-packages/gtab/output/google_anchorbanks/google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv
New query 'Basketballs'
New query calibrated!
Using directory '/Users/Jenny/MADS/Capstone/myenv/lib/python3.12/site-packages/gtab'
Active anchorbank changed to: google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv

Querying keyword: Soccer balls in region: US for the period 2020-01-01 2024-10-11
Using /Users/Jenny/MADS/Capstone/myenv/lib/python3.12/site-packages/gtab/output/google_anchorbanks/google_anchorbank_geo=_timeframe=2019-01-01 2020-08-01.tsv
New query 'Soccer balls'
New query calibrated!
Using directory '/Users/Jenny/MADS/Capstone/myenv/lib/python3.12/site-packages