<a href="https://colab.research.google.com/github/blackbudge98-cpu/gt-markets/blob/main/Google_Keywords_as_a_predictive_indicator_of_USD_trading_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Google Keywords as a predictive indicator of USD trading performance**

The Project is exploring how Google Trend KeyWord Data can be used in forward validation to determine the probability of a movement in a trading pair.

Our Control Variable will be USD to determine's its performance on the following trading pairs:

*   USD to Chinese Yuan
*   USD to BTC
*   USD to Oil
*   USD to Gold





In [25]:
#Use of the following libaries will assist in providing the project manager with the data

import yfinance as yf
import pandas as pd
from datetime import date

#The first data set we will want to see is USD over a 10 year period

tickers = ["USD", "USDCNY=X","BTC-USD","CL=F", "GC=F"]

#The definition of tickers will assist in a batch query rather than a singular batch query

df = yf.download(tickers, period="10y", interval="1d")["Close"]

#Rename the columns to be more user friendly, and align with our assumptions

df.rename(columns={"CL=F":"USD to Oil","GC=F":"USD to Gold","BTC-USD": "USD to BTC","USDCNY=X": "USD to Chinese Yuan"},inplace=True)

#print headers for 10 rows to see what the data looks like

print(df.head(10))

  df = yf.download(tickers, period="10y", interval="1d")["Close"]
[*********************100%***********************]  5 of 5 completed

Ticker      USD to BTC  USD to Oil  USD to Gold       USD  USD to Chinese Yuan
Date                                                                          
2015-08-29  229.779999         NaN          NaN       NaN                  NaN
2015-08-30  228.761002         NaN          NaN       NaN                  NaN
2015-08-31  230.056000   49.200001  1131.599976  1.390876               6.3785
2015-09-01  228.121002   45.410000  1138.699951  1.281003               6.3664
2015-09-02  229.283997   46.250000  1132.500000  1.341353               6.3545
2015-09-03  227.182999   46.750000  1123.699951  1.383659               6.3459
2015-09-04  230.298004   46.049999  1120.599976  1.345162               6.3459
2015-09-05  235.018997         NaN          NaN       NaN                  NaN
2015-09-06  239.839996         NaN          NaN       NaN                  NaN
2015-09-07  239.847000         NaN          NaN       NaN               6.3459





In [28]:
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import os
from datetime import date

# Authenticate and mount Google Drive
drive.mount('/content/drive')
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Define the folder ID and filename
folder_id = '1tqNeIkQM2IawFLS-NHBzaDl8fsJAo0t_'
today = date.today()
filename = f"financial_data_raw_data_from_yf{today.strftime('%Y-%m-%d')}.csv"
filepath = f"/content/{filename}" # Save locally first

# Save the DataFrame to a temporary local CSV file
df.to_csv(filepath)

# Create a file in the shared drive
file_metadata = {
    'name': filename,
    'parents': [folder_id]
}

media = MediaFileUpload(filepath, mimetype='text/csv')

gfile = drive_service.files().create(
    body=file_metadata,
    media_body=media,
    fields='id'
).execute()

print(f"File ID: {gfile.get('id')}")
print(f"Data exported to shared drive folder: {folder_id}")

# Clean up the local file
os.remove(filepath)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File ID: 1_-jK0nuGdsQOsZ03lKsePRL_d24gic21
Data exported to shared drive folder: 1tqNeIkQM2IawFLS-NHBzaDl8fsJAo0t_


In [27]:
#Next step is to obtain information on the datatable, and apply pre-processing steps

df.info()

#understand how many numbers of rows are in the dataset
print('\n')
num_rows = len(df)
print(f"Number of rows: {num_rows}")

#the next step is to export the dataset as a csv file to enable a view of the data
from google.colab import drive
import os
drive.mount('/content/drive')

today = date.today()
filename = f"financial_data_raw_data_from_yf{today.strftime('%Y-%m-%d')}.csv"
directory = "/content/drive/Shared drives/1tqNeIkQM2IawFLS-NHBzaDl8fsJAo0t_"
filepath = f"{directory}/{filename}" # Construct the full path

# Create the directory if it doesn't exist - Note: This may not work for shared drives. Ensure the folder exists manually.
# os.makedirs(directory, exist_ok=True) # Commenting out as it might not work for shared drives.

df.to_csv(filepath)

print (f"Data exported to {filepath}")

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3654 entries, 2015-08-29 to 2025-08-29
Freq: D
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   USD to BTC           3654 non-null   float64
 1   USD to Oil           2515 non-null   float64
 2   USD to Gold          2514 non-null   float64
 3   USD                  2515 non-null   float64
 4   USD to Chinese Yuan  2603 non-null   float64
dtypes: float64(5)
memory usage: 171.3 KB


Number of rows: 3654
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


OSError: Cannot save file into a non-existent directory: '/content/drive/Shared drives/1tqNeIkQM2IawFLS-NHBzaDl8fsJAo0t_'

In [None]:
print("Blank values in the raw database")
print('\n')

#Identify the number of rows are blank
print(df.isna().sum())

print('\n')
blank_rate = (df.isna().sum() / num_rows) * 100
print("Blank Rate (%):")
print(blank_rate.round(2))

In [None]:
#After Identifying the blank rate in the original dataframe pre-processing needs to be applied
df_for_pre_processing = df.copy()
df_for_pre_processing['Day of Week'] = df_for_pre_processing.index.day_name()
print (df_for_pre_processing.head(10))

#We know Bitcoin trades all the time, but we want to see if there is any other blanks in the dataset
print('\n')
print("Blank values in the pre-processed database")
print('\n')
missing_values_per_day_of_week = df_for_pre_processing.groupby('Day of Week').apply(lambda g: g.isna().sum().sum())
print(missing_values_per_day_of_week)

In [None]:
#Therefore in the pre-processing dataset the decision will be made to drop both Saturday and Sunday
df_weekday = df_for_pre_processing.drop(df_for_pre_processing[(df_for_pre_processing['Day of Week'] == 'Saturday') | (df_for_pre_processing['Day of Week'] == 'Sunday')].index)
df_weekday_reordered = df_weekday[['Day of Week', 'USD', 'USD to Chinese Yuan', 'USD to BTC', 'USD to Oil', 'USD to Gold']]
df_weekday_reordered.head(10)

In [None]:
#Where data is missing the assumption will be to replace in the df_weekday dataframe with the previous value in the dataset
df_weekday_usd = df_weekday.drop(columns=['USD to BTC','USD to Oil','USD to Gold','USD to Chinese Yuan'])
df_weekday_usd_reordered = df_weekday_usd[['Day of Week', 'USD']]
df_weekday_usd_reordered.head(10)

In [None]:
#Next step is for any NaN is to show the dates
df_weekday_usd_reordered.isna().sum()



In [None]:
# Show dates where 'USD' is NaN
dates_with_missing_usd = df_weekday_usd_reordered[df_weekday_usd_reordered['USD'].isna()].index
print("Dates with missing 'USD' values:")
print(dates_with_missing_usd)

There are holidays in the United States which are not a set date but rather a day near of. For simplicity of the dataset it will be easier to use the previous close value.

In [None]:
# Where there is a NaaN going to use the previous day close to populate the value
df_weekday_usd_reordered['USD'] = df_weekday_usd_reordered['USD'].ffill()
df_weekday_usd_reordered.head(10)

In [None]:
#Now we add a daily change amount, and a percentage daily change to the dataset
df_weekday_usd_reordered['Daily Change'] = df_weekday_usd_reordered['USD'].diff()
df_weekday_usd_reordered['% Daily Change'] = df_weekday_usd_reordered['Daily Change'] / df_weekday_usd_reordered['USD']

# Fill the initial NaN values with 0
df_weekday_usd_reordered['Daily Change'].fillna(0, inplace=True)
df_weekday_usd_reordered['% Daily Change'].fillna(0, inplace=True)

df_weekday_usd_reordered.head(10)

In [29]:
#Export the DF Weekday USD Performance as a CSV File with the First Date - Last Date as the file name
from google.colab import drive
from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaFileUpload
import os
from datetime import date

# Authenticate and mount Google Drive (if not already mounted)
drive.mount('/content/drive')
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Define the folder ID and filename
folder_id = '1wOqTixtRA5n5uHN1ptfIbQm6suTrLDxA'
today = date.today()
filename = f"financial_data_pre_processed_data_from_yf_{today.strftime('%Y-%m-%d')}.csv" # Changed filename slightly to distinguish
filepath = f"/content/{filename}" # Save locally first

# Save the DataFrame to a temporary local CSV file
df_weekday_usd_reordered.to_csv(filepath)

# Create a file in the shared drive
file_metadata = {
    'name': filename,
    'parents': [folder_id]
}

media = MediaFileUpload(filepath, mimetype='text/csv')

gfile = drive_service.files().create(
    body=file_metadata,
    media_body=media,
    fields='id'
).execute()

print(f"File ID: {gfile.get('id')}")
print(f"Data exported to shared drive folder: {folder_id}")

# Clean up the local file
os.remove(filepath)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File ID: 1aEAPWO1KSiQvzmt6rB_beotoO2JJtiVH
Data exported to shared drive folder: 1wOqTixtRA5n5uHN1ptfIbQm6suTrLDxA
