## 1. Preprocess Google Trends Data: Top 25 Queries
We read the CSV files downloaded from Google Trends into a single DataFrame and store it.
First, we do this with the annual data:

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Read every CSV file in the annual "google-files" folder, tag each table with
# its year, and write the concatenation to a single CSV file.
# The year is taken from the second dash-separated token of the file stem.
# NOTE: the path must not start with a space, otherwise pathlib treats it as a
# relative path and the directory is not found.
base_path = Path(
    "/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/annual/")
csvpath = base_path / "google-files"

# Collect the per-year tables and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
tables = []
for file in csvpath.iterdir():
    tokens = file.stem.split('-')
    # Skip directories and entries whose name carries no numeric year token
    # (e.g. checkpoint folders), instead of crashing on them.
    if not file.is_file() or len(tokens) < 2 or not tokens[1].isdigit():
        continue
    year = int(tokens[1])
    # Keep only years before 2023; compare as integers, not as strings.
    if year < 2023:
        table = pd.read_csv(file)
        table['year'] = year
        tables.append(table)

# Fall back to an empty frame with the sort columns so the cell still runs
# (and shows an empty result) when no file matched.
result = pd.concat(tables) if tables else pd.DataFrame(columns=['year', 'score'])
result = result.sort_values(by=['year', 'score'], ascending=False)
print(result)

output_file = base_path / "google_trends_total.csv"
# Remove a previous export before writing the new one.
if output_file.exists():
    output_file.unlink()

result.to_csv(output_file, index=False)

Now we do the same with the monthly Google Trends data:

In [None]:
from pathlib import Path
import pandas as pd
# Read the top-25 tables from every monthly Google Trends export, rank the
# queries of each file by score, and write the concatenation to a single CSV.
# NOTE: the path must not start with a space, otherwise pathlib treats it as a
# relative path and the directory is not found.
base_path = Path(
    "/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/monthly/")
csvpath = base_path / "google-files"

# Collect the per-file tables and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
frames = []
for path in csvpath.iterdir():
    if path.is_file():
        # Skip the first two lines of the file (presumably the export
        # preamble -- confirm) and read at most 25 data rows.
        df = pd.read_csv(path, skiprows=2, nrows=25, header=None)
        df.columns = ['query', 'score']
        df = df.sort_values(by='score', ascending=False)
        # Rank 1 = highest score within this file. A plain range is used so
        # the cell does not depend on numpy having been imported by an
        # earlier cell.
        df['rank'] = range(1, len(df) + 1)
        # NOTE(review): the combined table does not record which file (month)
        # a row came from -- confirm downstream code does not need it.
        frames.append(df)

data = pd.concat(frames) if frames else pd.DataFrame()

output_file = base_path / "google_trends_total.csv"
# Remove a previous export before writing the new one.
if output_file.exists():
    output_file.unlink()

data.to_csv(output_file, index=False)

## 2. Preprocess Google Trends Data: Time Series
We read the time series of selected queries into one DataFrame and store it.

In [None]:
from pathlib import Path
import pandas as pd

# Read the time series export of every selected query, label each row with
# its query, normalise the scores to floats, and write the concatenation to a
# single CSV file.
base_path = Path("/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/time-series/")
csvpath = base_path / "google-files"

# Collect the per-query tables and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
frames = []
for path in csvpath.iterdir():
    if path.is_file():
        # Skip the first three lines of the file (presumably the export
        # preamble and header -- confirm) and name the columns ourselves.
        df = pd.read_csv(path, skiprows=3, header=None)
        df.columns = ['time', 'score']
        # the query is the file name, we get it by removing the extension
        df['query'] = path.stem
        # A non-numeric score column means pandas found string values in it.
        # Check the column dtype rather than the first element, so the test
        # does not fail on an empty frame or depend on the storage type.
        if not pd.api.types.is_numeric_dtype(df['score']):
            # convert the "<1" values to 0.5 (literal match, not a regex)
            df['score'] = df['score'].str.replace('<1', '0.5', regex=False)
            # convert the score to float
            df['score'] = df['score'].astype(float)
        print(f"Query: {path.stem}, Size: {len(df['score'])}, \nScore: {df['score'].to_numpy()}")

        frames.append(df)

data = pd.concat(frames) if frames else pd.DataFrame()
print(data)

output_file = base_path / "google_trends_total.csv"
# Remove a previous export before writing the new one.
if output_file.exists():
    output_file.unlink()

data.to_csv(output_file, index=False)