## 1. Preprocess Google Trends Data: Top 25 Queries
We read the downloaded CSV files from Google Trends into a single DataFrame and store it as one CSV file.
First, we do this with the annual data:

In [None]:
import pandas as pd
from pathlib import Path
import numpy as np

# Read every CSV file in the annual "google-files" folder, tag each row with
# the year taken from the file name, and store the combined table as
# google_trends_total.csv.
# NOTE: the path must not start with a space; pathlib would otherwise treat it
# as a relative path below a directory literally named " ".
base_path = Path(
    "/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/annual/")
csvpath = base_path / "google-files"

# Collect the per-year tables and concatenate once at the end; growing a
# DataFrame inside the loop re-copies all previously read rows each time.
tables = []
# iterdir() yields entries in arbitrary order, sorting keeps runs reproducible
for file in sorted(csvpath.iterdir()):
    # only regular files can be parsed as CSV, skip sub-directories
    if not file.is_file():
        continue
    # the year is the second '-'-separated token of the file name
    # (assumes names of the form <prefix>-<YYYY>... -- TODO confirm)
    year = file.stem.split('-')[1]
    # lexicographic comparison is sufficient for four-digit years
    if year < '2023':
        table = pd.read_csv(file)
        table['year'] = int(year)
        tables.append(table)

result = pd.concat(tables) if tables else pd.DataFrame()
# newest year first, and within a year the highest score first
result = result.sort_values(by=['year', 'score'], ascending=False)
print(result)

output_file = base_path / "google_trends_total.csv"
# remove a result left over from a previous run before writing the new one
if output_file.exists():
    output_file.unlink()

result.to_csv(output_file, index=False)

Now we do the same with the monthly google trends data:

In [None]:
from pathlib import Path
import pandas as pd
# Read the top-25 table from every monthly Google Trends export, rank the
# queries of each file by score, and store the combined table as
# google_trends_total.csv.
# NOTE: the path must not start with a space; pathlib would otherwise treat it
# as a relative path below a directory literally named " ".
base_path = Path(
    "/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/monthly/")
csvpath = base_path / "google-files"

# Collect the per-file tables and concatenate once at the end; growing a
# DataFrame inside the loop re-copies all previously read rows each time.
frames = []
# iterdir() yields entries in arbitrary order, sorting keeps runs reproducible
for path in sorted(csvpath.iterdir()):
    if not path.is_file():
        continue
    # skip the first two lines of the export and read at most 25 rows,
    # none of which is treated as a header
    df = pd.read_csv(path, skiprows=2, nrows=25, header=None)
    df.columns = ['query', 'score']
    df = df.sort_values(by='score', ascending=False)
    # rank 1 = highest score; the builtin range keeps this cell independent
    # of a numpy import made in another cell
    df['rank'] = range(1, len(df) + 1)
    # NOTE(review): rows are not tagged with the file (month) they came from
    # -- confirm that this is intended.
    frames.append(df)

data = pd.concat(frames) if frames else pd.DataFrame()

output_file = base_path / "google_trends_total.csv"
# remove a result left over from a previous run before writing the new one
if output_file.exists():
    output_file.unlink()

data.to_csv(output_file, index=False)

## 2. Preprocess Google Trends Data: Time Series
We read the time series of the selected queries into one DataFrame and store it.

In [None]:
from pathlib import Path
import pandas as pd

# Read the Google Trends time series of every selected query (one CSV file
# per query), normalise the scores, and store the combined table as
# google_trends_total.csv.
base_path = Path("/mnt/ceph/storage/data-in-progress/data-teaching/theses/thesis-schneg/google_trends/time-series/")
csvpath = base_path / "google-files"

# Collect the per-query tables and concatenate once at the end; growing a
# DataFrame inside the loop re-copies all previously read rows each time.
frames = []
# iterdir() yields entries in arbitrary order, sorting keeps runs reproducible
for path in sorted(csvpath.iterdir()):
    if not path.is_file():
        continue
    # skip the first three lines of the export; no row is used as a header
    df = pd.read_csv(path, skiprows=3, header=None)
    df.columns = ['time', 'score']
    # the query is the file name without its extension
    df['query'] = path.stem
    # A non-numeric score column contains string values such as '<1'.
    # Checking the dtype of the whole column (instead of only the type of the
    # first element) also covers columns whose first value is not a string
    # and does not fail on an empty table.
    if not pd.api.types.is_numeric_dtype(df['score']):
        # map the '<1' placeholder to 0.5 (literal match, no regex),
        # then convert the scores to float
        df['score'] = df['score'].str.replace('<1', '0.5', regex=False)
        df['score'] = df['score'].astype(float)
    print(f"Query: {path.stem}, Size: {len(df['score'])}, \nScore: {df['score'].to_numpy()}")

    frames.append(df)

data = pd.concat(frames) if frames else pd.DataFrame()
print(data)

output_file = base_path / "google_trends_total.csv"
# remove a result left over from a previous run before writing the new one
if output_file.exists():
    output_file.unlink()

data.to_csv(output_file, index=False)

Query: google, Size: 228, 
Score: [  6   6   6   8   7   7   7   7   7   7   7   8   8   8   8  11   9  10
  18  23  23  21  20  19  21  21  21  21  22  23  24  22  23  25  24  25
  25  26  25  27  27  29  27  29  27  27  28  27  27  28  28  28  28  29
  28  31  35  32  32  33  33  36  36  38  37  36  34  37  37  40  40  41
  41  41  43  43  44  41  39  42  46  46  46  46  46  48  50  53  54  55
  58  55  58  55  59  55  56  53  57  56  58  56  58  59  60  70  72  82
  76  81  81  81  81  86  85  81  91  91  91  89  91  86  90  90  89  87
  88  90  95 100  89  87  91  82  90  87  91  82  80  84  89  86  85  80
  79  73  81  76  72  68  64  66  71  75  68  64  66  67  64  65  65  60
  56  58  64  66  62  60  61  64  60  59  58  52  51  49  57  57  57  51
  54  55  54  54  51  51  49  52  57  57  56  52  53  55  60  68  65  53
  52  55  61  55  51  50  48  50  50  48  47  47  46  45  47  46  45  44
  48  51  52  52  51  52  51  50  54  52  49  48]
Query: free, Size: 228, 
Score: [ 98  93