In [148]:
# https://platform.olimpiada-ai.ro/problems/55

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import datetime

In [149]:
# Read the train and test CSVs from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/gold-price/train.csv",
        "/kaggle/input/gold-price/test.csv",
    )
)

# Cell output: (rows, columns) of each frame.
train.shape, test.shape

((2975, 48), (744, 47))

In [150]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,ID,date,sp500 open,sp500 high,sp500 low,sp500 close,sp500 volume,sp500 high-low,nasdaq open,nasdaq high,...,palladium high,palladium low,palladium close,palladium volume,palladium high-low,gold open,gold high,gold low,gold close,gold volume
0,R00210,2010-11-11,121.05,121.82,120.68,121.64,157659616.0,1.14,52.9,53.48,...,71.48,69.45,71.07,279282.0,2.03,137.62,137.75,136.45,137.66,15380502.0
1,R03365,2023-05-26,415.33,420.77,415.25,420.02,93829975.0,5.52,340.76,349.245,...,133.61,131.37,131.37,20046.0,2.24,181.01,181.3,180.09,180.92,5823674.0


In [151]:
# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2975 entries, 0 to 2974
Data columns (total 48 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  2975 non-null   object 
 1   date                2975 non-null   object 
 2   sp500 open          2975 non-null   float64
 3   sp500 high          2975 non-null   float64
 4   sp500 low           2975 non-null   float64
 5   sp500 close         2975 non-null   float64
 6   sp500 volume        2975 non-null   float64
 7   sp500 high-low      2975 non-null   float64
 8   nasdaq open         2975 non-null   float64
 9   nasdaq high         2975 non-null   float64
 10  nasdaq low          2975 non-null   float64
 11  nasdaq close        2975 non-null   float64
 12  nasdaq volume       2975 non-null   float64
 13  nasdaq high-low     2975 non-null   float64
 14  us_rates_%          94 non-null     float64
 15  CPI                 94 non-null     float64
 16  usd_ch

In [152]:
# Print column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 47 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  744 non-null    object 
 1   date                744 non-null    object 
 2   sp500 open          744 non-null    float64
 3   sp500 high          744 non-null    float64
 4   sp500 low           744 non-null    float64
 5   sp500 close         744 non-null    float64
 6   sp500 volume        744 non-null    float64
 7   sp500 high-low      744 non-null    float64
 8   nasdaq open         744 non-null    float64
 9   nasdaq high         744 non-null    float64
 10  nasdaq low          744 non-null    float64
 11  nasdaq close        744 non-null    float64
 12  nasdaq volume       744 non-null    float64
 13  nasdaq high-low     744 non-null    float64
 14  us_rates_%          23 non-null     float64
 15  CPI                 23 non-null     float64
 16  usd_chf 

In [153]:
from sklearn.impute import SimpleImputer

def process_df(df):
    """Drop three macro columns and encode ``date`` as elapsed seconds.

    The work is done on a new frame, so the frame passed in is left
    untouched and the function can be called again on the same raw input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns ``GDP``, ``us_rates_%`` and ``CPI`` plus a
        ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``GDP``, ``us_rates_%`` and ``CPI``, in which
        ``date`` holds float seconds elapsed since 2010-01-01 00:00:00.

    Raises
    ------
    KeyError
        If ``date`` or any of the columns to drop is missing.
    """
    # drop() without inplace returns a new frame; the caller's data is not mutated.
    out = df.drop(columns=['GDP', 'us_rates_%', 'CPI'])
    # Vectorised replacement for mapping total_seconds() over each row.
    elapsed = pd.to_datetime(out['date']) - pd.Timestamp("2010-01-01")
    out['date'] = elapsed.dt.total_seconds()
    return out

# Run the shared preprocessing on both splits (train first, then test).
train, test = process_df(train), process_df(test)

# Median-fill the listed columns; the medians are learned from train only
# and then reused unchanged for test.
cols_to_impute = ['usd_chf', 'eur_usd']
imputer = SimpleImputer(strategy='median')
imputer.fit(train[cols_to_impute])

train[cols_to_impute] = imputer.transform(train[cols_to_impute])
test[cols_to_impute] = imputer.transform(test[cols_to_impute])

In [154]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Target column, and predictors = every column except the row ID and the target.
target_col = 'gold close'
features = [col for col in train.columns if col not in ('ID', target_col)]

X = train[features]
y = train[target_col]
X_test = test[features]

# Hold out 20% of the labelled rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CatBoost Pool wrappers for the train split, the validation split and all rows.
train_pool, valid_pool, full_pool = (
    Pool(X_train, y_train),
    Pool(X_valid, y_valid),
    Pool(X, y),
)

In [155]:
from sklearn.linear_model import LinearRegression

# Linear-regression baseline, fit on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [156]:
# from catboost import CatBoostRegressor

# params = {
#     'iterations': 10000,
#     'loss_function': 'RMSE',
#     'eval_metric': 'RMSE',
#     'metric_period': 500,
#     'max_depth': 6,
#     'random_state': 42
# }

# model = CatBoostRegressor(**params)

# model.fit(train_pool, eval_set=valid_pool)

In [157]:
from sklearn.metrics import mean_squared_error

# Predict on the hold-out split and report root-mean-squared error.
y_pred = model.predict(X_valid)

# mean_squared_error expects (y_true, y_pred). Its `squared=False` flag was
# deprecated in scikit-learn 1.4 and later removed, so take the square root
# explicitly; this form works on every scikit-learn version.
score = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f'Score: {score:.5f}')

Score: 0.27528


In [158]:
# Refit the same estimator on all labelled rows (training + validation split)
# before predicting on the test set.
model.fit(X, y)

In [159]:
# Predict the target for the test features.
y_pred = model.predict(X_test)

# Pair each test ID with its prediction.
subm = pd.DataFrame(data={'ID': test['ID'], 'gold close': y_pred})

# Save without the index column, then preview the first rows.
subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,ID,gold close
0,R00904,132.360068
1,R03254,165.515499
2,R01784,116.861114
3,R01972,121.182856
4,R02105,123.585974
