# Setup

In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

In [72]:
data_dir = '/public_dataset/train/price_data'
# Keep only the CSV files in the folder; the file name without its extension
# is used as the coin symbol.
files = list(filter(lambda name: name.endswith('.csv'), os.listdir(data_dir)))
print(files)
coin_list = [os.path.splitext(name)[0] for name in files]
print(coin_list)  # List of coins to make predictions for.

['DAI.csv', 'CHR.csv', 'KIN.csv', 'CHAT.csv', 'DIA.csv', 'AUC.csv', 'DX.csv', 'KMD.csv', 'DMG.csv', 'CKB.csv', 'ETP.csv', 'DOTUP.csv', 'GBP.csv', 'BZRX.csv', 'ENJ.csv', 'FUEL.csv', 'CELR.csv', 'LEND.csv', 'BCN.csv', 'CGLD.csv', 'HSC.csv', 'BNT.csv', 'AGI.csv', 'SWAP.csv', 'OCEAN.csv', 'RSR.csv', 'ZEN.csv', 'SFG.csv', 'XSR.csv', 'ONL.csv', 'TFUEL.csv', 'UTK.csv', 'WXT.csv', 'MBL.csv', 'QUN.csv', 'TNT.csv', 'YFII.csv', 'YEE.csv', 'NEST.csv', 'NPXS.csv']
['DAI', 'CHR', 'KIN', 'CHAT', 'DIA', 'AUC', 'DX', 'KMD', 'DMG', 'CKB', 'ETP', 'DOTUP', 'GBP', 'BZRX', 'ENJ', 'FUEL', 'CELR', 'LEND', 'BCN', 'CGLD', 'HSC', 'BNT', 'AGI', 'SWAP', 'OCEAN', 'RSR', 'ZEN', 'SFG', 'XSR', 'ONL', 'TFUEL', 'UTK', 'WXT', 'MBL', 'QUN', 'TNT', 'YFII', 'YEE', 'NEST', 'NPXS']


In [91]:
# NOTE: this rebinds data_dir, replacing the '/public_dataset/train/price_data'
# value that was used earlier to build coin_list.
data_dir='/train_price_data'   # Folder with the train CSVs that have Kevin's (<3) sentiment data appended
test_dir='/test_price_data'    # Folder with the test CSVs that have Kevin's (<3) sentiment data appended

In [74]:
# Sanity check: how many coins were discovered from the CSV file names.
len(coin_list)

40

# Generate Dataset:

In [75]:
#From the given dataset and sentiment data, make a pandas dataframe.
def make_dataset(df1, df2, coin, window=14):
  """Build one daily frame from train and test data, with lagged feature columns.

  Parameters
  ----------
  df1 : pandas.DataFrame
      Train data. Must have a 'datetime' column; every other column of df1
      gets lag features.
  df2 : pandas.DataFrame
      Test data. Must have a 'datetime' column; its dates are flagged in
      'to_pred'.
  coin : str
      Coin symbol. Not used here; kept so all pipeline steps share a signature.
  window : int, default 14
      Number of lags (1..window) created for each non-datetime column of df1.

  Returns
  -------
  pandas.DataFrame
      Columns: 'datetime', 'to_pred' (1 if the date is in df2), 'train_gap'
      (1 if the date is NOT in df1), the concatenated df1/df2 columns, and
      '<column>_lag_<i>' for every df1 column and every i in 1..window.

  Raises
  ------
  ValueError
      If window is smaller than 1.
  """
  if window < 1:
    raise ValueError(f"window must be >= 1, got {window}")

  # One row per calendar day spanning both inputs, so that shift(i) below
  # really means "i days earlier".
  start = min(df1['datetime'].min(), df2['datetime'].min())
  end = max(df1['datetime'].max(), df2['datetime'].max())
  full_df = pd.DataFrame({'datetime': pd.date_range(start=start, end=end, freq='D')})
  full_df['to_pred'] = full_df['datetime'].isin(df2['datetime']).astype(int)
  full_df['train_gap'] = 1 - full_df['datetime'].isin(df1['datetime']).astype(int)

  cols = [column for column in df1.columns if column != 'datetime']
  temp_df = (
      pd.concat([df1, df2], axis=0)
      .sort_values(by='datetime')
      .reset_index(drop=True)
  )
  df0 = pd.merge(full_df, temp_df, on='datetime', how='outer')

  # Build every lagged series first and attach them with a single concat,
  # which avoids inserting columns into the frame one at a time.
  lagged_cols = {
      f'{column}_lag_{i}': df0[column].shift(i)
      for column in cols
      for i in range(1, window + 1)
  }
  df0 = pd.concat([df0, pd.DataFrame(lagged_cols)], axis=1)
  return df0

# Train Set

In [76]:
#Train an ensemble model of the dataset
def train_set(df0, coin):
  """Fit three regressors on the lag features and return an ensemble predictor.

  Rows of df0 containing any NaN are dropped. Columns whose name contains
  'lag' are the features; every other column except 'datetime', 'train_gap'
  and 'to_pred' is a target. Features and targets are min-max scaled, then a
  LinearRegression, a RandomForestRegressor and an XGBRegressor are fitted.
  The models are blended with weights inversely proportional to their MSE on
  a 20% hold-out split, and the blended hold-out MSE is printed.

  Parameters
  ----------
  df0 : pandas.DataFrame
      Frame with 'datetime', 'train_gap', 'to_pred', target columns and lag
      columns.
  coin : str
      Coin symbol, used only in messages.

  Returns
  -------
  callable
      predict(X): takes unscaled lag features (same columns as used for
      training) and returns a 2-D array of predictions in the original target
      units, one column per target.

  Raises
  ------
  ValueError
      If no row of df0 is free of NaNs.
  """
  proc = df0.dropna(inplace=False)
  if proc.empty:
    raise ValueError(f"No complete (NaN-free) rows available to train on for {coin}.")

  train_df = proc.drop(columns=['datetime','train_gap','to_pred'])
  x_cols = [name for name in train_df.columns if 'lag' in name]
  y_cols = [name for name in train_df.columns if 'lag' not in name]
  X = train_df[x_cols]
  y = train_df[y_cols]
  # NOTE(review): the split is shuffled, so hold-out rows are interleaved in
  # time with training rows; kept as-is to preserve the existing results.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
  scaler_x = MinMaxScaler().fit(X_train)
  scaler_y = MinMaxScaler().fit(y_train)

  # Scale features and targets with the statistics of the training split only.
  X_train = scaler_x.transform(X_train)
  y_train = scaler_y.transform(y_train)
  X_test = scaler_x.transform(X_test)
  y_test = scaler_y.transform(y_test)

  models = [
      LinearRegression(),
      RandomForestRegressor(n_estimators=100, random_state=42),
      xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42),
  ]
  for model in models:
    model.fit(X_train, y_train)

  def as_2d(raw, n_rows):
    # Some regressors return a 1-D array when there is a single target;
    # normalising to (n_rows, n_targets) keeps the blending arithmetic valid.
    return np.asarray(raw, dtype=np.float64).reshape(n_rows, -1)

  # Evaluate each model on the hold-out split.
  predictions = [as_2d(model.predict(X_test), X_test.shape[0]) for model in models]
  errors = [mean_squared_error(y_test, preds) for preds in predictions]

  # Weights inversely proportional to MSE. Flooring the error at machine
  # epsilon means a model with zero error dominates the blend instead of being
  # given weight 0, and the normalisation can never divide by zero.
  eps = np.finfo(np.float64).eps
  inverse_errors = np.array([1.0 / max(error, eps) for error in errors])
  weights = inverse_errors / inverse_errors.sum()

  final_prediction = sum(weight * preds for weight, preds in zip(weights, predictions))
  ensemble_mse = mean_squared_error(y_test, final_prediction)
  print(f"Ensemble Mean Squared Error for {coin}: {ensemble_mse}")

  def predict(X):
    """Return blended predictions for unscaled lag features X, in target units."""
    # Check for NaN or infinity
    if np.any(np.isnan(X)) or np.any(np.isinf(X)):
      raise ValueError("Input X contains NaN or infinity values.")
    X = scaler_x.transform(X).astype(np.float64)

    blended = np.zeros((X.shape[0], len(y_cols)), dtype=np.float64)
    for weight, model in zip(weights, models):
      try:
        raw = model.predict(X)
      except Exception:
        # Show the offending input, then let the original error propagate.
        print(f"{type(model).__name__} failed to predict on input of shape {X.shape}:")
        print(X)
        raise
      blended += weight * as_2d(raw, X.shape[0])

    return scaler_y.inverse_transform(blended)

  return predict

# Generate Predictions

In [77]:
#Using the trained model, fill the gaps in the dataset.
def fill_gaps(df0, predict, coin):
  """Walk the frame top to bottom and fill the rows flagged by 'train_gap'.

  For every row whose 'train_gap' is non-zero:
    1. each missing '<base>_lag_<n>' value is copied from column '<base>' of
       the row n positions earlier (when that row exists);
    2. if all lag values are then present and the row still has a NaN
       somewhere, predict() is called on the lag values and its output fills
       the missing non-lag columns.
  Works on a copy; df0 is left untouched. `coin` is not used.
  """
  df_final = df0.copy()

  # Non-lag, non-bookkeeping columns, in frame order; predict() output is
  # indexed positionally against this list.
  columns_to_fill = [name for name in df0.columns if 'lag' not in name]
  for meta_col in ('datetime', 'train_gap', 'to_pred'):
    columns_to_fill.remove(meta_col)

  # The column set never changes inside the loop, so compute this once.
  lag_columns = [name for name in df_final.columns if '_lag_' in name]

  for idx, row in df_final.iterrows():
    if row['train_gap'] == 0:
      continue

    # Step 1: back-fill missing lag values from earlier (possibly predicted) rows.
    for col in lag_columns:
      if not pd.isna(row[col]):
        continue
      base_col, lag_text = col.rsplit('_lag_', 1)
      source_idx = idx - int(lag_text)
      if source_idx >= 0:
        df_final.at[idx, col] = df_final.at[source_idx, base_col]

    # Step 2: only predict when every lag is known and something is still missing.
    lag_values = df_final.loc[idx, lag_columns]
    if lag_values.isna().any():
      continue
    if not df_final.loc[idx].isna().any():
      continue

    # Treat infinities as missing; skip the row if that leaves any hole.
    features = pd.DataFrame(lag_values.values.reshape(1, -1), columns=lag_columns)
    features = features.replace([np.inf, -np.inf], np.nan)
    if features.isna().any().any():
      continue

    predicted = predict(features)

    # Step 3: write predictions into the non-lag columns that are still empty.
    for position, col in enumerate(columns_to_fill):
      if pd.isna(df_final.at[idx, col]):
        df_final.at[idx, col] = predicted[0, position]

  return df_final

# Reshape

In [78]:
#Get the dataframe to be in the format of the submission
def reshape(df_final, df2, coin):
  """Return the submission rows for one coin.

  Only rows of df_final whose 'datetime' appears in df2['datetime'] are kept
  (previously the filtered frame was overwritten, so every row was returned).
  The result has the columns 'company', 'datetime', 'close_x', with 'company'
  set to `coin`. df_final itself is not modified.

  Parameters
  ----------
  df_final : pandas.DataFrame
      Filled frame with at least 'datetime' and 'close_x' columns.
  df2 : pandas.DataFrame
      Test frame whose 'datetime' values select the rows to keep.
  coin : str
      Coin symbol written into the 'company' column.
  """
  in_test = df_final['datetime'].isin(df2['datetime'])
  # .copy() so that adding 'company' never touches the caller's frame.
  df_new = df_final.loc[in_test, ['datetime', 'close_x']].copy()
  df_new.insert(0, 'company', coin)
  return df_new

# The Loop

In [None]:
def _load_price_frame(csv_path):
  """Read one coin CSV, cast its float columns to float64 and parse 'datetime'."""
  frame = pd.read_csv(csv_path)
  float_cols = frame.select_dtypes(include=['float']).columns
  frame = frame.astype({col: 'float64' for col in float_cols})
  frame['datetime'] = pd.to_datetime(frame['datetime'])
  return frame

# Run the full pipeline (dataset -> model -> gap filling -> submission rows)
# once per coin and collect the per-coin results.
df_list = []
for coin in coin_list:
  df1 = _load_price_frame(f'{data_dir}/{coin}.csv')
  df2 = _load_price_frame(f'{test_dir}/{coin}.csv')

  df0 = make_dataset(df1,df2,coin)
  predict = train_set(df0, coin)
  df_final = fill_gaps(df0, predict, coin)
  df_new = reshape(df_final, df2, coin)
  df_list.append(df_new)

# df_list finally contains one dataframe per coin in coin_list.

# Post-processing: format the data for submission

In [85]:
# Stack all per-coin frames with a single concat call. Concatenating inside a
# loop re-copies the accumulated frame on every iteration, and seeding the
# result with an empty object-dtype frame can change the column dtypes.
if df_list:
  sub_df = pd.concat(df_list, ignore_index=True)
else:
  # pd.concat raises on an empty list; keep the expected empty schema instead.
  sub_df = pd.DataFrame(columns=['company', 'datetime', 'close_x'])

In [86]:
# Load the sample submission. Later cells look up each of its (company,
# datetime) rows in sub_df, overwrite its close_x column and write it back out.
sam = pd.read_csv("/content/sample_submission.csv")

In [87]:
# Number of rows in the sample submission.
len(sam)

6019

In [88]:
# Display the sample submission to inspect its columns.
# NOTE(review): consider sam.head() to keep the rendered output short.
sam

Unnamed: 0,company,datetime,close_x
0,AGI,2018-07-28,0.1
1,AGI,2018-07-29,0.1
2,AGI,2018-07-30,0.1
3,AGI,2018-07-31,0.1
4,AGI,2018-08-01,0.1
...,...,...,...
6014,ZEN,2020-10-08,0.1
6015,ZEN,2020-10-09,0.1
6016,ZEN,2020-10-10,0.1
6017,ZEN,2020-10-11,0.1


In [None]:
# Index the predictions once by (company, timestamp) so that each sample row
# is a dictionary lookup instead of a scan over all of sub_df. Both sides are
# normalised with pd.Timestamp because sam holds the dates as read from the
# CSV while sub_df holds parsed datetimes. setdefault keeps the first match,
# like filtered_df.values[0] did.
lookup = {}
for company, when, values in zip(sub_df['company'], sub_df['datetime'], sub_df.values):
  lookup.setdefault((company, pd.Timestamp(when)), values)

l = []
for company, when, fallback in zip(sam['company'], sam['datetime'], sam['close_x']):
  match = lookup.get((company, pd.Timestamp(when)))
  if match is not None:
    l.append(match)
  else:
    # Report the miss, but still append a placeholder row that keeps the
    # sample's own close_x, so l always has exactly one entry per row of sam
    # and the close_x assignment in the next cell cannot fail on a length
    # mismatch.
    print(f"No matching row found in sub_df for datetime: {when}, company: {company}")
    l.append(np.array([company, when, fallback], dtype=object))

In [None]:
# Each entry of l is one matched row of values; its last element is the close price.
close_prices = [matched_row[-1] for matched_row in l]
sam['close_x'] = close_prices
sam.to_csv("final_submission.csv", index=False)