In [6]:
import os
import os.path as osp
def find_root_folder(starting_path = os.getcwd()):
    from os.path import isfile, join, dirname
    path = starting_path
    i = 0
    found = False
    while i < 100:
        found = ('LICENSE' in [f for f in os.listdir(path) if isfile(join(path, f))])
        if found:
            break
        else:
            i += 1
            print(path)
            path = dirname(path)            
    if found:
        print(f"Found directory with license at {path}")
    else:
        raise FileNotFoundError("Could not find LICENSE file in ancestral directory")
    return path
ROOT_FOLDER = find_root_folder()
DS_FOLDER = osp.join(ROOT_FOLDER,'kddbr-2022')

oct23_dir = osp.join(ROOT_FOLDER,'predictions','Oct23_lgb')
oct23_zip = osp.join(ROOT_FOLDER,'predictions','Oct23_lgb.zip')

if osp.isdir(oct23_dir):
    print("Oct23_lgb prediction folder found. All good!")
else:
    raise ValueError(f"Please unzip {oct23_zip} before running this script")
# Arguments. Some are unused
DEVICE = 'cuda'
FOLD_SPLITS = 10
WHICH_VAL_FOLD = 5
RANDOM_STATE = 123
DEBUG = False
BATCH_SIZE = 128
NUM_EPOCHS = 200
VAL_EPOCH = 5



Found directory with license at /home/klaus/eclipse_draft/KDD2022_github
Oct23_lgb prediction folder found. All good!


In [2]:
import os

import cv2 as cv
import torch
import pandas as pd
import os.path as osp
import numpy as np
import torchvision
from torchvision.transforms import Normalize
from torch import tensor
import matplotlib.pyplot as plt
from tqdm import tqdm, trange

"""
Dataframe setup
"""
import sklearn
from sklearn.model_selection import KFold
def setup_dataframes():
    df = pd.read_csv(osp.join(DS_FOLDER, 'public.csv'))
    df_test, df_train = df[pd.isna(df.East)].reset_index(drop=True), df[~pd.isna(df.East)].reset_index(drop=True)
    df_trainval = df_train.copy()
    del df
    # Use K-Fold to separate between training and test
    kf = KFold(n_splits=FOLD_SPLITS,shuffle=True,random_state=RANDOM_STATE)
    N_trainval = df_train.shape[0]
    for i,(train_idx, val_idx) in enumerate(kf.split(np.arange(N_trainval)[:,None])):
        if i == WHICH_VAL_FOLD:
            df_val = df_train.loc[val_idx,:].reset_index(drop=True)
            df_train = df_train.loc[train_idx,:].reset_index(drop=True)

    return {'train': df_train, 'val':df_val, 'test':df_test, 'trainval':df_trainval}


#  Get Dataframe dictionary
dfs = setup_dataframes()


In [3]:
display(dfs['test'])

Unnamed: 0,Filename,Altitude,Delta,North,East
0,000053b1e684c9e7ea73727b2238ce18.jpg,167.943069,0.010269,,
1,00029153d12ae1c9abe59c17ff2e0895.jpg,195.853088,0.089218,,
2,0006246bee639c7a7b11a08e34dd3cc6.jpg,146.943466,-0.018326,,
3,00063cb5da1826febf178b669eea3250.jpg,213.184418,-0.108704,,
4,00063ece2e68a8847f228e8fd922f851.jpg,184.757767,0.017700,,
...,...,...,...,...,...
55026,fff87fc38496c1838a216c742e653065.jpg,191.309677,-0.509415,,
55027,fff98e721ed96517d940eb5b2daf2d18.jpg,171.288254,0.109970,,
55028,fff9e3be29d11366c43df7b2ae749547.jpg,196.589905,-0.118561,,
55029,fffae68750a8bd5e6ba46b25ce7030de.jpg,176.795364,0.023209,,


In [25]:
#!pip install lightgbm 
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import trange
def rmse(x,y):
    return np.sqrt(np.mean(np.square(x-y)))


def get_flight_df(mode):
    if mode == 'train' or mode == 'trainval':
        return pd.read_csv(osp.join(ROOT_FOLDER,'flight','flight_train.csv'))
    elif mode == 'test':
        return pd.read_csv(osp.join(ROOT_FOLDER,'flight','flight_test.csv'))
    else:
        raise ValueError('Invalid value for parameter "mode"')

def get_feature_df(mode):
    sub_names = ['PDCNet']
    cnames = []
    for sub in sub_names:
        cols = list(pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')).columns)
        cnames += [sub+ '_' + x for x in cols]
    feat_df = pd.concat([pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')) for sub in sub_names],1)
    feat_df.columns = cnames
    return feat_df

def get_lgb_dataframe(mode):
        feat_df = get_feature_df(mode)
        feat_df['Filename'] = dfs[mode]['Filename']

        if mode == 'trainval':
            flight_df = get_flight_df('train')
        else:
            flight_df = get_flight_df('test')
        flight_df['step_is_neg'] = np.array((flight_df['step'].values == -1)).astype(np.float32)

        # Merge
        df = flight_df.copy()
        df = df.merge(feat_df, on='Filename').reset_index(drop=True)

        feat_df = feat_df.drop('Filename',axis=1)

        df_Y = df.loc[:,['North','East']].copy()
        df_X = df.loc[:,['Altitude','Delta', 'diff','step_is_neg'] + list(feat_df.columns) ]
        def shift(df, n):
            #assert n>= 0
            df = df.shift(-n)
            #display(df)
            df.columns = [f'shift_{n}_{c}' for c in df.columns]
            return df
        df_X = pd.concat([shift(df_X,i) for i in [-1,-2, 0, 1, 2]], axis=1)
        return df, df_X, df_Y



def verify_lgb(target_var):
    
    

    train_df, train_df_X, train_df_Y = get_lgb_dataframe('trainval')
    
    test_df, test_df_X, test_df_Y = get_lgb_dataframe('test')

    N_val = train_df_X.shape[0] // 10
    val_idx = np.arange(train_df_X.shape[0])[:N_val]#np.where(np.arange(feat_df.shape[0]) < N_val)[0]
    train_idx = np.arange(train_df_X.shape[0])[N_val:] #= np.where(np.arange(feat_df.shape[0]) >= N_val)[0]

    X_trainval, Y_trainval = train_df_X.copy(), train_df_Y[target_var].copy()

    X_train, Y_train = train_df_X.iloc[train_idx,:].copy(), train_df_Y.iloc[train_idx,:][target_var].copy()
    
    
    """ Create validation, test matrices"""
    X_val, Y_val = train_df_X.iloc[val_idx,:].copy(), train_df_Y.iloc[val_idx,:][target_var].copy()
    X_test, Y_test = test_df_X, test_df_Y[target_var].copy()

    
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_eval = lgb.Dataset(X_val, Y_val, reference=lgb_train)
    lgb_test = lgb.Dataset(X_test, Y_test, reference=lgb_train)


    # specify your configurations as a dict
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': {'rmse'},
        'num_leaves': 128,
        'metric_freq': 10,
        'learning_rate': 0.005,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 5,
        'verbose': 1
    }

    
    # train
    gbm_filename = osp.join(ROOT_FOLDER,'predictions','Oct23_lgb',f'gbm_{target_var.lower()}.txt')
    submitted_filename = osp.join(ROOT_FOLDER,'predictions','Oct23_lgb',f'submitted_to_kaggle.csv')
    print(f'Loading saved model from {gbm_filename}...')
    gbm = lgb.Booster(model_file=gbm_filename)
    
    print("Using loaded model to predict test set...")
    test_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    test_pred = test_pred[np.argsort(test_df['Filename'])] 
    print("Using loaded model to predict test set... done!")
    
    
    submitted_df = pd.read_csv(submitted_filename)
    if target_var.lower() == 'north':
        submitted_df = submitted_df.iloc[::2,:]
    else:
        submitted_df = submitted_df.iloc[1::2,:]
    print(f"Loaded CSV submitted to kaggle (found at f{submitted_filename})")
    print(f"Selected submission rows related to {target_var} predictions")
        
        
    comparison_df = pd.DataFrame({'Id':  submitted_df['Id'].values,
                                  f'loaded_model_prediction_{target_var}':test_pred,
                                  f'submitted_prediction_{target_var}': submitted_df['Predicted'].values})
    
    vA = comparison_df[f'loaded_model_prediction_{target_var}'].values
    vB = comparison_df[f'submitted_prediction_{target_var}'].values
    print("Mean squared error between \n A) The predictions made by the loaded model" +
         f" and \n B) The predictions for the {target_var} variable on the submitted CSV \n is " +
         f"{np.square(vA-vB).mean()}")
    display(comparison_df)
    #trainval_pred = trainval_pred[np.argsort(train_df['Filename'])] 
    

# Train Lightgbm model for North and East predictions separately.
verify_lgb('North')
verify_lgb('East')


  feat_df = pd.concat([pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')) for sub in sub_names],1)
  feat_df = pd.concat([pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')) for sub in sub_names],1)


Loading saved model from /home/klaus/eclipse_draft/KDD2022_github/predictions/Oct23_lgb/gbm_north.txt...
Using loaded model to predict test set...
Using loaded model to predict test set... done!
Loaded CSV submitted to kaggle (found at f/home/klaus/eclipse_draft/KDD2022_github/predictions/Oct23_lgb/submitted_to_kaggle.csv)
Selected submission rows related to North predictions
Mean squared error between 
 A) The predictions made by the loaded model and 
 B) The predictions for the North variable on the submitted CSV 
 is 8.755173069927131e-33


Unnamed: 0,Id,loaded_model_prediction_North,submitted_prediction_North
0,000053b1e684c9e7ea73727b2238ce18.jpg:North,0.010881,0.010881
1,00029153d12ae1c9abe59c17ff2e0895.jpg:North,1.024498,1.024498
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:North,-1.921857,-1.921857
3,00063cb5da1826febf178b669eea3250.jpg:North,-1.587566,-1.587566
4,00063ece2e68a8847f228e8fd922f851.jpg:North,0.242386,0.242386
...,...,...,...
55026,fff87fc38496c1838a216c742e653065.jpg:North,1.455134,1.455134
55027,fff98e721ed96517d940eb5b2daf2d18.jpg:North,-0.005934,-0.005934
55028,fff9e3be29d11366c43df7b2ae749547.jpg:North,1.123954,1.123954
55029,fffae68750a8bd5e6ba46b25ce7030de.jpg:North,-0.731201,-0.731201


  feat_df = pd.concat([pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')) for sub in sub_names],1)
  feat_df = pd.concat([pd.read_csv(osp.join(ROOT_FOLDER, 'features',sub, f'{mode}.csv')) for sub in sub_names],1)


Loading saved model from /home/klaus/eclipse_draft/KDD2022_github/predictions/Oct23_lgb/gbm_east.txt...
Using loaded model to predict test set...
Using loaded model to predict test set... done!
Loaded CSV submitted to kaggle (found at f/home/klaus/eclipse_draft/KDD2022_github/predictions/Oct23_lgb/submitted_to_kaggle.csv)
Selected submission rows related to East predictions
Mean squared error between 
 A) The predictions made by the loaded model and 
 B) The predictions for the East variable on the submitted CSV 
 is 8.284685008158348e-33


Unnamed: 0,Id,loaded_model_prediction_East,submitted_prediction_East
0,000053b1e684c9e7ea73727b2238ce18.jpg:East,-0.151246,-0.151246
1,00029153d12ae1c9abe59c17ff2e0895.jpg:East,-0.809308,-0.809308
2,0006246bee639c7a7b11a08e34dd3cc6.jpg:East,0.170217,0.170217
3,00063cb5da1826febf178b669eea3250.jpg:East,0.972320,0.972320
4,00063ece2e68a8847f228e8fd922f851.jpg:East,-1.351789,-1.351789
...,...,...,...
55026,fff87fc38496c1838a216c742e653065.jpg:East,1.368100,1.368100
55027,fff98e721ed96517d940eb5b2daf2d18.jpg:East,1.989513,1.989513
55028,fff9e3be29d11366c43df7b2ae749547.jpg:East,1.100941,1.100941
55029,fffae68750a8bd5e6ba46b25ce7030de.jpg:East,-1.673310,-1.673310
