# Brain Connectivity Image Super Resolution

In [1]:
# necessary imports
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
import random as r
# specifying a random seed of 1 for Python's built-in `random` module
# NOTE(review): this seeds only the stdlib `random` generator; NumPy, scikit-learn and
# LightGBM keep their own random state, so confirm this call actually influences the
# results below (the K-Fold split is seeded separately through its own random_state).
r.seed(1)

In [2]:
# keeping only the k features that score highest under the f_regression test
def select_KBest(X_train, X_test, y_train, k):
    """Reduce the train and test feature sets to the k best-scoring features.

    The selector is fitted on the training data only (features scored against
    ``y_train`` with ``f_regression``), and the same fitted selection is then
    applied to both ``X_train`` and ``X_test``.

    Parameters
    ----------
    X_train : training features used to fit the selector.
    X_test : test features, reduced to the columns chosen on the training data.
    y_train : single training target the features are scored against.
    k : number of features to keep (forwarded to ``SelectKBest``).

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)`` as returned by the selector's
        ``transform``.
    """
    # fit once on the training rows, then reuse the fitted selector for both sets
    selector = SelectKBest(score_func=f_regression, k=k).fit(X_train, y_train)
    return selector.transform(X_train), selector.transform(X_test)

In [3]:
# training one LGBM model per target column and predicting the test rows
def train_predict(X_train, X_test, y_train, y_test, k, params):
    """Fit a separate LGBMRegressor for every target column and predict X_test.

    For each column of ``y_train`` the top ``k`` features are chosen with
    ``select_KBest`` (fitted on the training rows only), a fresh
    ``LGBMRegressor(**params)`` is trained on the reduced training data, and
    predictions are made for the reduced test data.

    Parameters
    ----------
    X_train : training features.
    X_test : test features to predict for.
    y_train : pandas.DataFrame of training targets; one model is trained per column.
    y_test : not used by this function; kept in the signature so existing
        callers keep working.
    k : number of features to keep per target column.
    params : dict of keyword arguments forwarded to ``LGBMRegressor``.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``y_train`` (same order) and one row per row
        of ``X_test``. When ``X_test`` has an ``index`` attribute the result
        carries that index, so predictions stay matched to their samples when
        several folds are concatenated; otherwise a default index is used.
    """
    # predicted values keyed by target column; dict order keeps the column order
    predictions = {}

    # iterating over target columns
    for col in y_train.columns:
        target = y_train[col]

        # selecting the top k features for this particular target
        X_train_sel, X_test_sel = select_KBest(X_train, X_test, target, k)

        # fitting the LGBM model on the training data using the input parameters
        regressor = LGBMRegressor(**params)
        regressor.fit(X_train_sel, target)

        # making the predictions for the test rows
        predictions[col] = regressor.predict(X_test_sel)

    # building the frame in one go and keeping the identity of the test rows;
    # getattr falls back to a default index when X_test is a plain array
    return pd.DataFrame(predictions, index=getattr(X_test, 'index', None))


In [4]:
# doing the K-Fold CV using the training data and all the input parameters
def Do_KFold_CV(train_lr, train_hr, num_fold, k, params, output_prefix):
    """Run a shuffled K-fold CV and write the out-of-fold predictions to a CSV file.

    Each fold trains on the remaining folds via ``train_predict`` and predicts
    the held-out rows. The per-fold MSE (over all flattened target values) is
    printed, and the out-of-fold predictions of all folds are put back into the
    row order of ``train_hr`` before being flattened and written to
    ``'<output_prefix>.csv'`` with a single ``predicted`` column and an ``ID``
    index.

    Parameters
    ----------
    train_lr : pandas.DataFrame of input features, one row per sample.
    train_hr : pandas.DataFrame of targets, row-aligned with ``train_lr``.
    num_fold : number of CV folds.
    k : number of features selected per target column.
    params : dict of keyword arguments forwarded to the LGBM regressor.
    output_prefix : output file name without the ``.csv`` extension.

    Returns
    -------
    float
        Mean of the per-fold MSE values.
    """
    # specifying the split of the data into num_fold folds; the seed is written as an
    # explicit integer (the original passed `True`, which is treated as the integer 1)
    kf = KFold(n_splits=num_fold, random_state=1, shuffle=True)

    predictions_list, fold_scores = [], []

    # doing the split
    for fold, (train_index, test_index) in enumerate(kf.split(train_lr), start=1):

        # getting the train and test sets
        X_train, X_test = train_lr.iloc[train_index], train_lr.iloc[test_index]
        y_train, y_test = train_hr.iloc[train_index], train_hr.iloc[test_index]

        # training and making the predictions by calling the train_predict function
        preds = train_predict(X_train, X_test, y_train, y_test, k, params)

        # labelling every predicted row with the index of the sample it belongs to;
        # the folds are shuffled, so this is what allows restoring the original order
        preds.index = y_test.index
        predictions_list.append(preds)

        # flattening and calculating the MSE of this fold
        fold_mse = mse(y_test.to_numpy().flatten(), preds.to_numpy().flatten())
        fold_scores.append(fold_mse)
        print('Fold %s is finished with an mse of %.5f' % (fold, fold_mse))

    # stacking the folds row-wise and sorting by the sample index, so that the rows
    # follow the same order as train_hr
    predictions = pd.concat(predictions_list, axis=0).sort_index()

    # flattening the predictions row by row into a single column
    melted_predictions = predictions.to_numpy().flatten()
    predicted = pd.DataFrame({'predicted': melted_predictions})
    # naming the index as ID
    predicted.index.name = 'ID'
    # writing the predictions dataframe to a csv file in the format of Kaggle competition
    predicted.to_csv('%s.csv' % output_prefix)

    return np.mean(fold_scores)


In [5]:
# reading the input datasets (features in train_LR.csv, targets in train_HR.csv)
train_lr = pd.read_csv('../input/brain-connectivity-matrix-dataset/train_LR.csv')
train_hr = pd.read_csv('../input/brain-connectivity-matrix-dataset/train_HR.csv')

# specifying the parameters
# k_best for selecting the top k features
k_best = 2000

# K-fold CV parameter for the number of folds
num_fold = 5

# hyperparameters of the LGBM model
params = {'verbose': -1,
          'boosting_type': 'goss',
          'top_rate': 0.6,
          'other_rate': 0.4}
# note that top_rate is the retain ratio of large gradient,
# and other_rate is the retain ratio of small gradient.

# name (without extension) of the csv file the predictions are written to
output_prefix = 'predictions'

# putting them all together and running the K-Fold CV
MSE_CV = Do_KFold_CV(train_lr, train_hr, num_fold, k_best, params, output_prefix)
print(' ')
# the fold count in the message follows num_fold instead of being hard-coded;
# '\033[1m' is the ANSI escape sequence that switches the output to bold
print('\033[1m' + '%d-Fold CV finished with an mse of %.5f' % (num_fold, MSE_CV))

Fold 1 is finished with an mse of 0.02342
Fold 2 is finished with an mse of 0.02194
Fold 3 is finished with an mse of 0.02533
Fold 4 is finished with an mse of 0.02376
Fold 5 is finished with an mse of 0.02419
 
[1m5-Fold CV finished with an mse of 0.02373
