In [1]:
## Utilities ##

import rampwf as rw
import numpy as np
import pandas as pd
from typing import Dict
from typing import Tuple
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.random_projection import SparseRandomProjection
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import kneighbors_graph
from sklearn.metrics import pairwise_distances
from sklearn.utils.graph_shortest_path import graph_shortest_path
from scipy.stats import pearsonr, spearmanr
import multiprocessing as mp
import itertools
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor


def parallel(fn, params, n_jobs=mp.cpu_count()):
    """Apply ``fn`` to every item of ``params`` in a new process pool.

    Parameters
    ----------
    fn : callable
        Function handed to ``Pool.map``; called once per item.
    params : iterable
        Work items, one per call of ``fn``.
    n_jobs : int
        Number of worker processes. The default is ``mp.cpu_count()``,
        evaluated once when this function is defined.

    Returns
    -------
    The value returned by ``Pool.map(fn, params)``.

    Raises
    ------
    Whatever is raised while mapping or shutting the pool down; the pool is
    terminated and joined before the exception propagates.
    """
    workers = mp.Pool(n_jobs)
    print('started new process pool with {} processes'.format(n_jobs))
    try:
        outputs = workers.map(fn, params)
        workers.close()
        workers.join()
        return outputs
    except BaseException:
        # Intentionally broad (covers KeyboardInterrupt too): the workers are
        # torn down and the exception is re-raised, never swallowed.
        print('process pool interrupted, shutting down')
        workers.terminate()
        workers.join()
        raise

# Logger object returned by multiprocessing's get_logger(); it is not
# referenced again in the cells visible here.
mplog = mp.get_logger()

def pca(X_df, n_components=2):
    """Median-impute, standardise and PCA-project ``X_df``.

    The three steps run as a single sklearn ``Pipeline`` that is fitted and
    applied to the same data in one ``fit_transform`` call.

    Parameters
    ----------
    X_df : array-like
        Input samples, passed straight to the pipeline. Only the pipeline
        touches it, so anything the pipeline accepts works here.
    n_components : int
        Forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    The output of ``Pipeline.fit_transform(X_df)``.

    Notes
    -----
    Earlier code also built a table of squared component loadings per feature
    but discarded it before returning. That dead computation (and the
    ``X_df.columns`` access it required) has been removed.
    """
    steps = [
        ('med_imputer', Imputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
    ]
    return Pipeline(steps).fit_transform(X_df)

def rand_projection(X_df, n_components='auto', eps=0.1):
    """Median-impute, standardise and sparse-random-project ``X_df``.

    Parameters
    ----------
    X_df : array-like
        Input samples, passed to the pipeline's ``fit_transform``.
    n_components : int or 'auto'
        Forwarded to ``SparseRandomProjection``.
    eps : float
        Forwarded to ``SparseRandomProjection``.

    Returns
    -------
    The output of ``Pipeline.fit_transform(X_df)``.
    """
    steps = [
        ('med_imputer', Imputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('proj', SparseRandomProjection(n_components=n_components, eps=eps)),
    ]
    return Pipeline(steps).fit_transform(X_df)

def umap(X_df, n_components=2, y=None, n_neighbors=5, min_dist=0.1, metric='correlation'):
    """Median-impute, standardise and UMAP-embed ``X_df`` with warnings muted.

    Parameters
    ----------
    X_df : array-like
        Input samples, passed to the pipeline's ``fit_transform``.
    n_components, n_neighbors, min_dist, metric
        Forwarded unchanged to the ``UMAP`` constructor.
    y : array-like or None
        Passed as the second argument of ``fit_transform``.

    Returns
    -------
    The output of ``Pipeline.fit_transform(X_df, y)``.
    """
    import warnings

    # NOTE(review): `UMAP` is not imported in any cell visible here; it must
    # already be in scope (e.g. `from umap import UMAP`) for this to run.
    #
    # catch_warnings() restores the caller's warning filters on exit, including
    # when the pipeline raises. The previous filterwarnings('ignore') followed
    # by resetwarnings() discarded every filter that was installed before the
    # call and left 'ignore' active after an exception.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        steps = [
            ('med_imputer', Imputer(strategy='median')),
            ('scaler', StandardScaler()),
            ('umap', UMAP(n_components=n_components, n_neighbors=n_neighbors,
                          min_dist=min_dist, metric=metric)),
        ]
        return Pipeline(steps).fit_transform(X_df, y)

In [7]:
# Cross Validation Generator

class TimeSeries(object):
    """Expanding-window cross-validation splitter for ordered samples.

    The last ``n_cv`` blocks of the series are the validation region. Fold
    ``i`` trains on everything before the start of block ``i`` and tests on
    everything from there to the end of the series, so training sets grow
    and test sets shrink from fold to fold.

    Parameters
    ----------
    n_cv : int
        Number of folds produced by ``get_cv``.
    cv_block_size : float
        Fraction of the series shared out among the ``n_cv`` blocks, before
        each block is rounded down to a whole number of periods.
    period : int
        Block lengths are multiples of this value.
    unit, unit_2
        Stored on the instance; ``get_cv`` does not read them.
    """

    def __init__(self, n_cv=8, cv_block_size=0.5, period=12, unit='',
                 unit_2=None):
        self.n_cv = n_cv
        self.cv_block_size = cv_block_size
        self.period = period
        self.unit = unit
        self.unit_2 = unit_2

    def get_cv(self, X, y):
        """Yield ``(train_is, test_is)`` index arrays, one pair per fold.

        Only ``len(y)`` is used; ``X`` is accepted but never read.
        """
        n_samples = len(y)
        # Truncate each block to a whole number of periods.
        periods_per_block = int(
            n_samples * self.cv_block_size / self.n_cv / self.period)
        block_size = periods_per_block * self.period
        # Samples before the first validation block are in every training set.
        first_split = n_samples - block_size * self.n_cv
        for fold in range(self.n_cv):
            split = first_split + fold * block_size
            yield (np.arange(0, split), np.arange(split, n_samples))
            
# Module-level splitter: the run_* wrappers below read this global `cv`
# rather than taking a splitter argument. 8 folds, blocks rounded down to
# multiples of 12 samples.
cv = TimeSeries(n_cv=8, cv_block_size=0.5, period=12, unit='month', unit_2='year')

#for train_is, test_is in cv.get_cv(X_df,y_df):
#    y = X_df.iloc[train_is,:]
#    print(y)
   

In [16]:
# Homemade Forest Parameter Tuning 
def forest(X_df, y_df, cv, max_depth, max_features, n_estimators, n_components=None):
    """Cross-validated RMSE of a random forest for one hyper-parameter triple.

    For every fold from ``cv.get_cv`` a fresh pipeline (median imputer,
    standard scaler, ``RandomForestRegressor``) is fitted on the training rows
    and scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    max_depth, max_features, n_estimators
        Forwarded to ``RandomForestRegressor``.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    tuple
        ``((max_depth, max_features, n_estimators), rmse)`` where ``rmse`` is
        the square root of the summed fold MSEs divided by ``cv.n_cv``.
    """
    squared_error_total = 0
    for fit_rows, eval_rows in cv.get_cv(X_df, y_df):
        X_fit = X_df.iloc[fit_rows, :]
        X_eval = X_df.iloc[eval_rows, :]
        y_fit = y_df.iloc[fit_rows]
        y_eval = y_df.iloc[eval_rows]
        # Flatten the targets to 1-D, one entry per row.
        y_fit = y_fit.values.reshape(y_fit.shape[0])
        y_eval = y_eval.values.reshape(y_eval.shape[0])

        model = Pipeline([
            ('imputer', Imputer(strategy='median')),
            ('scale', StandardScaler()),
            ('regressor', RandomForestRegressor(max_depth=max_depth,
                                                max_features=max_features,
                                                n_estimators=n_estimators)),
        ])
        model.fit(X_fit, y_fit)
        squared_error_total += mean_squared_error(y_eval, model.predict(X_eval))

    hyperparams = (max_depth, max_features, n_estimators)
    return (hyperparams, np.sqrt(squared_error_total / cv.n_cv))

def run_forest(p):
    """Score one ``(max_depth, max_features, n_estimators)`` tuple with ``forest``.

    Single-argument adapter suitable for ``Pool.map``; the data and splitter
    come from the module-level ``X_all``, ``y_df`` and ``cv``.
    """
    depth, features, estimators = p
    return forest(X_all, y_df, cv,
                  max_depth=depth, max_features=features, n_estimators=estimators)

# Homemade Tree Parameter Tuning 
def tree(X_df, y_df, cv, max_depth, max_features, n_components=None):
    """Cross-validated RMSE of a decision tree for one hyper-parameter pair.

    For every fold from ``cv.get_cv`` a fresh pipeline (median imputer,
    standard scaler, ``DecisionTreeRegressor``) is fitted on the training rows
    and scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    max_depth, max_features
        Forwarded to ``DecisionTreeRegressor``.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    tuple
        ``((max_depth, max_features), rmse)`` where ``rmse`` is the square
        root of the summed fold MSEs divided by ``cv.n_cv``.
    """
    squared_error_total = 0
    for fit_rows, eval_rows in cv.get_cv(X_df, y_df):
        model = Pipeline([
            ('imputer', Imputer(strategy='median')),
            ('scale', StandardScaler()),
            ('regressor', DecisionTreeRegressor(max_depth=max_depth,
                                                max_features=max_features)),
        ])

        X_fit = X_df.iloc[fit_rows, :]
        X_eval = X_df.iloc[eval_rows, :]
        y_fit = y_df.iloc[fit_rows]
        y_eval = y_df.iloc[eval_rows]
        # Flatten the targets to 1-D, one entry per row.
        y_fit = y_fit.values.reshape(y_fit.shape[0])
        y_eval = y_eval.values.reshape(y_eval.shape[0])

        model.fit(X_fit, y_fit)
        fold_predictions = model.predict(X_eval)
        squared_error_total += mean_squared_error(y_eval, fold_predictions)

    return ((max_depth, max_features), np.sqrt(squared_error_total / cv.n_cv))

def run_tree(p):
    """Score one ``(max_depth, max_features)`` pair with ``tree``.

    Single-argument adapter suitable for ``Pool.map``; the data and splitter
    come from the module-level ``X_all``, ``y_df`` and ``cv``.
    """
    depth, features = p
    return tree(X_all, y_df, cv, max_depth=depth, max_features=features)

# Homemade SVR Parameter Tuning 
def svr(X_df, y_df, cv, C, gamma, n_components=None):
    """Cross-validated RMSE of an RBF-kernel SVR for one ``(C, gamma)`` pair.

    For every fold from ``cv.get_cv`` a fresh pipeline (median imputer,
    standard scaler, ``SVR(kernel='rbf')``) is fitted on the training rows and
    scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    C, gamma
        Forwarded to ``SVR``.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    tuple
        ``((C, gamma), rmse)`` where ``rmse`` is the square root of the summed
        fold MSEs divided by ``cv.n_cv``.
    """
    squared_error_total = 0
    for fit_rows, eval_rows in cv.get_cv(X_df, y_df):
        model = Pipeline([
            ('imputer', Imputer(strategy='median')),
            ('scale', StandardScaler()),
            ('regressor', SVR(kernel='rbf', C=C, gamma=gamma)),
        ])

        X_fit, y_fit = X_df.iloc[fit_rows, :], y_df.iloc[fit_rows]
        X_eval, y_eval = X_df.iloc[eval_rows, :], y_df.iloc[eval_rows]
        # Flatten the targets to 1-D, one entry per row.
        y_fit = y_fit.values.reshape(y_fit.shape[0])
        y_eval = y_eval.values.reshape(y_eval.shape[0])

        model.fit(X_fit, y_fit)
        squared_error_total += mean_squared_error(y_eval, model.predict(X_eval))

    return ((C, gamma), np.sqrt(squared_error_total / cv.n_cv))

def run_svr(p):
    """Score one ``(C, gamma)`` pair with ``svr``.

    Single-argument adapter suitable for ``Pool.map``; the data and splitter
    come from the module-level ``X_all``, ``y_df`` and ``cv``.
    """
    penalty, kernel_width = p
    return svr(X_all, y_df, cv, C=penalty, gamma=kernel_width)

# Homemade Boosted Forest Parameter Tuning 
def boost(X_df, y_df, cv, max_depth, max_features, n_estimators, n_components=None):
    """Cross-validated RMSE of gradient boosting for one hyper-parameter triple.

    For every fold from ``cv.get_cv`` a fresh pipeline (median imputer,
    standard scaler, ``GradientBoostingRegressor``) is fitted on the training
    rows and scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    max_depth, max_features, n_estimators
        Forwarded to ``GradientBoostingRegressor``.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    tuple
        ``((max_depth, max_features, n_estimators), rmse)`` where ``rmse`` is
        the square root of the summed fold MSEs divided by ``cv.n_cv``.
    """
    mse_sum = 0
    # Folds come from the splitter that was passed in; the earlier bare
    # `get_cv(...)` call referred to a name that does not exist.
    for train_index, test_index in cv.get_cv(X_df, y_df):
        boost_regressor = Pipeline([
            ('imputer', Imputer(strategy='median')),
            ('scale', StandardScaler()),
            ('regressor', GradientBoostingRegressor(max_depth=max_depth,
                                                    max_features=max_features,
                                                    n_estimators=n_estimators)),
        ])

        X_train, X_test = X_df.iloc[train_index, :], X_df.iloc[test_index, :]
        y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
        # Flatten the targets to 1-D, one entry per row.
        y_train = y_train.values.reshape(y_train.shape[0])
        y_test = y_test.values.reshape(y_test.shape[0])

        # Fit and score the boosting pipeline built above (previously this
        # used `forest_regressor`, which is not defined in this function).
        boost_regressor.fit(X_train, y_train)
        y_pred = boost_regressor.predict(X_test)
        mse_sum += mean_squared_error(y_test, y_pred)

    return ((max_depth, max_features, n_estimators), np.sqrt(mse_sum / cv.n_cv))

def run_boost(p):
    """Score one ``(max_depth, max_features, n_estimators)`` tuple with ``boost``.

    Single-argument adapter suitable for ``Pool.map``; the data and splitter
    come from the module-level ``X_all``, ``y_df`` and ``cv``.
    """
    depth, features, estimators = p
    return boost(X_all, y_df, cv,
                 max_depth=depth, max_features=features, n_estimators=estimators)

# Homemade Lasso Parameter Tuning 
def lasso(X_df, y_df, cv, alpha, n_components=None):
    """Cross-validated RMSE of Lasso regression for each value in ``alpha``.

    For every alpha and every fold from ``cv.get_cv`` a fresh pipeline (median
    imputer, standard scaler, ``Lasso``) is fitted on the training rows and
    scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    alpha : iterable
        Regularisation strengths to evaluate, one at a time.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    dict
        Maps each alpha to the square root of its summed fold MSEs divided by
        ``cv.n_cv``.
    """
    scores = {}
    for penalty in alpha:
        squared_error_total = 0
        for fit_rows, eval_rows in cv.get_cv(X_df, y_df):
            model = Pipeline([
                ('imputer', Imputer(strategy='median')),
                ('scale', StandardScaler()),
                ('regressor', Lasso(alpha=penalty)),
            ])

            X_fit, y_fit = X_df.iloc[fit_rows, :], y_df.iloc[fit_rows]
            X_eval, y_eval = X_df.iloc[eval_rows, :], y_df.iloc[eval_rows]
            # Flatten the targets to 1-D, one entry per row.
            y_fit = y_fit.values.reshape(y_fit.shape[0])
            y_eval = y_eval.values.reshape(y_eval.shape[0])

            model.fit(X_fit, y_fit)
            squared_error_total += mean_squared_error(y_eval, model.predict(X_eval))

        scores[penalty] = np.sqrt(squared_error_total / cv.n_cv)
    return scores

# Homemade Ridge Parameter Tuning 
def ridge(X_df, y_df, cv, alpha, n_components=None):
    """Cross-validated RMSE of Ridge regression for each value in ``alpha``.

    For every alpha and every fold from ``cv.get_cv`` a fresh pipeline (median
    imputer, standard scaler, ``Ridge``) is fitted on the training rows and
    scored with mean squared error on the test rows.

    Parameters
    ----------
    X_df, y_df : pandas objects indexed positionally with ``.iloc``
        Features and targets; the target slice is flattened to one value per
        row before fitting.
    cv : object with ``get_cv(X, y)`` and ``n_cv``
        Fold generator.
    alpha : iterable
        Regularisation strengths to evaluate, one at a time.
    n_components
        Accepted but unused (the PCA step it would configure is disabled).

    Returns
    -------
    dict
        Maps each alpha to the square root of its summed fold MSEs divided by
        ``cv.n_cv``.
    """
    ridge_dict = {}
    for a in alpha:
        mse_sum = 0
        # Folds come from the splitter that was passed in; the earlier bare
        # `get_cv(...)` call referred to a name that does not exist.
        for train_index, test_index in cv.get_cv(X_df, y_df):
            ridge_regressor = Pipeline([
                ('imputer', Imputer(strategy='median')),
                ('scale', StandardScaler()),
                ('regressor', Ridge(alpha=a)),
            ])

            X_train, X_test = X_df.iloc[train_index, :], X_df.iloc[test_index, :]
            y_train, y_test = y_df.iloc[train_index], y_df.iloc[test_index]
            # Flatten the targets to 1-D, one entry per row.
            y_train = y_train.values.reshape(y_train.shape[0])
            y_test = y_test.values.reshape(y_test.shape[0])

            ridge_regressor.fit(X_train, y_train)
            y_pred = ridge_regressor.predict(X_test)
            mse_sum += mean_squared_error(y_test, y_pred)

        ridge_dict[a] = np.sqrt(mse_sum / cv.n_cv)
    return ridge_dict


In [31]:
# Import Data
# NOTE(review): both paths are absolute and specific to one user's machine;
# the recorded run of this cell ended in FileNotFoundError. A later cell in
# this notebook rebinds X_all and y_df from files under data/ instead.
X_all = np.load('/Users/mchifala/csci-5622-project/sea_ice/sea_ice_X.npy')
y_df = pd.DataFrame(np.load('/Users/mchifala/csci-5622-project/sea_ice/sea_ice_y.npy'))
# Flatten to one row per sample. The shape is hard-coded (1500 rows,
# 8*39*58 columns), so the reshape raises for an array with any other number
# of elements.
X_all = pd.DataFrame(np.reshape(X_all, (1500, (8*39*58))))


FileNotFoundError: [Errno 2] No such file or directory: '/Users/mchifala/csci-5622-project/sea_ice/sea_ice_X.npy'

In [9]:
%matplotlib inline
#from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr

# Load the feature dataset and the target array from the local data/ folder.
X_ds = xr.open_dataset('data/train.nc')
y_array = np.load('data/train.npy')

# Turn the dataset into one ndarray and exchange its first two axes, then
# collapse axes 1-3 so that every entry along axis 0 becomes one flat row.
X_arr = np.swapaxes(X_ds.to_array().values, 0, 1)
X_all = pd.DataFrame(
    X_arr.reshape((X_arr.shape[0],
                   X_arr.shape[1] * X_arr.shape[2] * X_arr.shape[3])))
y_df = pd.DataFrame(y_array)

In [23]:
# Results of every tuning run, keyed by regressor name.
reg_dict = {}

# Hyper-parameter grids.
max_depth = [5,10,20,25]
max_depth_tree = [5,10,15,20,25,50,100]
max_features = ['sqrt', 'log2']
n_estimators = [10,25,50,100,200]
alpha_lasso = [.00001, .0001, .001, .01, .1, 1]
alpha_ridge = [1, 2, 5, 10, 25, 50, 100, 200, 500, 1000]
C = np.logspace(-2, 2, num=5) 
gamma = np.logspace(-4, 2, num=7)

params_forest_boost = list(itertools.product(max_depth, max_features, n_estimators))
# The single-tree grid uses its own depth list; `max_depth_tree` was defined
# for this purpose but the forest/boost `max_depth` list was used instead.
params_tree = list(itertools.product(max_depth_tree, max_features))
params_svr = list(itertools.product(C, gamma))


def _checkpoint(name, scores):
    """Record ``scores`` under ``name`` in ``reg_dict`` and rewrite the results file.

    ``reg_dict`` is a plain dict, so ``np.save`` stores it as a pickled object
    array; reading it back needs ``np.load(..., allow_pickle=True).item()``.
    """
    reg_dict[name] = scores
    np.save('sea_ice_all_tuning.npy', reg_dict)
    print("Saved!")


# Grid searches that fan out over a process pool. Each run_* wrapper returns
# a (params, rmse) pair, so dict(...) maps parameter tuples to RMSE.
results_forest = parallel(run_forest, params_forest_boost)
_checkpoint('forest_regressor', dict(results_forest))

results_tree = parallel(run_tree, params_tree)
_checkpoint('tree_regressor', dict(results_tree))

results_boost = parallel(run_boost, params_forest_boost)
_checkpoint('boost_regressor', dict(results_boost))

results_svr = parallel(run_svr, params_svr)
_checkpoint('svr_regressor', dict(results_svr))

# ridge() and lasso() take (X_df, y_df, cv, alpha); the splitter was missing
# from these calls, which shifted the alpha list into the `cv` slot.
_checkpoint('ridge_regressor', ridge(X_all, y_df, cv, alpha_ridge))

_checkpoint('lasso_regressor', lasso(X_all, y_df, cv, alpha_lasso))

TypeError: only integer scalar arrays can be converted to a scalar index