In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale

import math
from scipy import linalg

import ite
from ite.cost.x_kernel import Kernel

In [5]:
# Import data
def load_data(x_path="X.csv", y_path="ytr.csv", split=98000):
    """Load the feature table and the training labels, then split into train/test.

    Parameters
    ----------
    x_path : str
        CSV whose 13 columns are, in order, 7 reflectance features,
        5 solar features and an id column (the header row is replaced).
    y_path : str
        CSV with an ``Id`` column and a ``y`` column.
    split : int
        Number of leading rows that form the training set.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Columns are ``id``, the 12 features, then ``y``. ``y`` is only filled
        for the first ``split`` rows, so it is NaN everywhere in ``test``.
        Both frames are independent copies, so callers can assign to their
        columns without writing into a slice of a shared parent frame.
    """
    full_data = pd.read_csv(x_path)
    train_y = pd.read_csv(y_path)

    # Rename columns to something more interpretable
    feature_cols = (["reflectance_" + str(i) for i in range(7)]
                    + ["solar_" + str(i) for i in range(5)])
    full_data.columns = feature_cols + ["id"]

    # Move ID column to the beginning (reindex builds a new frame; no inplace edits)
    full_data = full_data.reindex(columns=["id"] + feature_cols)

    # Add y to the data frame: look each row's id up in the label table.
    # .loc slicing is label based and inclusive, hence split - 1.
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)

    train = full_data.iloc[:split].copy()
    test = full_data.iloc[split:].copy()
    return (train, test)

# Legacy loading path, kept for reference only: it read a pre-assembled
# MODIS.csv instead of calling load_data(). None of it is executed.
#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
 #          + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
#train = full_data.copy()

#train_copy, test_copy = load_data()

# Build the train/test frames that the cells below operate on.
train, test = load_data()
#full_data = load_data()

# Parameters
# NOTE(review): neither value is referenced in the cells visible here.
# Presumably n_threads = -1 means "use all cores" and random_seed seeds a
# model defined elsewhere; confirm or remove.
n_threads = -1
random_seed = 8888

### Data generation for MATLAB

In [3]:
# `full_data` only exists as a local inside load_data(), so on a fresh kernel
# this cell used to fail with NameError. Rebuild the full table from the two
# splits (train rows first, then test rows) before filling missing values.
# Note that fillna(0) also zero-fills the `y` column of the test rows.
full_data = pd.concat([train, test], axis=0).fillna(0)

In [4]:
# Feature columns are everything except the identifier and the target.
cols_excl = ["id", "y"]
cols_orig = full_data.columns[~full_data.columns.isin(cols_excl)].tolist()

# Standardise data for LR
features_scaled = scale(full_data[cols_orig])
full_data[cols_orig] = features_scaled

# Write the standardised table with neither an index column nor a header row.
full_data.to_csv("aerosol_full.csv", header=False, index=False, encoding="utf-8")

### Preprocessing

In [6]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Work on private copies so the assignments below never write into a slice of
# another frame (avoids SettingWithCopyWarning and silent no-op writes).
train = train.copy()
test = test.copy()

# Standardise data for LR.
# Both splits are scaled with the statistics of the training split, so a
# given raw feature value maps to the same standardised value in train and
# test. Scaling each split by its own mean/std made them incomparable.
feature_mean = train[cols_orig].mean()
# ddof=0 is the population standard deviation; constant columns are divided
# by 1 instead of 0 so they stay finite.
feature_std = train[cols_orig].std(ddof=0).replace(0, 1.0)
train[cols_orig] = (train[cols_orig] - feature_mean) / feature_std
test[cols_orig] = (test[cols_orig] - feature_mean) / feature_std

### Python mean embedding

In [None]:
def mean_embedding(X1, X2, kernel):
    """Inner product of the empirical kernel mean embeddings of two bags.

    This is the average of k(x, x') over every pair with x taken from X1 and
    x' taken from X2.

    Parameters
    ----------
    X1, X2 : array-like
        The two bags, passed unchanged to ``Kernel.gram_matrix2``. Callers in
        this notebook pass 2-D arrays with one row per instance.
    kernel : dict
        Kernel specification handed to ``Kernel``, e.g.
        ``{'name': 'invmquadr', 'c': theta}``.

    Returns
    -------
    float
        Mean of the cross Gram matrix. NaN when the Gram matrix is empty.
    """
    k = Kernel(kernel)
    gram_mat = k.gram_matrix2(X1, X2)
    # Normalise by the number of pairs, N1 * N2. The previous division by
    # shape[0]**2 was only correct when both bags had the same size.
    mu_X1_X2 = gram_mat.mean()
    return (mu_X1_X2)

# --- Settings ---------------------------------------------------------------
nb_bag = 200#train["id"].nunique()
theta = 10**(-8)
l2_reg = 10**(-8)
kernel_spec = {'name': 'invmquadr','c': theta}
# NOTE(review): training bags are addressed as ids 1..nb_bag below and
# load_data() puts the first 98000 rows in `train`; confirm that id 980
# really belongs to the test split and not to train.
test_id_list = np.arange(980, 1100, dtype=int)

# --- Training Gram matrix ---------------------------------------------------
# Pull each bag's feature matrix out of the frame once, instead of repeating
# the boolean-mask lookup nb_bag**2 times inside the double loop.
train_bag_ids = np.arange(1, nb_bag + 1)
train_bags = [train.loc[train["id"] == bag_id, cols_orig].values
              for bag_id in train_bag_ids]

K_matrix = np.zeros((nb_bag, nb_bag))
for i in range(nb_bag):
    for j in range(nb_bag):
        # Compute mean embedding
        K_matrix[i, j] = mean_embedding(train_bags[i], train_bags[j], kernel_spec)

# One label per bag, aligned with train_bag_ids. The previous
# train["y"].unique()[:200] dropped repeated label values, which shifts every
# later label whenever two bags share the same y.
y_train = (train.groupby("id")["y"].median()
           .reindex(train_bag_ids).values.reshape((-1, 1)))

# --- Ridge regression -------------------------------------------------------
# The system does not depend on the test bag, so solve it once for the dual
# coefficients instead of forming an explicit inverse on every iteration.
ridge_mat = K_matrix + (l2_reg * nb_bag) * np.identity(nb_bag)
ridge_coef = np.linalg.solve(ridge_mat, y_train)

# --- Test kernel rows and predictions ---------------------------------------
K_test = np.zeros((len(test_id_list), nb_bag))
for row, test_id in enumerate(test_id_list):
    X_test_bag = test.loc[test["id"] == test_id, cols_orig].values
    for j in range(nb_bag):
        K_test[row, j] = mean_embedding(train_bags[j], X_test_bag, kernel_spec)

y_hat = K_test.dot(ridge_coef).ravel()

# NOTE(review): load_data() only fills `y` for the training rows, so these
# test labels are NaN with the current loader and the score below cannot be
# computed; confirm where held-out labels should come from.
y_test = test.groupby("id")["y"].median().reindex(test_id_list).values
np.sqrt(mean_squared_error(y_test, y_hat))

### Ridge regression

In [85]:
# NOTE(review): G is first assigned in the "Import Gram matrices" cell further
# down, so this display cell raises NameError on Restart & Run All. Move it
# below that cell (and show only a small slice) or remove it.
G

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1354,1355,1356,1357,1358,1359,1360,1361,1362,1363
0,0.99115,0.95533,0.96953,0.97313,0.90892,0.88619,0.90212,0.97110,0.90806,0.87803,...,0.96049,0.94496,0.95995,0.94675,0.94940,0.92285,0.84110,0.92942,0.96949,0.83544
1,0.95533,0.99469,0.96207,0.94358,0.95819,0.88985,0.91499,0.93228,0.95982,0.88996,...,0.94393,0.95251,0.94306,0.94819,0.88346,0.94540,0.90334,0.91338,0.98574,0.83059
2,0.96953,0.96207,0.99615,0.96699,0.94579,0.90377,0.92736,0.94237,0.92925,0.89890,...,0.92861,0.94613,0.98415,0.93898,0.89434,0.92872,0.89344,0.93631,0.95608,0.84378
3,0.97313,0.94358,0.96699,0.97993,0.89163,0.85203,0.87352,0.97583,0.88128,0.84208,...,0.94533,0.93588,0.95025,0.94480,0.94427,0.89111,0.82675,0.89738,0.95650,0.80943
4,0.90892,0.95819,0.94579,0.89163,0.99568,0.93604,0.94812,0.85539,0.97786,0.94144,...,0.90460,0.94495,0.95163,0.92092,0.81505,0.96807,0.97096,0.94915,0.93908,0.89581
5,0.88619,0.88985,0.90377,0.85203,0.93604,0.96013,0.93965,0.82410,0.92527,0.95463,...,0.87441,0.88672,0.92927,0.85739,0.80811,0.95090,0.90706,0.95655,0.88756,0.89462
6,0.90212,0.91499,0.92736,0.87352,0.94812,0.93965,0.96502,0.83803,0.95009,0.96303,...,0.86562,0.89411,0.92878,0.86929,0.80195,0.93751,0.93102,0.93285,0.89720,0.89227
7,0.97110,0.93228,0.94237,0.97583,0.85539,0.82410,0.83803,0.99302,0.85219,0.80690,...,0.95036,0.91635,0.92264,0.93231,0.97069,0.87046,0.77955,0.87065,0.95675,0.76951
8,0.90806,0.95982,0.92925,0.88128,0.97786,0.92527,0.95009,0.85219,0.98305,0.94247,...,0.89569,0.92653,0.92341,0.90504,0.80999,0.95952,0.95186,0.92631,0.93783,0.87656
9,0.87803,0.88996,0.89890,0.84208,0.94144,0.95463,0.96303,0.80690,0.94247,0.97662,...,0.85270,0.87840,0.91320,0.84805,0.78260,0.94249,0.92922,0.94007,0.87638,0.91068


In [155]:
# Import Gram matrices
# Theta=16
G = pd.read_csv("Cauchy_16_prec.csv", header=None)
nb_bag_train = 980

# Number of train bags after setting aside a validation set (4/5 of the bags).
# Floor division keeps this an int under Python 3; with `/` it became 784.0,
# which np.identity() and array slicing reject.
L_train = nb_bag_train * 4 // 5
# Label of the last training row/column (.loc slices include the end label).
idx_train = L_train - 1
# First column label past the nb_bag_train training bags.
idx_test = nb_bag_train
G_train = G.loc[:idx_train, :idx_train]
l2_reg = 2**(-23)

In [156]:
# Regularised Gram matrix K + (lambda * n) * I. The size is read from G_train
# itself: np.identity() requires an int, and L_train is a float when it was
# computed with `/` under Python 3.
n_train = G_train.shape[0]
ridge_mat = G_train.values + (l2_reg * n_train) * np.identity(n_train)

In [157]:
# Kernel values between the training bags (rows up to idx_train, inclusive)
# and every bag from column idx_test onward.
# NOTE(review): G_test is not referenced by any of the cells visible below.
G_test = G.loc[:idx_train, idx_test:].values

In [158]:
# One label per bag (median of y within each id), restricted to the first
# L_train bags and laid out as a row vector. int() guards the slice: L_train
# is a float when it was computed with `/` under Python 3, and float slice
# bounds raise TypeError.
y_train = train.groupby("id")["y"].median().values[:int(L_train)].reshape((1, -1))

In [159]:
# Sanity check: y_train should be a single row with one entry per training bag.
y_train.shape

(1, 784)

In [164]:
# Inspection only: the explicit inverse is displayed but not assigned, and the
# prediction cell uses np.linalg.solve instead.
linalg.inv(ridge_mat)

array([[  8.77981386e+03,  -8.43105769e+01,  -4.49667931e+00, ...,
          1.63981002e+01,   1.37791367e+02,   3.95052640e+01],
       [ -8.43105769e+01,   9.11051133e+03,  -9.61313104e+01, ...,
         -2.21348521e+01,   1.41806517e+01,  -7.88297002e+01],
       [ -4.49667930e+00,  -9.61313104e+01,   9.01539697e+03, ...,
         -6.00859109e+01,   1.40965214e+02,  -5.07456807e+00],
       ..., 
       [  1.63981002e+01,  -2.21348521e+01,  -6.00859108e+01, ...,
          9.49027770e+03,  -4.77472667e+00,   1.89699626e+01],
       [  1.37791367e+02,   1.41806517e+01,   1.40965214e+02, ...,
         -4.77472668e+00,   8.60306626e+03,   3.54231616e+01],
       [  3.95052640e+01,  -7.88297002e+01,  -5.07456807e+00, ...,
          1.89699626e+01,   3.54231616e+01,   9.22094238e+03]])

In [160]:
# Inspection only: displays the solution of ridge_mat @ X = G_train. The
# result is not stored; the prediction cell recomputes it.
np.linalg.solve(ridge_mat, G_train)

array([[ 0.17943787,  0.00787967,  0.00042026, ..., -0.00153257,
        -0.01287799, -0.00369217],
       [ 0.00787967,  0.14853085,  0.00898444, ...,  0.00206873,
        -0.00132532,  0.00736743],
       [ 0.00042026,  0.00898444,  0.15742025, ...,  0.00561563,
        -0.01317462,  0.00047427],
       ..., 
       [-0.00153257,  0.00206873,  0.00561563, ...,  0.11303786,
         0.00044625, -0.00177293],
       [-0.01287799, -0.00132532, -0.01317462, ...,  0.00044625,
         0.19595671, -0.00331065],
       [-0.00369217,  0.00736743,  0.00047427, ..., -0.00177293,
        -0.00331065,  0.13820996]])

In [161]:
# Ridge predictions y (K + n*lambda*I)^{-1} K, evaluated on G_train.
# NOTE(review): despite the name, these are fitted values for the training
# bags (the right-hand side is G_train, not G_test).
ridge_solution = np.linalg.solve(ridge_mat, G_train)
y_test_hat = np.dot(y_train, ridge_solution)

In [165]:
# Display the predictions computed in the previous cell (a single-row array).
y_test_hat

array([[-3.54210541, -4.36901474, -2.46632853, -3.65516823, -3.79170122,
        -2.97453006, -2.92730329, -4.15556986, -4.03557199, -3.08474442,
        -3.69951016, -3.10463215, -4.52790111, -4.62900411, -4.2879592 ,
        -1.48809922, -3.85707846, -4.22062233, -3.60454135, -2.77681627,
        -1.75014004, -4.76282215, -3.3359833 , -4.08133128, -4.60206254,
        -5.02167471, -3.9092074 , -3.91205609, -4.2955005 , -3.73622675,
        -1.90012078, -2.29649496, -1.94365281, -2.9638622 , -4.07819226,
        -3.02148717, -3.60030072, -4.14731348, -3.66793206, -3.3944616 ,
        -3.61961744, -4.53242571, -4.75616109, -2.81232932, -3.81501037,
        -3.71206542, -4.0626766 , -3.73487709, -2.81794127, -2.86641083,
        -3.97274976, -4.57874299, -3.82278579, -3.28320867, -3.09115922,
        -2.32980467, -3.27538748, -3.95977944, -4.75872728, -2.10110084,
        -4.97803318, -4.39274374, -4.11029919, -4.48169803, -3.67572566,
        -2.61440613, -2.1108654 , -4.24221275, -3.0

In [162]:
def RMSE(y, y_hat):
    """Return the root-mean-squared error between `y` and `y_hat`.

    Both arguments are forwarded to sklearn's `mean_squared_error`; the
    square root of its result is returned.
    """
    return np.sqrt(mean_squared_error(y, y_hat))

In [163]:
# Both arrays are flattened to 1-D before scoring. y_test_hat is compared with
# y_train here, so this number is an in-sample (training) error.
RMSE(y_train.ravel(), y_test_hat.ravel())
# Value noted from an earlier run: 0.55274397545759535

0.55274695545247921

In [16]:
# Express the selected lambda as a power of two: the result is about -23,
# which matches l2_reg = 2**(-23) used in the ridge regression cells.
math.log(1.1921e-07 , 2)

-22.99999140202484

* Prediction 9: validation RMSE: 0.72545.
  Optimal theta: 16, lambda: 1.1921e-07. Public LB: 0.67536. `base_kp.^[2:4]`