In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale

import ite
from ite.cost.x_kernel import Kernel

In [20]:
# Import data
def load_data():
    full_data = pd.read_csv("X.csv")
    train_y = pd.read_csv("ytr.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    
    # Move ID column to the beginning
    id_column = full_data["id"]
    full_data.drop(labels=["id"], axis=1, inplace = True)
    full_data.insert(0, "id", id_column)
    # Add y to the data frame
    split = 98000
    y_id_dict = train_y.set_index("Id")["y"].to_dict()
    full_data.loc[:(split-1), "y"] = full_data.loc[:(split-1), "id"].map(y_id_dict)

    train, test = full_data[:split], full_data[split:]
    return (train, test)

#columns = (["id"] + ["reflectance_" + str(i) for i in range(7)]
 #          + ["solar_" + str(i) for i in range(5)] + ["y"])
#full_data = pd.read_csv("MODIS.csv", header=None, names=columns)
#split = 98000
#train, test = full_data[:split].copy(), full_data[split:].copy()
#train = full_data.copy()

#train_copy, test_copy = load_data()

train, test = load_data()
#full_data = load_data()

# Parameters
n_threads = -1
random_seed = 8888

### Data generation for MATLAB

In [3]:
full_data = full_data.fillna(0)

In [4]:
cols_excl = ["id", "y"]
cols_orig = [c for c in full_data.columns if c not in cols_excl]

# Standardise data for LR
full_data[cols_orig] = scale(full_data[cols_orig])

# I did standardise the data
full_data.to_csv("aerosol_full.csv", encoding="utf-8", index=False, header=False)

In [84]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]

# Standardise data for LR
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])

In [86]:
def mean_embedding(X1, X2, kernel):
    k = Kernel(kernel)
    gram_mat = k.gram_matrix2(X1, X2)
    # Number of instances in the bag
    N = float(gram_mat.shape[0])
    mu_X1_X2 = gram_mat.ravel().sum() / N**2
    return (mu_X1_X2)

In [87]:
nb_bag = 200#train["id"].nunique()
K_matrix = np.zeros((nb_bag, nb_bag))

In [88]:
theta = 10**(-8)
for i in range(nb_bag):
    for j in range(nb_bag):
        # Compute mean embedding
        X1 = train.loc[train["id"] == (i+1), cols_orig].values
        X2 = train.loc[train["id"] == (j+1), cols_orig].values
        
        K_matrix[i, j] = mean_embedding(X1, X2, {'name': 'invmquadr','c': theta})

In [89]:
y_train = train["y"].unique()[:200].reshape((-1, 1))
l2_reg = 10**(-8)

In [92]:
y_hat = []
test_id_list = np.arange(980, 1100, dtype=int)

for test_id in test_id_list:
    K_test = np.zeros((1, nb_bag))

    for j in range(nb_bag):
        X1 = train.loc[train["id"] == (j+1), cols_orig].values
        X2 = test.loc[test["id"] == test_id, cols_orig].values
        K_test[0, j] = mean_embedding(X1, X2, {'name': 'invmquadr','c': theta})

    # Ridge regression
    ridge_mat = K_matrix + (l2_reg * nb_bag) * np.identity(nb_bag)
    ridge_mat_inv = np.linalg.solve(ridge_mat, np.identity(nb_bag))
    y_hat.append((K_test.dot(ridge_mat_inv)).dot(y_train)[0, 0])

y_hat = np.array(y_hat)

np.sqrt(mean_squared_error(y_hat, test["y"].unique()[:120]))

0.2731445077288715

In [108]:
train_embedded = train.groupby("id").median().reset_index()
test_embedded = test.groupby("id").median().reset_index()
train_embedded
l2_reg = 10**(-8)
nb_bag = train_embedded.shape[0]

In [112]:
ridge_mat = train_embedded[cols_orig] + (l2_reg * nb_bag) * np.identity(nb_bag)
ridge_mat_inv = np.linalg.solve(ridge_mat, np.identity(nb_bag))

ValueError: Unable to coerce to DataFrame, shape must be (980, 12): given (980, 980)

In [24]:
import math
math.log(7.6294e-06, 2)

-16.999998965876564

In [10]:
1.5**8

2.25

In [16]:
1.5**15

437.8938903808594

In [23]:
train.shape[0]/100/5

196

In [25]:
train.shape[0] + test.shape[0]

136400

In [26]:
train["y"].unique()

array([-3.99808191, -4.13714105, -2.69473212, -3.29627534, -3.18139117,
       -3.14678354, -3.43856643, -4.39907508, -3.39237263, -2.60223637,
       -4.11039839, -2.08640688, -5.35805464, -5.12220846, -4.21001865,
       -0.8670277 , -4.48802009, -4.39658337, -4.29604588, -2.03581068,
       -0.99666043, -5.24513797, -3.04661412, -4.29719543, -4.38663866,
       -5.94105589, -3.88609744, -4.01776668, -4.34295177, -3.41677722,
       -0.72971582, -3.02610403, -1.64883   , -3.50051908, -3.99103006,
       -2.32925824, -3.60836795, -4.20569003, -3.63817098, -3.07831123,
       -3.6993882 , -4.6078159 , -6.05716984, -2.81786315, -3.43564509,
       -2.97718336, -4.07486526, -3.4836806 , -2.26905547, -2.76912928,
       -3.94256358, -4.93078584, -3.37990423, -2.57574955, -3.04653056,
       -2.66190094, -2.91393133, -3.60796954, -4.4637667 , -1.95568115,
       -4.05498015, -4.31294593, -4.26680721, -4.68292804, -2.9580676 ,
       -3.11307148, -1.86729749, -3.89673054, -2.86041591, -1.62