In [44]:
import numpy as np
from cross_validation import cross_validation

In [45]:
# Load the two training arrays saved as .npy files and show their shapes,
# so a mismatch in the number of rows is visible right away.
train_features_path = "pure_features.npy"
train_targets_path = "reg_y.npy"

X_train = np.load(train_features_path)
y_train = np.load(train_targets_path)

for loaded in (X_train, y_train):
    print(loaded.shape)

(250000, 15)
(250000,)


# 1) Data Imputation

In [46]:
# Median imputation: in every column, entries equal to the -999 sentinel are
# replaced by the median of that column's remaining entries.
for col_idx in range(X_train.shape[1]):
    column = X_train[:, col_idx]        # view: writing to it updates X_train in place
    is_missing = column == -999
    # The median is evaluated on the non-sentinel entries before anything is overwritten.
    column[is_missing] = np.median(column[~is_missing])

# 2) Polynomial Features

In [47]:
degree = 7

# Polynomial expansion: for every original column, append its powers 2..degree.
# Resulting column layout (later cells drop columns by hard-coded index, so the
# order must not change):
#   [original columns], col0^2 .. col0^degree, col1^2 .. col1^degree, ...
#
# All power columns are built first and stacked in a single call. Calling
# np.column_stack inside the loop copied the whole, steadily growing matrix once
# per added column.
n_base_features = X_train.shape[1]
power_columns = [
    X_train[:, i] ** j
    for i in range(n_base_features)
    for j in range(2, degree + 1)
]
X_train = np.column_stack([X_train] + power_columns)

X_train.shape

(250000, 105)

# 3) Data Normalization

In [58]:
# Min-max scale every training column to [0, 1].
#
# NOTE: X_train is overwritten in place, so the raw column minima/maxima are no longer
# available from X_train once this cell has run. The test-set normalization cell reads
# its min/max from X_train; the recorded execution counts show it was run (In [57])
# BEFORE this cell (In [58]). Running the notebook strictly top to bottom reverses that
# order.
for i in range(X_train.shape[1]):
    col_val = X_train[:, i]
    col_min = np.min(col_val)
    col_range = np.max(col_val) - col_min
    if col_range == 0:
        # Constant column: dividing by 0 would fill it with NaN. Divide by 1 instead,
        # which maps the whole column to 0.
        col_range = 1.0
    X_train[:, i] = (col_val - col_min) / col_range

# Sanity check: the global extremes should now be 0.0 and 1.0.
print(X_train.min())
print(X_train.max())

0.0
1.0


# 4) Rank Deficiency Test

In [50]:
# The rank of the training matrix has to equal its number of columns (features);
# otherwise the matrix is rank deficient or ill-conditioned.
# NOTE(review): the output recorded for this cell reports rank 6 for a 97-column
# matrix, i.e. the condition was not met on that run.
train_rank = np.linalg.matrix_rank(X_train)

print("X_train rank: {}".format(train_rank))
print(X_train.shape)

X_train rank: 6
(250000, 97)


In [28]:
# Only the triangular factor R of the QR decomposition is inspected, so request
# mode='r': it returns the same R without also building the Q factor.
qr = np.linalg.qr(X_train, mode='r')

# Collect the indices of the rows of R whose absolute values sum to less than 0.3.
# NOTE(review): 0.3 is an empirical cut-off; presumably a near-zero row of R is taken
# as a sign that the matching column is almost linearly dependent on earlier ones.
arr = [
    i
    for i in range(X_train.shape[1])
    if np.sum(np.abs(qr[i])) < 0.3
]

print(arr)

[89]


[89, 98, 103, 104]
[89, 100]
89
89

In [49]:
# Remove columns in four successive passes. The indices of each pass refer to the
# matrix left over by the previous pass, so the passes must run in exactly this order.
# For a 105-column input this removes, in terms of the original column numbers,
# columns 89, 90, 91, 92, 98, 102, 103 and 104 (97 columns remain).
# This cell is not idempotent: re-running it drops further columns.
for cols in ((89, 98, 103, 104), (89, 100), 89, 89):
    X_train = np.delete(X_train, cols, axis=1)

# CROSS VALIDATION

In [59]:
from implementations import least_squares

# Hyper-parameter grid: nine evenly spaced threshold candidates covering [-0.4, 0.4].
par = {'threshold': np.linspace(-0.4, 0.4, 9)}
print(par['threshold'])

# The positional 5 is interpreted by cross_validation (presumably the number of
# folds - confirm against its definition). Two result tables are returned, one for
# the training splits and one for the held-out splits.
acc_tr, acc_te = cross_validation(
    y_train,
    X_train,
    5,
    h_pars=par,
    model='least',
)

[-0.4 -0.3 -0.2 -0.1  0.   0.1  0.2  0.3  0.4]


In [60]:
# Show the two cross-validation result tables, each under its own label, one item per line.
print("Train Set:", acc_tr, "Valid Set:", acc_te, sep="\n")

Train Set:
[[-0.4       0.720349]
 [-0.3       0.749722]
 [-0.2       0.770933]
 [-0.1       0.784642]
 [ 0.        0.790046]
 [ 0.1       0.787257]
 [ 0.2       0.776894]
 [ 0.3       0.759253]
 [ 0.4       0.736502]]
Valid Set:
[[-0.4       0.71978 ]
 [-0.3       0.749092]
 [-0.2       0.770864]
 [-0.1       0.784512]
 [ 0.        0.79008 ]
 [ 0.1       0.787128]
 [ 0.2       0.776508]
 [ 0.3       0.758712]
 [ 0.4       0.736416]]


# Test Set

In [51]:
from proj1_helpers import load_csv_data

# load_csv_data returns four items, unpacked here as labels, feature matrix,
# row ids and column names.
test_set = load_csv_data('../test.csv')
y_test, X_test, ids, columns = test_set

# Report what was loaded, one labelled line per item.
for template, value in (
    ("Test_X shape: {}", X_test.shape),
    ("\nTest_Y shape: {}", y_test.shape),
    ("\nColumn names: {}", columns),
    ("\nColumn shape: {}", columns.shape),
):
    print(template.format(value))

n_features = len(columns)

Test_X shape: (568238, 30)

Test_Y shape: (568238,)

Column names: ['DER_mass_MMC' 'DER_mass_transverse_met_lep' 'DER_mass_vis' 'DER_pt_h'
 'DER_deltaeta_jet_jet' 'DER_mass_jet_jet' 'DER_prodeta_jet_jet'
 'DER_deltar_tau_lep' 'DER_pt_tot' 'DER_sum_pt' 'DER_pt_ratio_lep_tau'
 'DER_met_phi_centrality' 'DER_lep_eta_centrality' 'PRI_tau_pt'
 'PRI_tau_eta' 'PRI_tau_phi' 'PRI_lep_pt' 'PRI_lep_eta' 'PRI_lep_phi'
 'PRI_met' 'PRI_met_phi' 'PRI_met_sumet' 'PRI_jet_num'
 'PRI_jet_leading_pt' 'PRI_jet_leading_eta' 'PRI_jet_leading_phi'
 'PRI_jet_subleading_pt' 'PRI_jet_subleading_eta' 'PRI_jet_subleading_phi'
 'PRI_jet_all_pt']

Column shape: (30,)


# 1) Feature Selection

In [52]:
# Column indices (into the 30-column test matrix loaded above) that are kept.
# They are sorted so the kept columns stay in their original left-to-right order.
kept_column_indices = [0, 4, 5, 6, 12, 23, 3, 9, 11, 1, 10, 13, 21, 22, 29]
selected_features = np.sort(kept_column_indices)

# Not idempotent: X_test is replaced by its own column subset.
X_test = X_test[:, selected_features]
X_test.shape

(568238, 15)

# 2) Data Transformation

In [53]:
# Original (pre-selection) indices of the columns that get a log(x + 1) transform.
log_transformed_columns = [3,9,13,21,29,5,23]

# Build the transformed matrix in a preallocated float64 array, one column at a time.
# (The previous version grew a placeholder matrix with np.column_stack on every
# iteration and deleted the placeholder column afterwards.)
transformed = np.empty((X_test.shape[0], len(selected_features)))

# x: position of the column inside the already-reduced X_test
# i: index of that column in the original, full column list
for x, i in enumerate(selected_features):
    column = X_test[:, x]

    if i not in log_transformed_columns:
        # Column is passed through unchanged.
        transformed[:, x] = column
    elif i in [5, 23]:
        # These two columns are handled as possibly containing the -999 sentinel:
        # sentinel entries are kept as -999 (they are imputed in a later step) and
        # only the remaining entries are log-transformed.
        is_missing = column == -999
        transformed[is_missing, x] = -999
        transformed[~is_missing, x] = np.log(column[~is_missing] + 1)
    else:
        # Log-transform the whole column; no sentinel handling is applied here.
        transformed[:, x] = np.log(column + 1)

X_test = transformed
X_test.shape

(568238, 15)

# 3) Data Imputation

In [54]:
# Median imputation on the test matrix: in every column, entries equal to the -999
# sentinel are replaced by the median of that column's remaining entries.
# The median is taken from X_test itself.
for col_idx in range(X_test.shape[1]):
    column = X_test[:, col_idx]         # view: writing to it updates X_test in place
    is_missing = column == -999
    column[is_missing] = np.median(column[~is_missing])

X_test.shape

(568238, 15)

# 4) Polynomial Features

In [55]:
degree = 7

# Polynomial expansion of the test matrix: for every original column, append its
# powers 2..degree. Resulting column layout (the next cell drops columns by
# hard-coded index, so the order must not change):
#   [original columns], col0^2 .. col0^degree, col1^2 .. col1^degree, ...
#
# All power columns are built first and stacked in a single call. Calling
# np.column_stack inside the loop copied the whole, steadily growing matrix once
# per added column.
n_base_features = X_test.shape[1]
power_columns = [
    X_test[:, i] ** j
    for i in range(n_base_features)
    for j in range(2, degree + 1)
]
X_test = np.column_stack([X_test] + power_columns)

X_test.shape

(568238, 105)

In [56]:
# Remove columns from the test matrix in four successive passes. The indices of each
# pass refer to the matrix left over by the previous pass, so the passes must run in
# exactly this order. For a 105-column input this removes, in terms of the original
# column numbers, columns 89, 90, 91, 92, 98, 102, 103 and 104.
# This cell is not idempotent: re-running it drops further columns.
for cols in ((89, 98, 103, 104), (89, 100), 89, 89):
    X_test = np.delete(X_test, cols, axis=1)

X_test.shape

(568238, 97)

# 5) Data Normalization

In [57]:
import warnings

# Scale each test column with the min/max of the corresponding TRAINING column, so
# that both matrices share one scale. Test values may therefore fall outside [0, 1].
train_min = X_train.min(axis=0)
train_max = X_train.max(axis=0)

# X_train is itself min-max scaled in place by another cell of this notebook. Once
# that has happened, every train column has min 0 and max 1 and the loop below would
# leave X_test unscaled without any error. The recorded outputs were produced by
# running this cell (In [57]) before the train scaling (In [58]). Make the
# problematic order visible instead of silent.
if train_min.size and np.all(train_min == 0) and np.all(train_max == 1):
    warnings.warn(
        "X_train already looks min-max scaled (every column spans exactly [0, 1]); "
        "X_test will be left unscaled. Run this cell before X_train is normalized."
    )

train_range = train_max - train_min
# Constant training column: dividing by 0 would produce inf/NaN. Divide by 1 instead.
train_range[train_range == 0] = 1

for i in range(X_test.shape[1]):
    X_test[:, i] = (X_test[:, i] - train_min[i]) / train_range[i]

print(X_test.shape)
print(X_test.min())
print(X_test.max())

(568238, 97)
-0.27276547867787504
31.267262426454877


# PREDICTION

In [61]:
from proj1_helpers import predict_labels

# Fit on the full training set. least_squares returns a pair; only its first item
# (used as the weight vector) is needed, the second item is discarded.
fit_result = least_squares(y_train, X_train)
w, _ = fit_result

# Turn the fitted weights into labels for the prepared test matrix.
predictions = predict_labels(w, X_test)

In [63]:
from proj1_helpers import create_csv_submission

# Write the test-row ids together with their predicted labels to the submission file.
submission_path = 'output.csv'
create_csv_submission(ids, predictions, submission_path)

### Versions:

1) Threshold = 0, degree=3 Polynomial Feature, Normalization, Imputation w/ median, log transformation = 0.77418

2) Threshold = 0, degree=6 Polynomial Feature, Normalization, Imputation w/ median, some features removed to prevent rank deficiency (columns 89, 77, 87, 77, 77, removed in that order), log transformation = 0.78783

3) Threshold = 0, degree=7 Polynomial Feature, Normalization, Imputation w/ test_set median, some features removed to prevent rank deficiency (columns [89, 98, 103, 104], [89, 100], 89, 89, removed in that order), log transformation = 0.79094