In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
%matplotlib inline
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)



# Train

In [2]:
# Load the data
df = pd.read_json('./data/train.json')
# Create Data
X_train, y_train = [], []
for im_band1, im_band2, label in zip(df['band_1'], df['band_2'], df['is_iceberg']):
    X_train.append(im_band1 + im_band2)
    y_train.append(label)    
X_train = np.array(X_train)
y_train = np.array(y_train)
print 'X_train.shape:', X_train.shape
print 'y_train.shape:', y_train.shape

X_train.shape: (1604, 11250)
y_train.shape: (1604,)


In [22]:
# PCA
N_dims = 32
pca_model = PCA(n_components=N_dims)
# lda_model = LinearDiscriminantAnalysis(n_components=1)
X_pca = pca_model.fit_transform(X_train, y_train)
print 'X_pca.shape:', X_pca.shape

# Make model
pca_model = PCA(n_components=N_dims, random_state=RANDOM_SEED)
estimator_model = xgb.XGBClassifier(max_depth=5,
                                    n_estimators=64)
steps = [pca_model, estimator_model]
model = make_pipeline(*steps)

X_pca.shape: (1604, 32)


In [23]:
result = cross_validate(model, X_pca, y_train, scoring='neg_log_loss', cv=5)
result

{'fit_time': array([ 0.24011111,  0.25279784,  0.24177814,  0.25111318,  0.24195814]),
 'score_time': array([ 0.02539802,  0.00882006,  0.01686096,  0.0088129 ,  0.00885296]),
 'test_score': array([-0.32468742, -0.34774547, -0.37178887, -0.34198841, -0.42222464]),
 'train_score': array([-0.15372359, -0.1565502 , -0.15415276, -0.15691926, -0.1415243 ])}

In [24]:
np.mean(result['test_score'])

-0.36168696164510644

In [6]:
# Create the model
model.fit(X_train, y_train)
model

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=16, random_state=43,
  svd_solver='auto', tol=0.0, whiten=False)), ('xgbclassifier', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=64, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

# Predict Test

In [8]:
df_test = pd.read_json('./data/test.json')
# Create Data
X_test = []
for im_band1, im_band2 in tqdm_notebook(zip(df_test['band_1'], df_test['band_2'])):
    X_test.append(im_band1 + im_band2)
X_test = np.array(X_test)
print 'X_test.shape:', X_test.shape

A Jupyter Widget


X_test.shape: (8424, 11250)


In [9]:
y_test_p = model.predict_proba(X_test)[:, 1]
print y_test_p.shape

(8424,)


In [10]:
df_sub = pd.DataFrame()
df_sub['id'] = df_test['id']
df_sub['is_iceberg'] = y_test_p

In [12]:
df_sub.to_csv('./submissions/sub2.csv', index=False)