<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Initialize-Random-Number-Generator" data-toc-modified-id="Initialize-Random-Number-Generator-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Initialize Random Number Generator</a></span></li><li><span><a href="#Load-the-Dataset" data-toc-modified-id="Load-the-Dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the Dataset</a></span><ul class="toc-item"><li><span><a href="#Prepare-the-data" data-toc-modified-id="Prepare-the-data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Prepare the data</a></span></li></ul></li><li><span><a href="#Neural-Network-Model" data-toc-modified-id="Neural-Network-Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Neural Network Model</a></span></li><li><span><a href="#XGBOOST" data-toc-modified-id="XGBOOST-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>XGBOOST</a></span><ul class="toc-item"><li><span><a href="#Baseline-Model" data-toc-modified-id="Baseline-Model-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Baseline Model</a></span></li><li><span><a href="#XGBOOST-extended" data-toc-modified-id="XGBOOST-extended-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>XGBOOST extended</a></span><ul class="toc-item"><li><span><a href="#Split-data-into-training-and-validation-sets" data-toc-modified-id="Split-data-into-training-and-validation-sets-4.2.1"><span class="toc-item-num">4.2.1&nbsp;&nbsp;</span>Split data into training and validation sets</a></span></li><li><span><a href="#Fit-model-on-training-data" data-toc-modified-id="Fit-model-on-training-data-4.2.2"><span class="toc-item-num">4.2.2&nbsp;&nbsp;</span>Fit model on training data</a></span></li><li><span><a href="#Make-predictions-for-validation-data" data-toc-modified-id="Make-predictions-for-validation-data-4.2.3"><span class="toc-item-num">4.2.3&nbsp;&nbsp;</span>Make predictions for validation data</a></span></li><li><span><a href="#evaluate-predictions" data-toc-modified-id="evaluate-predictions-4.2.4"><span class="toc-item-num">4.2.4&nbsp;&nbsp;</span>evaluate predictions</a></span></li></ul></li><li><span><a href="#Monitoring-performance-and-Early-Stopping" data-toc-modified-id="Monitoring-performance-and-Early-Stopping-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Monitoring performance and Early Stopping</a></span></li></ul></li></ul></div>

# Initialize Random Number Generator

In [1]:
import numpy as np

In [2]:
seed = 4242
np.random.seed(seed)

# Load the Dataset

In [3]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [4]:
train_data = pd.read_csv('post-data/train.csv').drop(columns=['Unnamed: 0'])
train_data.head()

Unnamed: 0,camera,fname,a0,a1,a2,s0,s1,s2
0,HTC-1-M7,(HTC-1-M7)1.jpg,118.223227,115.003896,111.967798,57.210818,54.533104,64.853878
1,HTC-1-M7,(HTC-1-M7)10.jpg,128.141141,122.825341,107.254211,56.619943,55.002498,59.375556
2,HTC-1-M7,(HTC-1-M7)100.jpg,98.61066,89.714463,82.452863,68.868366,73.298074,73.463918
3,HTC-1-M7,(HTC-1-M7)101.jpg,109.477162,104.011235,102.435257,46.961676,50.286467,54.133753
4,HTC-1-M7,(HTC-1-M7)102.jpg,105.730427,70.830199,103.63127,17.31078,18.811962,15.00888


In [5]:
test_data = pd.read_csv('post-data/test.csv').drop(columns=['Unnamed: 0'])
test_data.head()

Unnamed: 0,fname,a0,a1,a2,s0,s1,s2
0,img_0002a04_manip.tif,187.380531,181.926029,173.161304,34.441726,34.237523,28.06743
1,img_001e31c_unalt.tif,171.23299,140.81707,121.201546,66.119656,69.017232,71.617331
2,img_00275cf_manip.tif,86.310349,85.68969,79.065269,52.221585,54.244189,53.314759
3,img_0034113_unalt.tif,152.006332,140.76897,132.285995,70.564771,65.590737,58.753384
4,img_00344b7_unalt.tif,95.538319,100.303024,92.712677,70.115464,65.860571,64.039072


In [6]:
cols = ['a0', 'a1', 'a2', 's0', 's1', 's2']


In [7]:
X_train = train_data[cols].values
X_test = test_data[cols].values
print(X_train.shape)
print(X_test.shape)

(2750, 6)
(2640, 6)


In [8]:
train_labels = train_data['camera'].values
train_labels

array(['HTC-1-M7', 'HTC-1-M7', 'HTC-1-M7', ..., 'Motorola-X', 'Motorola-X',
       'Motorola-X'], dtype=object)

In [9]:
train_labels.shape

(2750,)

## Prepare the data

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from keras.utils import np_utils

Using TensorFlow backend.


In [11]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(train_labels)
encoded_Y = encoder.transform(train_labels)

In [12]:
# convert integers to dummy variables (i.e one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [13]:
x_train, y_train = shuffle(X_train, dummy_y, random_state=seed)

In [14]:
x_train[:5]

array([[  58.84471296,   49.93487104,   38.90922862,   42.87861481,
          40.24568119,   37.57830517],
       [ 114.54920658,  129.65230923,  129.65298077,   66.03141506,
          63.74333362,   81.53580846],
       [ 116.17599433,  128.95713377,  142.21829793,   59.62550104,
          61.02571382,   72.62387086],
       [ 140.24306599,  143.70790006,  144.47462227,   64.46722264,
          63.8214634 ,   69.86607987],
       [ 109.5898624 ,  117.45532843,  108.18150496,   66.5807312 ,
          60.25748265,   66.96279974]])

In [15]:
y_train[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.]])

# Neural Network Model

In [16]:
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [17]:
x_train.shape[1]

6

In [18]:
# defiine baseline model
def baseline_model():
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu',
                           input_shape=(x_train.shape[1], )))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['acc'])
    return model

In [19]:
clf = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=32, verbose=0)

In [25]:
clf.fit(x_train, y_train, )

<keras.callbacks.History at 0x7fc7e3f7da90>

In [20]:
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

In [21]:
results = cross_val_score(clf, x_train, y_train, cv=kfold)

In [22]:
results

array([ 0.3       ,  0.34727273,  0.31636364,  0.09090909,  0.32727273])

In [26]:
preds = clf.predict(X_test)

In [27]:
preds[1]

1

In [29]:
predictions = [encoder.inverse_transform(pred) for pred in preds]

In [30]:
predictions

['LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'Sony-NEX-7',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexus-5x',
 'LG-Nexu

# XGBOOST

The most important XGBoost parameters are as follows:

- ```eta (default=0.3)```: This is the equivalent of the learning rate in Scikit-learn's GBM
- ```min_child_weight (default=1)```: Higher values prevent overfitting and tree complexity
- ```max_depth (default=6)```: This is the number of interactions in the trees
- ```subsample (default=1)```: This is a fraction of samples of the training data that we take in each iteration
- ```colsample_bytree (default=1)```: This is the fraction of features in each iteration
- ```lambda (default=1)```: This is the L2 regularization (Boolean)
- ```seed (default=0)```: This is the equivalent of Scikit-learn's random_state parameter, allowing reproducibility of learning processes across multiple tests and different machines



## Baseline Model

In [None]:
clf = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=.5)

In [None]:
%%time
xgm = clf.fit(X_train, y_train)

In [None]:
y_pred = xgm.predict(X_test)

In [None]:
y_pred.shape

In [None]:
y_pred[0:10]

In [None]:
subm = pd.read_csv('submissions/sample_submission.csv', index_col='fname')
subm.head()

In [None]:
subm['camera'] = y_pred
subm.head()

In [None]:
subm.to_csv('submissions/xgboost-1.csv')

![](subm1.png)

## XGBOOST extended

### Split data into training and validation sets

### Fit model on training data

In [None]:
model = xgb.XGBClassifier()

In [None]:
model.fit(x_train, train_labels)

### Make predictions for validation data

### evaluate predictions

In [None]:
accuracy

In [None]:
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## Monitoring performance and Early Stopping

In [None]:
encoder = LabelEncoder()

In [None]:
y = encoder.fit_transform(y_train)

In [None]:
y[0]

In [None]:
params = {}
params['objective'] = 'multi:softmax'
params['eval_metric'] = 'mlogloss'
params['num_class'] = 10
#params['tree_method'] = 'exact'
params['silent'] = 0
params['max_bin'] = 16
params['tree_method'] = 'gpu_hist'



In [None]:
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
watchlist = [(d_valid,'eval'),(d_train,'train')]

In [None]:
clf = xgb.train(params, d_train, 300, watchlist, early_stopping_rounds=25, verbose_eval=True)

In [None]:
pred = clf.predict(d_valid)

In [None]:
np.unique(pred)

In [None]:
encoder.inverse_transform(y[0])

In [None]:
error_rate = np.sum(pred != )

In [None]:
from xgboost import plot_importance

In [None]:
plot_importance(clf)
plt.show()

In [None]:
clf.attributes()

In [None]:
clf.predict()