## Find duplicated columns:

In [None]:
# Map each duplicated column name -> the first earlier column it is identical to.
dup_cols = {}

all_cols = train_enc.columns
for i, c1 in enumerate(tqdm_notebook(all_cols)):
    # A column already flagged as a duplicate cannot reveal new duplicates:
    # anything equal to it was already matched against its original earlier in the scan.
    if c1 in dup_cols:
        continue
    ref = train_enc[c1]  # hoisted: invariant across the inner loop
    for c2 in all_cols[i + 1:]:
        # Element-wise `==` treats NaN as unequal to NaN, so columns that
        # contain NaN are never reported as duplicates.
        if c2 not in dup_cols and np.all(ref == train_enc[c2]):
            dup_cols[c2] = c1

In [None]:
# cPickle only exists on Python 2; fall back to the stdlib pickle on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Persist the duplicate-column mapping. Pickle protocols above 0 are binary,
# so the file must be opened in 'wb'; the context manager closes the handle.
with open('dup_cols.p', 'wb') as fh:
    pickle.dump(dup_cols, fh, protocol=pickle.HIGHEST_PROTOCOL)

# Drop the duplicated columns from `traintest` in place.
# NOTE: not idempotent - re-running this cell raises KeyError because the
# columns are already gone.
traintest.drop(list(dup_cols), axis=1, inplace=True)

## Unique values

In [None]:
# Number of distinct values per column (dropna=False: NaN counts as a value),
# plotted as a fraction of the number of rows in `train`.
nunique = train.nunique(dropna=False)
n_rows = train.shape[0]
unique_ratio = nunique.astype(float) / n_rows

plt.figure(figsize=(14, 6))
_ = plt.hist(unique_ratio, bins=100)

## Split categorical/numerical

In [None]:
# Columns with dtype `object` are treated as categorical; all others as numerical.
cat_cols = train.select_dtypes(include=['object']).columns.tolist()
num_cols = train.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
# Peek at the first rows of the categorical columns, transposed so that each
# column reads as a row.
train[cat_cols].head().transpose()

## Validation schemes

a) Holdout scheme:

Split train data into two parts: partA and partB.
Fit the model on partA, predict for partB.
Use the predictions for partB to estimate model quality. Choose the hyper-parameters that maximize quality on partB.

b) K-Fold scheme:

Split train data into K folds.
Iterate through each fold: retrain the model on all folds except the current one, then predict for the current fold.
Use the predictions to calculate quality on each fold. Choose the hyper-parameters that maximize quality across the folds. You can also estimate the mean and variance of the loss, which is very helpful for judging whether an improvement is significant.

c) LOO (Leave-One-Out) scheme:

Iterate over samples: retrain the model on all samples except current sample, predict for the current sample. You will need to retrain the model N times (if N is the number of samples in the dataset).
In the end you will get LOO predictions for every sample in the trainset and can calculate loss.

Stratification


## Flatten images and standardize

In [None]:
# Flatten every example into a single vector, then transpose so that each
# example becomes one column.
n_train = train_x_orig.shape[0]
n_test = test_x_orig.shape[0]
train_x_flatten = train_x_orig.reshape(n_train, -1).transpose()  # -1: infer the flattened length
test_x_flatten = test_x_orig.reshape(n_test, -1).transpose()

# Divide by 255 (assumes 8-bit pixel intensities, so values end up in [0, 1] - TODO confirm).
train_x = train_x_flatten / 255.
test_x = test_x_flatten / 255.

## Sparse matrices

In [2]:
from numpy import array
from scipy import sparse

# COO format stores a matrix as (row, col, value) triplets.
entries = [(0, 0, 4), (3, 3, 5), (1, 1, 7), (0, 2, 9)]
# Unzip the triplets into the row-index (I), column-index (J) and value (V) arrays.
I, J, V = (array(axis) for axis in zip(*entries))

A = sparse.coo_matrix((V, (I, J)), shape=(4, 4))
A.todense()

matrix([[4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5]])

In [39]:
nb = 4
# Row / column coordinates of the non-zero entries; every entry has value 1.
I, J = array([1, 2, 1, 3]), array([0, 3, 2, 0])
V = np.ones(nb)

coo = sparse.coo_matrix((V, (I, J)), shape=(nb, nb))
print(coo.toarray())
# Convert to CSR; the cells that follow index rows of `A`.
A = coo.tocsr()

[[ 0.  0.  0.  0.]
 [ 1.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 1.  0.  0.  0.]]


In [40]:
# Select rows of A by the index array I: one output row per entry of I
# (an index that appears twice in I yields the same row twice).
A[I, :].todense()

matrix([[ 1.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.],
        [ 1.,  0.,  1.,  0.],
        [ 1.,  0.,  0.,  0.]])

In [41]:
# Same row selection as above, but driven by the index array J.
A[J, :].todense()

matrix([[ 0.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.]])

In [42]:
# Element-wise product of the rows selected by I with the rows selected by J.
rows_by_I = A[I, :]
rows_by_J = A[J, :]
rows_by_I.multiply(rows_by_J).todense()

matrix([[ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.]])

In [21]:
# Rebuild the COO matrix from whatever I, J, V currently hold and list the
# distinct values of its dense form.
# NOTE(review): the recorded output contains a 2, so I/J presumably held a
# repeated (row, col) pair when this ran - COO sums duplicates on conversion.
A = sparse.coo_matrix((V, (I, J)), shape=(nb, nb))
dense = A.toarray()
print(dense)
np.unique(dense)

[[ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  2.]]


array([ 0.,  1.,  2.])

In [24]:
# Solve  -L = p*log(C) + (1 - p)*log(1 - C)  for p, i.e.
#   p = (-L - log(1 - C)) / (log(C) - log(1 - C))
L = 1.01
C = 0.3
log_neg = np.log(1 - C)
(-L - log_neg) / (np.log(C) - log_neg)

0.77106893172456126

## Keras

In [None]:
import numpy as np
from keras import layers
from keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D
from keras.models import Model, load_model
from keras.preprocessing import image
from keras.utils import layer_utils
from keras.utils.data_utils import get_file
from keras.applications.imagenet_utils import preprocess_input
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model
from resnets_utils import *
from keras.initializers import glorot_uniform
import scipy.misc
from matplotlib.pyplot import imshow
%matplotlib inline

import keras.backend as K
# Tell the Keras backend that image tensors put the channel axis last.
K.set_image_data_format('channels_last')
# NOTE(review): 1 presumably pins the backend to training phase (vs. 0 = test) -
# confirm against the Keras version in use.
K.set_learning_phase(1)

In [None]:
# Write the architecture diagram to disk, then render the same graph inline as SVG.
plot_model(happyModel, to_file='HappyModel.png')
model_graph = model_to_dot(happyModel)
SVG(model_graph.create(prog='dot', format='svg'))

In [None]:
# One-hot encode the labels over 6 classes and transpose the result
# (presumably to one row per sample - verify against convert_to_one_hot).
n_classes = 6
Y_train = convert_to_one_hot(Y_train_orig, n_classes).T
Y_test = convert_to_one_hot(Y_test_orig, n_classes).T

Data augmentation : 10 crop

Oversampling, undersampling, SMOTE

http://contrib.scikit-learn.org/imbalanced-learn/stable/generated/imblearn.over_sampling.SMOTE.html

In [4]:
from imblearn.over_sampling import SMOTE 
from sklearn.datasets import make_classification
from collections import Counter

In [27]:
# Synthetic binary classification problem with a 20% / 80% class split.
X, y = make_classification(
    n_samples=40,
    n_features=3,
    n_informative=1,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.2, 0.8],
    class_sep=2,
    flip_y=0,
    random_state=10,
)
print(len(y))
print(X.shape)

40
(40, 3)


In [19]:
# Count how many samples carry each label in y.
Counter(y)

Counter({0: 4, 1: 16})

In [26]:
# Oversample with SMOTE and print the class counts of the resampled labels.
sm = SMOTE(random_state=42)
# `fit_sample` was deprecated in imbalanced-learn 0.4 and later removed;
# prefer `fit_resample`, falling back to the old name on older installs.
_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_res, y_res = _resample(X, y)
print(Counter(y_res))

Counter({0: 32, 1: 32})


## Stratified sampling

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

In [30]:
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

# Toy data: four samples, two per class.
X = np.array([[1, 2], [3, 4]] * 2)
y = np.repeat([0, 1], 2)  # -> [0, 0, 1, 1]

sss = StratifiedShuffleSplit(test_size=0.5, n_splits=3, random_state=0)
sss.get_n_splits(X, y)

3

In [33]:
from sklearn.model_selection import train_test_split

# Stratified 50/50 holdout: `stratify=y` keeps the class proportions of y in
# both halves. `random_state` is fixed so the split is reproducible on re-run,
# matching the seeded StratifiedShuffleSplit example.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    stratify=y,
    random_state=0,
)


## Cohen's kappa

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.cohen_kappa_score.html

<img src="https://i.stack.imgur.com/kYNd6.png">

In [1]:
from sklearn.metrics import cohen_kappa_score

# One (labeler1, labeler2) rating pair per labelled item.
ratings = [(2, 0), (0, 0), (2, 2), (2, 2), (0, 0), (1, 2)]
labeler1, labeler2 = (list(column) for column in zip(*ratings))

# Agreement between the two labelers.
cohen_kappa_score(labeler1, labeler2)

0.4285714285714286

## Notebook tips:

https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/


In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# Two bare expressions in one cell: with ast_node_interactivity = "all",
# both values are displayed.
a, b = 5, 6
a
b

5

6

The `%who` command without any arguments lists all variables that exist in the global scope. Passing a parameter like `str` lists only variables of that type.

In [5]:
%who str

No variables match your requested type.


## Distance based features - use nearest neighbor as alternative

In [None]:
# Per (user_id, page_id) pair: the maximum and minimum ad_price.
# The nested-dict renaming form of `.agg` ({'col': {'new_name': func}}) was
# removed in pandas 1.0, so aggregate with a list of function names and rename
# by label afterwards. This also avoids relying on dict ordering for the
# positional column assignment the old version used.
gb = (
    df.groupby(['user_id', 'page_id'])['ad_price']
      .agg(['max', 'min'])
      .reset_index()
      .rename(columns={'max': 'max_price', 'min': 'min_price'})
)

# Resulting columns: user_id, page_id, max_price, min_price.
gb = gb[['user_id', 'page_id', 'max_price', 'min_price']]

## Manifold learning (dimensionality reduction for nonlinear functions) - tSNE
For linear relationships, use matrix factorization instead.

Bagging: train multiple learners independently and combine their predictions (average, ensemble).
Boosting: train weak learners sequentially and give them different weights. In practice: make a prediction and calculate the error; that error becomes the new target y. Fit the next learner on it, calculate the remaining error, and repeat; the final prediction is the sum of all the learners' predictions.


<img src="stacking.PNG">

H2O models: https://github.com/h2oai/h2o-3/blob/master/h2o-py/demos/H2O_tutorial_breast_cancer_classification.ipynb


<img src="tips.PNG">