# Imports

In [29]:
import warnings
warnings.filterwarnings('ignore')
import string
import glob
import re
import itertools
import hashlib
from collections import Counter
import sys
import os
import numbers

import scipy as sp
import numpy as np
import pandas as pd

import plotly.plotly as py
import plotly.tools as tls
import plotly.graph_objs as go
import cufflinks as cf
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
tls.set_credentials_file(username=os.environ.get('PLOTLY_USERNAME'), api_key=os.environ.get('PLOTLY_APIKEY'))

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

import seaborn as sns
sns.set_context("notebook",
                font_scale=1.5,
                rc={"lines.linewidth": 2.5})

In [30]:
# Stamp the notebook with author, date, interpreter/machine details, host name,
# git hash and the versions of the key packages (needs the `watermark` IPython extension).
%reload_ext watermark
%watermark -a "Ken Cavagnolo" -n -u -v -m -h -g -p numpy,scipy,pandas,matplotlib,plotly,seaborn

Ken Cavagnolo 
Last updated: Fri Mar 11 2016 

CPython 2.7.10
IPython 4.0.3

numpy 1.10.4
scipy 0.17.0
pandas 0.17.1
matplotlib 1.4.2
plotly 1.8.12
seaborn 0.7.0

compiler   : GCC 5.2.1 20151010
system     : Linux
release    : 4.2.0-23-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
host name  : ubuntu
Git hash   : 762e1a49cb5b3082cc559787fa5d5dbccf9fb594


# Read Data

In [31]:
# Locate the data directory.
# The per-OS defaults are machine-specific absolute paths; set the SANTANDER_DATADIR
# environment variable to point the notebook at the data on any other machine.
import platform

_DEFAULT_DATADIRS = {
    'Linux': '/home/kcavagnolo/ml_fun/santander_cs/data/',
    'Darwin': '/Users/cavagnolo/ml_fun/santander_cs/data/',
}

uname = platform.uname()[0]
datadir = os.environ.get('SANTANDER_DATADIR')
if not datadir:
    if uname not in _DEFAULT_DATADIRS:
        raise OSError("Unknown system: " + str(uname))
    datadir = _DEFAULT_DATADIRS[uname]

# paths below are built by string concatenation, so guarantee a trailing separator
datadir = os.path.join(datadir, '')

# sorted so that files[0], files[1], files[2] are stable across runs
files = sorted(glob.glob(datadir + '*.csv'))
hdf_file = datadir + 'features.h5'

In [4]:
# Reload the frames cached in the HDF5 store.
# NOTE(review): the store is written by the "Save" section further down, so on a fresh
# kernel this cell only works if an earlier run already produced features.h5.
# Opened read-only so a missing file raises instead of silently creating an empty store.
hdf = pd.HDFStore(hdf_file, mode='r')
try:
    print(hdf)
    df_test = hdf['df_test']
    df_train = hdf['df_train']
    df_all = hdf['df_all']
finally:
    # always release the file handle, even if one of the keys is missing
    hdf.close()

# reset indexing
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_all = df_all.reset_index(drop=True)

<class 'pandas.io.pytables.HDFStore'>
File path: /home/kcavagnolo/ml_fun/santander_cs/data/features.h5
/df_all               frame        (shape->[151838,325])
/df_sample            frame        (shape->[75818,2])   
/df_test              frame        (shape->[75818,371]) 
/df_train             frame        (shape->[76020,372]) 


# Summarize

TODO:

* Replace missing values
* Remove duplicates
* One-Hot encode categorical features
* Find outliers and explain
* Scale
* Standardize

In [32]:
def _read_digits_only(path):
    """Read a CSV (first column as index) and strip non-digit characters from text columns.

    Only columns that pandas parsed as ``object`` are touched.
    NOTE(review): the regex also removes '.' and '-', and the cleaned values stay
    strings -- confirm that is intended for any text column that really occurs.
    """
    frame = pd.read_csv(path, index_col=0)
    for col in frame.columns:
        if frame[col].dtypes == 'object':
            frame[col] = frame[col].map(lambda x: re.sub("[^0-9]", "", x))
    return frame


# `files` is sorted: [0] sample submission, [1] test, [2] train
df_sample = pd.read_csv(files[0], index_col=0)
df_test = _read_digits_only(files[1])
df_train = _read_digits_only(files[2])

# remember the sizes so the split can be verified after merging
sample_size = df_sample.shape[0]
train_size = df_train.shape[0]
test_size = df_test.shape[0]

# retain train/test split
df_train['is_train'] = True
df_test['is_train'] = False

# move the CSV index back into a regular column before stacking
df_sample = df_sample.reset_index()
df_train = df_train.reset_index()
df_test = df_test.reset_index()

# merge df's: train rows first, then test rows, with a fresh 0..n-1 index
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
all_size = df_all.shape[0]
assert train_size + test_size == all_size, "DF's are not summing correctly"
assert df_all[(df_all.is_train == True)].shape[0] == train_size, "Train split not preserved"
assert df_all[(df_all.is_train == False)].shape[0] == test_size, "Test split not preserved"

In [33]:
# Profile every column of df_all: cardinality, nulls and a rough type bucket.
#   empty        -> columns holding a single value
#   binaries     -> columns holding exactly two values
#   categoricals -> every integer-typed column
#   continuous   -> every float-typed column
empty = []
binaries = []
categoricals = []
continuous = []
obs = df_all.shape[0]
for c in df_all.columns:
    uflag = ""
    nflag = ""
    # pd.unique counts NaN as a value of its own
    uni = len(pd.unique(df_all[c].values))
    nulls = int(df_all[c].isnull().sum())
    if uni == 1:
        empty.append(c)
    elif uni == 2:
        binaries.append(c)
    # the type bucket is independent of the cardinality bucket above
    if np.issubdtype(df_all[c].dtype, np.integer):
        categoricals.append(c)
    elif np.issubdtype(df_all[c].dtype, np.floating):
        # np.floating is the abstract float type; the old np.float alias is gone in current NumPy
        continuous.append(c)
    if obs == uni:
        uflag = '*UNIQUE VALS PER KEY*'
    if nulls > 0:
        nflag = '*NULLS IN COL*'
    print('{:30} {:d} {:30} {:30}'.format(c, uni, uflag, nflag))
# lists are formatted with a plain '{}': '{:s}' on a list only works on Python 2
print('{:15} {}'.format('Singular cols:', empty))
print('')
print('{:15} {}'.format('Binary cols:', binaries))
print('')
print('{:15} {}'.format('Categorical cols:', categoricals))
print('')
print('{:15} {}'.format('Continuous cols:', continuous))
print('')
print('{:15} {:d}'.format('Observations:', obs))

ID                             151838 *UNIQUE VALS PER KEY*                                        
TARGET                         3                                *NULLS IN COL*                
delta_imp_amort_var18_1y3      2                                                              
delta_imp_amort_var34_1y3      2                                                              
delta_imp_aport_var13_1y3      51                                                              
delta_imp_aport_var17_1y3      11                                                              
delta_imp_aport_var33_1y3      13                                                              
delta_imp_compra_var44_1y3     29                                                              
delta_imp_reemb_var13_1y3      2                                                              
delta_imp_reemb_var17_1y3      3                                                              
delta_imp_reemb_var33_1y3      2         

* Column names look fine, don't need to clean those
* No nulls except in TARGET, which is expected because the test rows have no label
* Only ID is unique per row; no other column shown maps one-to-one to ID

Lots of empty cols, are they real? If so, ditch them.

In [34]:
# Sanity check before dropping: every single-valued column should also sum to zero,
# i.e. it really is an all-zero column rather than some other constant.
a = [c for c in df_all.columns if df_all[c].sum() == 0]
if len(empty) == len(set(a) & set(empty)):
    # shape before and after a (non-mutating) drop of the empty columns
    print(df_all.shape)
    print(df_all.drop(empty, axis=1).shape)
else:
    # make a failed check visible instead of printing nothing
    print('Single-valued columns that are not all zero: {}'.format(sorted(set(empty) - set(a))))

(151838, 372)
(151838, 338)


In [35]:
# Drop the single-valued columns. errors='ignore' keeps the cell re-runnable:
# columns that were already removed by a previous run are skipped.
df_all = df_all.drop(empty, axis=1, errors='ignore')

What about binary columns?

In [36]:
# Show the two values taken by each binary column, one column per line.
val = []
for col in binaries:
    distinct = sorted(df_all[col].unique())
    print('{:30}{}'.format(col, distinct))

delta_imp_amort_var18_1y3     [0, 9999999999]
delta_imp_amort_var34_1y3     [0, 9999999999]
delta_imp_reemb_var13_1y3     [0, 9999999999]
delta_imp_reemb_var33_1y3     [0, 9999999999]
delta_imp_trasp_var17_out_1y3 [0, 9999999999]
delta_imp_trasp_var33_out_1y3 [0, 9999999999]
delta_num_reemb_var13_1y3     [0, 9999999999]
delta_num_reemb_var33_1y3     [0, 9999999999]
delta_num_trasp_var17_out_1y3 [0, 9999999999]
delta_num_trasp_var33_out_1y3 [0, 9999999999]
imp_reemb_var17_hace3         [0.0, 12027.15]
imp_reemb_var33_ult1          [0, 1200]
ind_var1                      [0, 1]
ind_var10_ult1                [0, 1]
ind_var10cte_ult1             [0, 1]
ind_var12                     [0, 1]
ind_var12_0                   [0, 1]
ind_var13                     [0, 1]
ind_var13_0                   [0, 1]
ind_var13_corto               [0, 1]
ind_var13_corto_0             [0, 1]
ind_var13_largo               [0, 1]
ind_var13_largo_0             [0, 1]
ind_var13_medio               [0, 1]
ind_var13_

What are all these 9999999999? Are they junk numbers?

In [37]:
# For every binary column, count rows equal to the 9999999999 sentinel (a) and rows
# that are positive at all (b); remember the columns that contain the sentinel.
all_nines = []
for c in binaries:
    cond = df_all[c] == 9999999999
    a = int(cond.sum())
    b = int((df_all[c] > 0).sum())
    if a == 0:
        continue
    print('{:30}{:5d}{:5d}'.format(c, a, b))
    all_nines.append(c)

delta_imp_amort_var18_1y3         3    3
delta_imp_amort_var34_1y3         4    4
delta_imp_reemb_var13_1y3        65   65
delta_imp_reemb_var33_1y3         1    1
delta_imp_trasp_var17_out_1y3     4    4
delta_imp_trasp_var33_out_1y3     2    2
delta_num_reemb_var13_1y3        65   65
delta_num_reemb_var33_1y3         1    1
delta_num_trasp_var17_out_1y3     4    4
delta_num_trasp_var33_out_1y3     2    2


Cols with all nines have a pattern to them, and there aren't many. Either convert the nines to 0, in which case the columns become all 0's and need to be dropped, or convert them to 1's and keep each column as a binary attribute. Decision: keep.

In [38]:
# display the binary columns that contain the 9999999999 sentinel
all_nines

['delta_imp_amort_var18_1y3',
 'delta_imp_amort_var34_1y3',
 'delta_imp_reemb_var13_1y3',
 'delta_imp_reemb_var33_1y3',
 'delta_imp_trasp_var17_out_1y3',
 'delta_imp_trasp_var33_out_1y3',
 'delta_num_reemb_var13_1y3',
 'delta_num_reemb_var33_1y3',
 'delta_num_trasp_var17_out_1y3',
 'delta_num_trasp_var33_out_1y3']

In [39]:
# Recode the 9999999999 sentinel as 1 so these columns become ordinary 0/1 flags.
# A single .loc assignment writes into df_all itself; the chained form
# df_all[c][mask] = 1 may write to a temporary copy (SettingWithCopyWarning).
for c in all_nines:
    df_all.loc[df_all[c] == 9999999999, c] = 1

There are also lots of columns named <some_col> and <some_col_0>, are these identical? If so, drop the _0 ones.

In [40]:
# Find columns <name>_0 that are exact copies of <name>.
dup_cols = []
cols = df_all.columns
for c in cols:
    dupe = c + '_0'
    if dupe in cols:
        # compare element-wise: summing the differences would let positive and
        # negative differences cancel and report a false duplicate
        if np.array_equal(df_all[c].values, df_all[dupe].values):
            print('Duplicates:  {} {}'.format(c, dupe))
            dup_cols.append(dupe)

Duplicates:  ind_var13_medio ind_var13_medio_0
Duplicates:  ind_var18 ind_var18_0
Duplicates:  ind_var25 ind_var25_0
Duplicates:  ind_var26 ind_var26_0
Duplicates:  ind_var32 ind_var32_0
Duplicates:  ind_var34 ind_var34_0
Duplicates:  ind_var37 ind_var37_0
Duplicates:  num_var18 num_var18_0
Duplicates:  num_var25 num_var25_0
Duplicates:  num_var26 num_var26_0
Duplicates:  num_var32 num_var32_0
Duplicates:  num_var34 num_var34_0
Duplicates:  num_var37 num_var37_0


In [41]:
# Drop the redundant *_0 copies. errors='ignore' keeps the cell re-runnable.
df_all = df_all.drop(dup_cols, axis=1, errors='ignore')

Keep a record of how many 'false' entries are in each row

In [42]:
# Count the zero-valued features in each row.
# Bookkeeping columns are excluded: TARGET (null for the test rows, presumably the
# label) would leak into the feature, ID is only a key, and cnt_0 itself would be
# counted if this cell were run twice.
non_feature_cols = ['ID', 'TARGET', 'is_train', 'cnt_0']
feature_frame = df_all.drop(non_feature_cols, axis=1, errors='ignore')
df_all['cnt_0'] = (feature_frame == 0).astype(int).sum(axis=1)

# Save

In [43]:
# The cleaning above must not have added or removed rows on either side of the split.
n_train_rows = int((df_all['is_train'] == True).sum())
n_test_rows = int((df_all['is_train'] == False).sum())
assert n_train_rows == train_size, "Train split not preserved"
assert n_test_rows == test_size, "Test split not preserved"

In [44]:
# save to hdf5 for easier loading later
hdf = pd.HDFStore(hdf_file)
try:
    hdf.put('df_sample', df_sample)
    hdf.put('df_test', df_test)
    hdf.put('df_train', df_train)
    hdf.put('df_all', df_all)
finally:
    # close the store even if one of the writes fails, so the file is not left open
    hdf.close()

# Attribute Summaries

## Relations

In [None]:
# Pairwise scatter plots of id, location and fault_severity.
# NOTE(review): scatter_matrix() as a DataFrame method is presumably provided by cufflinks -- confirm.
# NOTE(review): 'id', 'location' and 'fault_severity' are not among the df_all columns
# printed in the Summarize section; this section looks carried over from another dataset.
df_all[['id','location', 'fault_severity']].scatter_matrix()

Looks like locations 100-500 don't produce any fault_sev==2 events. This could be useful to a model, so maybe don't throw out location as I had previously.

In [None]:
# number of rows for every (location, fault_severity) combination
df_all.groupby(['location', 'fault_severity']).size()

## Histograms

In [None]:
# build axes
# four vertically stacked panels; axs[i] is indexed by position in `attributes`,
# so at most four attribute families are supported
mpl_fig = plt.figure()
ax1 = mpl_fig.add_subplot(411)
ax2 = mpl_fig.add_subplot(412)
ax3 = mpl_fig.add_subplot(413)
ax4 = mpl_fig.add_subplot(414)
axs = [ax1, ax2, ax3, ax4]

# iterate over each
# NOTE(review): `attributes` is not defined in any cell visible in this notebook --
# it must exist in the kernel before this cell runs.
for cond in [True, False]:
    for i, b in enumerate(attributes):
        # per column matching regex b: tabulate value counts and keep the count of zeros
        y = df_all[(df_all.is_train == cond)].filter(regex=b).apply(pd.value_counts).fillna(0).T[0]
        # turn the zero-count into the percentage of rows that are non-zero
        y = 100.*(1.0-(y/df_all[(df_all.is_train == cond)].shape[0]))
        x = range(1, len(y)+1)
        # the train pass and the test pass draw onto the same axes
        axs[i].bar(x, y, label=attributes[i])
        axs[i].set_ylabel(attributes[i])
# hand the matplotlib figure to plotly for rendering
py.iplot_mpl(mpl_fig, strip_style=True)

Distributions look similar between train and test, so cross-validation on train should carry over to test. Any structure in the sequence of ID's?

## Sequencing

In [None]:
# plot fault_severity and location against id as two stacked panels sharing the x axis
df = df_all[['id', 'fault_severity', 'location']].set_index('id')
df.iplot(subplots=True, shape=(2,1), shared_xaxes=True)

Both look like noise to me, gonna leave it for now. What about correlations among the various codes?

## Correlations

In [None]:
# accumulator of (feature, feature) pairs found by the correlation cells below
strongly_correlated = []
# threshold on the dissimilarity 1 - |corr| used below, i.e. 0.12 keeps pairs with |corr| > 0.88
corr_thresh = 0.12

In [None]:
# One clustered heatmap per attribute family; near-duplicate feature pairs found on
# the way are appended to strongly_correlated.
families = ['severity_type', 'event_type', 'resource_type']

# colormap (identical for every family, so built once)
cmap = sns.light_palette("navy", as_cmap=True, reverse=True)

for b in families:

    # correlation matrix of the columns whose name matches the family
    c = df_all.filter(regex=b).copy()
    # floor division keeps min_periods an integer on Python 2 and 3 alike
    corr = c.corr(min_periods=len(c.columns) // 10)

    # distance: 0 --> close (|corr| == 1), 1 --> distant (uncorrelated)
    dsim = 1.0 - np.abs(corr)

    # find degenerate params: only the strict lower triangle, so each pair is seen
    # once and the diagonal (a feature against itself) is skipped
    mask = np.tril(np.ones(dsim.shape, dtype='bool'), k=-1)
    row_pos, col_pos = ((dsim < corr_thresh) & mask).values.nonzero()
    # dedicated names here: the loop no longer rebinds the list it iterates over
    for r, k in zip(row_pos, col_pos):
        strongly_correlated.append((dsim.index[r], dsim.columns[k]))

    # annotate cells only when the matrix is small enough to stay readable
    annot = len(dsim) <= 10

    # clustermap creates its own figure, so the size is passed to it directly
    # (a preceding plt.figure() only produces an extra empty figure)
    g = sns.clustermap(data=dsim,
                       method='complete',
                       metric='correlation',
                       figsize=(15, 12),
                       cmap=cmap, linewidths=0.5, vmin=0.0, vmax=1.0,
                       annot=annot, fmt='.2f', annot_kws={'size': '10'})
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

In [None]:
# Correlation and dissimilarity (1 - |corr|) restricted to the log_feature columns.
c = df_all.filter(regex='log_feature').copy()
min_obs = len(c.columns) // 10
log_corr = c.corr(min_periods=min_obs)
log_dsim = 1.0 - np.abs(log_corr)

In [None]:
# Collect log_feature pairs closer than corr_thresh; the strict lower triangle
# visits each pair once and skips the diagonal.
mask = np.tril(np.ones(log_dsim.shape, dtype='bool'), k=-1)
row_pos, col_pos = ((log_dsim < corr_thresh) & mask).values.nonzero()
for r, k in zip(row_pos, col_pos):
    i1 = log_dsim.index[r]
    i2 = log_dsim.columns[k]
    strongly_correlated.append((i1, i2))

In [None]:
# clustered heatmap of the log_feature dissimilarities; tick labels are switched off
# and the colour scale is pinned to the full 0..1 range
cmap = sns.light_palette("navy", as_cmap=True, reverse=True)
g = sns.clustermap(data=log_dsim,
                   method='complete',
                   metric='correlation',
                   cmap=cmap,
                   xticklabels=False, yticklabels=False,
                   linewidths=0.0,
                   vmin=0.0,
                   vmax = 1.0)

In [None]:
# Correlation and dissimilarity across all four families at once.
family_regex = '|'.join(['severity_type', 'event_type', 'resource_type', 'log_feature'])
a = family_regex
c = df_all.filter(regex=family_regex).copy()
min_obs = len(c.columns) // 10
all_corr = c.corr(min_periods=min_obs)
all_dsim = 1.0 - np.abs(all_corr)

In [None]:
# Collect very close pairs (dissimilarity < 0.03) whose names differ in their first
# five characters, i.e. pairs that span two different families.
mask = np.tril(np.ones(all_dsim.shape, dtype='bool'), k=-1)
row_pos, col_pos = ((all_dsim < 0.03) & mask).values.nonzero()
for r, k in zip(row_pos, col_pos):
    i1, i2 = all_dsim.index[r], all_dsim.columns[k]
    if i1[:5] == i2[:5]:
        continue
    strongly_correlated.append((i1, i2))

In [None]:
# Clustered heatmap of the dissimilarities across all families.
# clustermap creates its own figure, so the size is passed to it directly
# (a preceding plt.figure() only produces an extra empty figure).
cmap = sns.light_palette("navy", as_cmap=True, reverse=True)
g = sns.clustermap(data=all_dsim,
                   method='complete',
                   metric='correlation',
                   figsize=(22, 20),
                   cmap=cmap,
                   xticklabels=False, yticklabels=False,
                   linewidths=0.0,
                   vmin=0.0,
                   vmax=1.0)

In [None]:
# number of strongly correlated pairs collected so far, followed by the pairs themselves
print len(strongly_correlated)
strongly_correlated

The severity types are mostly anti-correlated, except types 1 and 2.

There is interesting structure in the resource and event types. Makes me more curious about how these events are networked, i.e. communicating to each other? Build out as network problem? Can't: id and loc are 1:1 so the only connections will be among events that are the same, in that they have the same coding across all types. I don't see that as informative to a model.

Log features are also clustered.

**But, this is clearly a well-defined classification problem.**

Want to see chains of correlations, e.g. [a,b]...[b,c], does that mean [a,c]? Run this...

In [None]:
# Flatten the (feature, feature) pairs and show the ten features that occur in the most pairs.
sc_pairs = list(itertools.chain.from_iterable(strongly_correlated))
Counter(sc_pairs).most_common(10)

In [None]:
# For every feature that appears in some pair, gather all features it is directly
# paired with (itself included) into one sorted tuple.
chains = []
for s in sorted(set(itertools.chain.from_iterable(strongly_correlated))):
    tmp = [sc for sc in strongly_correlated if s in sc]
    linked = set(itertools.chain.from_iterable(tmp))
    chains.append(tuple(sorted(linked)))

In [None]:
# chains can repeat (several members produce the same tuple), hence the set()
print 'Strong correlations: ', len(strongly_correlated)
print 'Resulting chains: ', len(set(chains))

In [None]:
# for each chain
# create super feature
# find rows where all chain feature values == 1
# set super feature col == 1
# NOTE(review): the loop below flags a row when ANY chain feature is > 0, not when
# ALL of them are set as the plan above says -- confirm which one is intended.

a = ['log_feature_160', 'log_feature_44']
df = df_train.copy()
df['super'] = np.zeros(len(df_train))
for c in a:
    # one .loc assignment writes into df itself; the chained df.super[mask] = 1
    # may write to a temporary copy
    df.loc[df[c] > 0, 'super'] = 1

# A pasted example line that referenced an undefined frame `d` (column `sales`) was
# removed here: it raised on a fresh kernel and had no effect on this analysis.

## Averages

In [None]:
# work on an independent copy of the training rows only
df = df_all[(df_all.is_train==True)].copy()

In [None]:
# the distinct fault_severity values present in the training rows
sorted(df.fault_severity.unique())

In [None]:
# For each column whose name starts with one of the attribute prefixes, look at the
# rows where that column is > 0 and record: the share of all rows (in percent) and
# the mean / standard deviation of fault_severity over those rows.
names, pers, avgs, stds = [], [], [], []
for a in attributes:
    cnt = len(a)
    matching = [col for col in df.columns if col.startswith(a)]
    for c in matching:
        d = df[df[c] > 0]
        severity = d.fault_severity
        names.append(c)
        pers.append(100. * float(d.shape[0]) / float(df.shape[0]))
        avgs.append(severity.mean())
        stds.append(severity.std())

In [None]:
# one row per feature column: share of rows, mean and std of fault_severity.
# NOTE(review): fillna(0) also turns the NaN std of a feature seen in a single row,
# and the NaN mean/std of a feature never seen, into 0.
fs = pd.DataFrame({'%tot':pers, 'avg':avgs, 'std':stds}, index=names).fillna(0)

In [None]:
# fault severity values to test for
fsv = [0, 1, 2]
cnt = 0
# drop_cols maps str(severity) -> feature columns whose rows all share that severity
drop_cols = {}
for n in fsv:
    # mean exactly n with zero spread: every row having this feature has severity n
    # NOTE(review): because `fs` was built with fillna(0), n == 0 presumably also
    # matches features that never occur in the training rows -- confirm.
    a = fs[(fs['avg']==n) & (fs['std']==0)]
    drop_cols[str(n)] = list(a.index)
    cnt += a.shape[0]
    print n, a.shape[0]
print "Drop ", cnt, " of ", fs.shape[0]
print "Keeping ", fs.shape[0]-cnt, " columns"

So there are 192 attributes with a fixed fault severity value and a standard deviation of zero. What if I build a model only for the events where all of those features are 0, with those features dropped? The fitting logic then becomes:

```python
known_fs = {}
for fs, cols in drop_cols.iteritems():
    for col in cols:
        for id in df[df[col] > 0]['id']:
            known_fs[id] = fs        
df = pd.get_dummies(known_fs)
```

## Uniqueness

### Event ID

In [None]:
# remove columns not characterizing events
cols = [c for c in df_all.columns if any(c[:5] in a for a in attributes)]

# ensure all are binary: anything non-zero becomes 1. Built as a new frame so that
# nothing is written into a slice of df_all (avoids SettingWithCopy problems).
flags = (df_all[cols] != 0).astype(int)


def _row_fingerprint(row):
    """Return an integer fingerprint of one row of 0/1 flags.

    The flags are joined into a bit string and md5-hashed; the hex digest is then
    parsed as a base-36 number (valid, since hex digits are base-36 digits), which
    keeps the ids identical to those produced by the original one-liner.
    """
    bits = ''.join(row.astype(int).astype(str))
    return int(hashlib.md5(bits.encode('ascii')).hexdigest(), 36)


# rows with the same combination of flags get the same fingerprint
df = flags.apply(_row_fingerprint, axis=1)

# attach back
df_all['evt_id'] = df.astype(str)

# save group sizes
dsize = len(df_all)

### Event ID(Fault Severity)

In [None]:
# a stats handler for grouping
def get_stats(group):
    """Summarise the fault_severity values of one evt_id group.

    Returns a dict with the group's spread (max - min), row count, frequency
    relative to ``dsize``, mean and standard deviation, under ``evt_*`` keys.
    """
    n_rows = len(group)
    spread = group.max() - group.min()
    return {
        'evt_cnt': n_rows,
        'evt_freq': n_rows/float(dsize),
        'evt_fs_mean': group.mean(),
        'evt_fs_std': group.std(),
        'evt_fs_sprd': spread,
    }

# build stats for each type of event: one row per evt_id, most frequent first
events = (
    df_all['fault_severity']
    .groupby(df_all['evt_id'])
    .apply(get_stats)
    .unstack()
    .sort_values(by='evt_freq', ascending=False)
    .reset_index()
)

In [None]:
# stacked percentage histograms of every statistic except the id itself
# (axis passed by keyword: the positional form is rejected by current pandas)
events.drop('evt_id', axis=1).iplot(kind='histogram', barmode='stack', histnorm='percent')

In [None]:
# categorize the events based on freq
# three frequency bands: (0, 0.00475] rare, (0.00475, 0.009] sparse, (0.009, 1] common
bins = [0, 0.00475, 0.009, 1]
group_names = ['rare', 'sparse', 'common']
events['evt_cat'] = pd.cut(events['evt_freq'], bins, labels=group_names)

In [None]:
# Bubble chart of mean fault severity vs. frequency per event type.
# The column names are the ones actually present in `events` (evt_* from get_stats,
# evt_id from the groupby key, evt_cat from the binning cell).
events.iplot(kind='bubble', x='evt_fs_mean', y='evt_freq', size='evt_cnt',
             text='evt_id', categories='evt_cat',
             xTitle='Mean Fault Severity', yTitle='Type Frequency')

### Locations(Fault Severity)

In [None]:
# a stats handler for grouping
# NOTE(review): this re-defines get_stats from the event section with loc_* keys;
# re-running the events cell after this one would use this version.
def get_stats(group):
    """Summarise the fault_severity values of one location group.

    Returns a dict with the group's spread (max - min), row count, frequency
    relative to ``dsize``, mean and standard deviation, under ``loc_*`` keys.
    """
    n_rows = len(group)
    spread = group.max() - group.min()
    return {
        'loc_cnt': n_rows,
        'loc_freq': n_rows/float(dsize),
        'loc_fs_mean': group.mean(),
        'loc_fs_std': group.std(),
        'loc_fs_sprd': spread,
    }

# one row per location, most frequent first
locations = (
    df_all['fault_severity']
    .groupby(df_all['location'])
    .apply(get_stats)
    .unstack()
    .sort_values(by='loc_freq', ascending=False)
    .reset_index()
)

In [None]:
# stacked percentage histograms of every statistic except the location key
# (axis passed by keyword: the positional form is rejected by current pandas)
locations.drop('location', axis=1).iplot(kind='histogram', barmode='stack', histnorm='percent')

In [None]:
# categorize the locations based on freq
# three frequency bands: (0, 0.0013] rare, (0.0013, 0.004] sparse, (0.004, 1] common
bins = [0, 0.0013, 0.004, 1]
group_names = ['rare', 'sparse', 'common']
locations['loc_cat'] = pd.cut(locations['loc_freq'], bins, labels=group_names)

In [None]:
# Bubble chart of mean fault severity vs. frequency per location, coloured by the
# frequency band stored in 'loc_cat' (the column created by the binning cell).
locations.iplot(kind='bubble', x='loc_fs_mean', y='loc_freq',
                size='loc_cnt', text='location', categories='loc_cat',
                xTitle='Mean Fault Severity', yTitle='Location Frequency')

## Culling

In [None]:
# Remove every row that carries one of the "deterministic" features listed in
# drop_cols, then remove the feature column itself (it is all zero afterwards).
df_culled = df_all.copy()
for k, v in drop_cols.items():
    for c in v:
        # filter + drop produce a new frame each time; an in-place drop on the
        # filtered slice triggers SettingWithCopy problems
        df_culled = df_culled[df_culled[c] == 0].drop(c, axis=1)
print("Removed  {}  rows".format(df_all.shape[0] - df_culled.shape[0]))
print("Dropped  {}  cols".format(df_all.shape[1] - df_culled.shape[1]))

In [None]:
# persist the culled frame next to the other cached frames
hdf = pd.HDFStore(hdf_file)
try:
    hdf.put('df_culled', df_culled)
finally:
    # close the store even if the write fails
    hdf.close()

In [None]:
# For test rows that carry a "deterministic" feature, the fault severity is taken
# as known: map id -> severity key of drop_cols.
# Loop variables have dedicated names so they do not overwrite the `fs` stats frame,
# the `cols` list or the builtin `id`.
known_fs = {}
for severity, feature_cols in drop_cols.items():
    for col in feature_cols:
        is_hit = (df_all[col] > 0) & (df_all.is_train == False)
        for row_id in df_all.loc[is_hit, 'id']:
            # a row matching several features keeps the last severity assigned
            known_fs[row_id] = severity

# one-hot encode the known severities: one predict_<severity> column per value
df_known = pd.Series(known_fs, name='predict')
df_known.index.name = 'id'
df_known = pd.get_dummies(df_known, prefix='predict').reset_index()

In [None]:
# peek at the one-hot encoded known severities
df_known.head()

In [None]:
# spot check: print the severity recorded in known_fs for the first few ids,
# to compare by eye with the one-hot columns shown by df_known.head()
check = df_known['id'].head()
for a in check:
    print known_fs[a]

In [None]:
# persist the known-severity frame next to the other cached frames
hdf = pd.HDFStore(hdf_file)
try:
    hdf.put('df_known', df_known)
finally:
    # close the store even if the write fails
    hdf.close()

**NB: in the end, the shapes of culled+known = df_all**

In [None]:
# every test row must be either "known" or still present in the culled frame
n_known = df_known.shape[0]
n_culled_test = df_culled[(df_culled.is_train==False)].shape[0]
n_all_test = df_all[(df_all.is_train==False)].shape[0]
assert n_known + n_culled_test == n_all_test, "this is wrong"

The known fault severity values are now stored in df_known in the hdf5, so let's predict the ones we don't know. K.I.S.S. == random forest. 

## Merge

In [None]:
# NOTE(review): these paths are relative to the working directory, unlike the rest
# of the notebook which builds paths from `datadir` -- confirm which location is meant.
train_merge = pd.read_csv('data/train_merge.csv')
test_merge = pd.read_csv('data/test_merge.csv')

In [None]:
# Enrich the training rows with per-location and per-event statistics.
# location must be int64 so it can be matched against numloc
locations.location = locations.location.astype('int64')
# location stats, joined on numloc == location
a = pd.merge(train_merge, locations, how='left', left_on='numloc', right_on='location')
# event stats for every id, joined through evt_id
b = pd.merge(df_all[['id', 'evt_id']], events, how='left', on='evt_id')
# bring both together on id and drop the now redundant location keys
c = pd.merge(a, b, how='left', on='id').drop(['loc_nid', 'numloc'], axis=1)
train_merge = c.copy()

In [None]:
# Same enrichment for the test rows: location stats via numloc, event stats via evt_id.
a = pd.merge(test_merge, locations, how='left', left_on='numloc', right_on='location')
b = pd.merge(df_all[['id', 'evt_id']], events, how='left', on='evt_id')
c = pd.merge(a, b, how='left', on='id').drop(['loc_nid', 'numloc'], axis=1)
test_merge = c.copy()

In [None]:
# one hot encode
# NOTE(review): train and test are encoded separately; if a category is absent from
# one of them the two frames may end up with different dummy columns -- confirm.
train_merge = pd.get_dummies(train_merge, columns=['loc_cat', 'evt_cat'])
test_merge = pd.get_dummies(test_merge, columns=['loc_cat', 'evt_cat'])

In [None]:
# Scale `location` to [0, 1].
# The scaler is fitted on the training rows only and then applied to both frames, so
# a given location maps to the same scaled value in train and test; fitting a
# separate scaler per frame would give each frame its own min/max.
from sklearn.preprocessing import MinMaxScaler

location_scaler = MinMaxScaler().fit(train_merge[['location']].astype(float))
# transform() expects and returns a 2-D array; ravel() turns it back into one column
train_merge['location'] = location_scaler.transform(train_merge[['location']].astype(float)).ravel()
test_merge['location'] = location_scaler.transform(test_merge[['location']].astype(float)).ravel()

In [None]:
# write the enriched frames without the row index
# NOTE(review): relative output paths, like the train_merge/test_merge inputs above
train_merge.to_csv('data/train_master.csv', index=False)
test_merge.to_csv('data/test_master.csv', index=False)

# Scratch

In [21]:
# Scratch check of the per-row zero count: False counts as a zero, True and 1 do not.
junk = pd.DataFrame({'a':[1,1,True], 'b':[1,1,False], 'c':[0,0,0]})
# axis passed by keyword: the positional form drop('c', 1) is rejected by current pandas
junk['n0'] = (junk.drop('c', axis=1) == 0).astype(int).sum(axis=1)