In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.optimizers import SGD

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Load data

In [2]:
# Read the box tables and the unboxer tables, in this order.
# File names are relative to the current working directory.
test_boxes, train_boxes, unboxer_attributes, unboxer_keywords = (
    pd.read_csv(csv_name)
    for csv_name in (
        'tbl_test_boxes.csv',
        'tbl_train_boxes.csv',
        'tbl_unboxer_attributes.csv',
        'tbl_unboxer_keywords.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Vendor keyword table; file name is relative to the current working directory.
vendor_keywords = pd.read_csv(filepath_or_buffer='tbl_vendor_keywords.csv')

### Look at the data 

In [4]:
def check_for_nans_by_column(df):
    """Print the number of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        One line per column is printed, in the form ``<column>: <count> nans``.
    """
    # Summing the boolean NaN mask counts missing values in a single pass,
    # instead of building a filtered copy of the whole frame for every column.
    for col, n_nans in df.isna().sum().items():
        print('{}: {} nans'.format(col, n_nans))

In [5]:
# simple enough
#train_boxes

In [6]:
# Per-column missing-value counts for the training boxes.
check_for_nans_by_column(df=train_boxes)

unboxer_id: 0 nans
vendor_id: 0 nans
target_bobcat: 0 nans


In [7]:
# same
#test_boxes

In [8]:
# Per-column missing-value counts for the test boxes.
check_for_nans_by_column(df=test_boxes)

unboxer_id: 0 nans
vendor_id: 0 nans
target_bobcat: 116837 nans


In [9]:
# will need to decide how to handle nans
# beartrap variables are categorical - will need to one-hot-encode

# it seems that skimask features can be 0, 1, or -1
# I assume these are meant to mean null, positive, or negative
# so then it would make sense to replace nulls with 0s
# then I could one hot encode 

#unboxer_attributes

In [10]:
# Per-column missing-value counts for the unboxer attribute table.
check_for_nans_by_column(df=unboxer_attributes)

unboxer_id: 0 nans
feature_skimask1: 6004 nans
feature_skimask2: 5199 nans
feature_skimask3: 5090 nans
feature_skimask4: 5971 nans
feature_skimask5: 4770 nans
feature_skimask6: 6109 nans
feature_skimask7: 4786 nans
feature_skimask8: 6088 nans
feature_skimask9: 5761 nans
feature_skimask10: 4667 nans
feature_skimask11: 6239 nans
feature_skimask12: 5443 nans
feature_skimask13: 5622 nans
feature_skimask14: 5525 nans
feature_skimask15: 6207 nans
feature_skimask16: 5788 nans
feature_skimask17: 4805 nans
feature_skimask18: 5486 nans
feature_skimask19: 4828 nans
feature_skimask20: 4979 nans
feature_skimask21: 5575 nans
feature_skimask22: 5830 nans
feature_skimask23: 4957 nans
feature_skimask24: 6297 nans
feature_skimask25: 5706 nans
feature_skimask26: 5132 nans
feature_skimask27: 5058 nans
feature_skimask28: 6191 nans
feature_skimask29: 5305 nans
feature_skimask30: 5414 nans
feature_skimask31: 5866 nans
feature_skimask32: 5455 nans
feature_skimask33: 5607 nans
feature_skimask34: 5278 nans
feat

In [11]:
# categorical - simple enough
# need to switch enabled to 0s and 1s
#unboxer_keywords

In [12]:
# Per-column missing-value counts for the unboxer keyword table.
check_for_nans_by_column(df=unboxer_keywords)

unboxer_id: 0 nans
keyword_id: 0 nans
enabled: 0 nans


In [13]:
# again, need to deal with nans - how many rows are all nan?
# need to switch enabled to 0s and 1s
# vendor_keywords

In [14]:
# Per-column missing-value counts for the vendor keyword table.
# If every column reports the same count, the rows containing a NaN can
# presumably just be discarded.
check_for_nans_by_column(df=vendor_keywords)

vendor_id: 614822 nans
keyword_id: 614822 nans
enabled: 614822 nans


### Clean the data 
Let's move backwards on this one

In [None]:
# Drop rows with missing values, then map -1 flags (presumably "disabled") to 0.
# The replacement is limited to `enabled`: replacing on the whole frame would
# also rewrite any vendor_id / keyword_id that happens to equal -1.
vendor_keywords = (
    vendor_keywords
    .dropna()
    .assign(enabled=lambda frame: frame['enabled'].replace(-1, 0))
)

# Optional subsample for faster iteration (currently disabled).
# TODO: remove once the full pipeline runs fast enough.
#print(len(vendor_keywords))
#vendor_keywords = vendor_keywords.sample(frac=0.01, random_state=42)
#print(len(vendor_keywords))

# One-hot encode keyword_id: get_dummies expands the categorical column into
# one indicator column per keyword and leaves the other columns as they are.
vendor_keywords['keyword_id'] = vendor_keywords['keyword_id'].astype('category')
vendor_keywords = pd.get_dummies(vendor_keywords)

# Multiply every indicator by `enabled` so rows with enabled == 0 contribute
# nothing. The indicator columns are picked by name rather than by position,
# so this step does not depend on the column order of the CSV.
cols_to_multiply = [col for col in vendor_keywords.columns if col not in ('vendor_id', 'enabled')]
print('multiplying...')
vendor_keywords[cols_to_multiply] = vendor_keywords[cols_to_multiply].multiply(vendor_keywords['enabled'], axis='index')

In [None]:
# vendor_keywords

In [None]:
# Collapse to one row per vendor_id by summing every remaining column, then
# discard the summed `enabled` column, which is not used as a feature.
print('aggregating...')
vendor_keywords_agg = (
    vendor_keywords
    .groupby('vendor_id')
    .sum()
    .drop(columns='enabled')
)
vendor_keywords_agg

In [None]:
# Map -1 flags (presumably "disabled") to 0. The replacement is limited to
# `enabled`: replacing on the whole frame would also rewrite any
# unboxer_id / keyword_id that happens to equal -1.
unboxer_keywords = unboxer_keywords.assign(
    enabled=lambda frame: frame['enabled'].replace(-1, 0)
)

# Optional subsample for faster iteration (currently disabled).
# TODO: remove once the full pipeline runs fast enough.
#print(len(unboxer_keywords))
#unboxer_keywords = unboxer_keywords.sample(frac=0.5, random_state=42)
#print(len(unboxer_keywords))

# One-hot encode keyword_id: get_dummies expands the categorical column into
# one indicator column per keyword and leaves the other columns as they are.
unboxer_keywords['keyword_id'] = unboxer_keywords['keyword_id'].astype('category')
unboxer_keywords = pd.get_dummies(unboxer_keywords)

# Multiply every indicator by `enabled` so rows with enabled == 0 contribute
# nothing. The indicator columns are picked by name rather than by position,
# so this step does not depend on the column order of the CSV.
cols_to_multiply = [col for col in unboxer_keywords.columns if col not in ('unboxer_id', 'enabled')]
print('multiplying...')
unboxer_keywords[cols_to_multiply] = unboxer_keywords[cols_to_multiply].multiply(unboxer_keywords['enabled'], axis='index')

# Collapse to one row per unboxer_id by summing every remaining column, then
# discard the summed `enabled` column, which is not used as a feature.
print('aggregating...')
unboxer_keywords_agg = unboxer_keywords.groupby('unboxer_id').sum()
unboxer_keywords_agg = unboxer_keywords_agg.drop(columns='enabled')
unboxer_keywords_agg

In [None]:
### Missing attribute values become 0
unboxer_attributes = unboxer_attributes.fillna(value=0)

# now everything can be one-hot-encoded
# going to have so many columns... can definitely remove correlated/insignificant ones later if needed

# first we need to make all of the columns categorical
def make_all_cols_categorical(df):
    """Cast every column except ``unboxer_id`` to the pandas ``category`` dtype.

    The conversion is applied to ``df`` itself (the caller's frame is
    modified), and the same object is returned for convenience.
    """
    non_id_columns = [name for name in df.columns if name != 'unboxer_id']
    for name in non_id_columns:
        df[name] = df[name].astype('category')
    return df
# Turn the attribute columns into categoricals, then one-hot encode them in one step.
unboxer_attributes = pd.get_dummies(make_all_cols_categorical(unboxer_attributes))

unboxer_attributes

In [None]:
# test_boxes and train_boxes are fine for now

### Combine the tables to make one big dataset

In [None]:
# Work on a seeded 50% sample of the training boxes so things run quickly for now.
# Once a model exists, insignificant/correlated features can be removed and the
# whole dataset used instead.
train_boxes_subsampled = train_boxes.sample(random_state=42, frac=0.5)

In [None]:
#train_boxes_subsampled

In [None]:
# Left joins against one-row-per-key tables should never add rows, so the
# sample size is the ceiling to compare against after each merge.
print(
    'So I will expect to see maximum {} rows throughout this whole merging process'.format(
        len(train_boxes_subsampled)
    )
)

In [None]:
# Sizes of the two join inputs, followed by their product.
print(
    len(unboxer_keywords_agg),
    len(train_boxes_subsampled),
    len(unboxer_keywords_agg) * len(train_boxes_subsampled),
)

In [None]:
# Attach the per-unboxer keyword features to every sampled box.
# A left join keeps every box row; boxes whose unboxer_id has no match get NaN
# in the keyword columns.
train_full_data = train_boxes_subsampled.merge(
    unboxer_keywords_agg,
    how='left',
    on='unboxer_id',
)
# TODO: remove this later
# When unboxer_keywords is subsampled, some unboxers are not represented;
# their rows could be dropped for now with:
# train_full_data = train_full_data.dropna()
# train_full_data

In [None]:
# Attach the per-vendor keyword features (left join keeps every box row).
train_full_data = train_full_data.merge(
    vendor_keywords_agg,
    how='left',
    on='vendor_id',
)
#train_full_data

In [None]:
# Attach the one-hot encoded unboxer attributes (left join keeps every box row).
train_full_data = train_full_data.merge(
    unboxer_attributes,
    how='left',
    on='unboxer_id',
)
# train_full_data

### Convert dataframe to features and targets arrays, split train and test, deal with unbalanced classes

In [None]:
# The classes are unbalanced, and the model should train on an equal number of
# bobcats and not-bobcats.
#
# The held-out set is carved off BEFORE rebalancing. Upsampling first and
# splitting afterwards puts copies of the same minority rows on both sides of
# the split, so the held-out score would be measured on rows the model has
# already trained on.

# The id columns are only join keys, not features, so drop them first.
model_data = train_full_data.drop(columns=['unboxer_id', 'vendor_id'])

# Separate into train and test sets.
train, test = train_test_split(
    model_data,
    test_size=0.2,
    random_state=42,                        # reproducible split
    stratify=model_data['target_bobcat'],   # same bobcat rate in both splits
)

# Separate majority and minority classes of the TRAINING split only.
df_majority = train[train.target_bobcat == 0]
df_minority = train[train.target_bobcat == 1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,                  # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=42)               # reproducible results

# Combine majority class with upsampled minority class, then shuffle so the
# rows are not ordered by class. `test` keeps the original class distribution.
train = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42)

In [None]:
# Separate the target column from the feature columns and convert each piece
# to a NumPy array: x_* holds every column except target_bobcat, y_* holds
# target_bobcat.
x_train = train.loc[:, train.columns != 'target_bobcat'].values
y_train = train.loc[:, 'target_bobcat'].values
x_test = test.loc[:, test.columns != 'target_bobcat'].values
y_test = test.loc[:, 'target_bobcat'].values

In [None]:
# Shapes of the four arrays, in the order x_train, y_train, x_test, y_test.
print(*(array.shape for array in (x_train, y_train, x_test, y_test)))

In [None]:
# sanity check- no nans in input?
def check_any_nans(df):
    """Return a boolean Series, indexed by column, that is True where the column has a missing value."""
    missing_mask = df.isna()
    return missing_mask.any()

# Print the per-column NaN flags for the training split, then the test split.
for df in (train, test):
    print(check_any_nans(df))

### Define and train a model 
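If 7.5% of boxes contain a bobcat, a model that always predicts "no bobcat" already reaches about 92.5% accuracy on data with that natural class balance, while on class-balanced (resampled) data random guessing scores about 50%. Accuracy therefore has to be compared against whichever of these baselines matches the evaluation set, and precision/recall on the bobcat class says more about a rare class than accuracy alone.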

In [None]:
# Start with something very simple: a fully connected network with two hidden
# ReLU layers and a single sigmoid output unit for the 0/1 target.
model = Sequential([
    # First hidden layer; input_dim fixes the input width to the feature count.
    Dense(500, input_dim=x_train.shape[1], activation='relu', kernel_initializer='he_uniform'),
    Dense(200, activation='relu'),
    # Extras tried earlier and left disabled: Dropout(0.2), Dense(100, activation='relu')
    Dense(1, activation='sigmoid'),
])

In [None]:
# SGD with momentum; binary cross-entropy matches the single sigmoid output.
opt = SGD(momentum=0.9, lr=0.01)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [None]:
# Train for three epochs, scoring on the held-out split after each one; the
# returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_data=(x_test, y_test),
)
model.summary()

### Evaluate the model  