In [0]:
!pip install keras



In [0]:
!pip install sklearn



In [1]:
import csv
import gzip
###
import numpy as np
###
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler

Using TensorFlow backend.


In [2]:
#function to read data row-wise
def get_data(fn):
  """
  Read a CSV file (with a header line) into a list of row dictionaries.

  Input:
  fn - File location as a string
  Output:
  data - List with one dict per data row, keyed by the header names;
         every value is the raw string read from the file
  """
  # newline='' hands line-ending handling to the csv module, as its
  # documentation requires; otherwise universal-newline translation
  # rewrites '\r\n' sequences embedded in quoted fields.
  with open(fn, newline='') as f:
    return list(csv.DictReader(f))

In [25]:
def get_fields(data, fields):
  """
  Turn every row of `data` into a flat float32 feature vector.

  Input:
  data - List of rows; each row is indexable by the keys of `fields`
  fields - Dictionary mapping a field name to the converter applied to
           that field's raw value
  Output:
  List of numpy float32 arrays, one per row of `data`. Fields are visited
  in sorted key order; a converter that returns a list contributes all of
  its items, any other return value contributes a single entry.
  """
  rows = []
  for record in data:
    values = []
    for key, convert in sorted(fields.items()):
      result = convert(record[key])
      # Only genuine lists are flattened into the vector (exact type check).
      values += result if type(result) is list else [result]
    rows.append(np.array(values, dtype=np.float32))
  return rows




In [28]:
def preprocess_data(X, scaler=None):
  """
  Scale the feature matrix X.

  Input:
  X - Feature matrix accepted by the scaler's fit/transform methods
  scaler - Already-fitted scaler to reuse (e.g. the one fitted on the
           training data); when None a new StandardScaler is fitted on X
  Output:
  X - The transformed feature matrix
  scaler - The scaler that was applied, so it can be reused on other data
  """
  # Compare against None explicitly: a scaler object that happens to be
  # falsy (e.g. defines __len__) must not be silently replaced and refitted.
  if scaler is None:
    scaler = StandardScaler()
    scaler.fit(X)
  X = scaler.transform(X)
  return X, scaler

In [19]:
def shuffle(X, y, seed=1331):
  '''
  Apply one and the same seeded random permutation to X and y.

  Input:
  X - numpy array indexable by an integer index array
  y - numpy array with the same number of entries as X
  seed - seed for shuffling

  Output:
  Shuffled X and y; entries that were paired before remain paired.

  Note: this seeds numpy's global random generator as a side effect.
  '''
  np.random.seed(seed)
  order = np.arange(len(y))
  np.random.shuffle(order)
  return X[order], y[order]

In [20]:
def dating(x):
  '''
  Split a 'YYYY-MM-DD HH:MM:SS' timestamp string into numeric parts.

  Input:
  x - String containing date and time separated by a single space

  Output:
  List containing [year, month, day, minutes since midnight]
  (seconds are ignored)
  '''
  day_part, clock_part = x.split(' ')
  year, month, day = [int(piece) for piece in day_part.split('-')]
  hh_mm = clock_part.split(':')[:2]
  minutes = 60 * int(hh_mm[0]) + int(hh_mm[1])
  return [year, month, day, minutes]

In [23]:
# Global Variables

# Weekday names in calendar order; the position is used as the feature value.
days = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Police districts; each one becomes a 0/1 indicator column.
districts = [
    'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
    'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN',
]

# Target categories; the position in this list is the class index.
labels = [
    'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
    'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
    'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
    'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
    'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
    'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
    'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS',
]

# Converter per input column: raw CSV string -> number or list of numbers.
data_fields = {
    'Dates': dating,  # Parse 2015-05-13 23:53:00
    'DayOfWeek': lambda name: days.index(name) / float(len(days)),
    'PdDistrict': lambda name: [int(name == district) for district in districts],
    'Address': lambda text: [int('block' in text.lower())],
    'X': float,
    'Y': float,
}

# Converter for the target column: category name (commas stripped) -> class index.
label_fields = {'Category': lambda name: labels.index(name.replace(',', ''))}

In [29]:
# Loading data in the format ready for training the Neural Network
# Produces the names used by the later cells: X, y, Y, scaler,
# input_dim and output_dim.

print('Loading training data...')
raw_train = get_data('./input/train.csv')
print('Creating training data...')
# One float32 feature vector per row, built from the data_fields converters.
X = np.array(get_fields(raw_train, data_fields), dtype=np.float32)
print('Creating training labels...')
# get_fields returns a one-element array per row here (the class index),
# so y has one column rather than being a flat vector.
y = np.array(get_fields(raw_train, label_fields))
# The raw rows are no longer needed; release them.
del raw_train

# Shuffle features and labels together (fixed default seed), then
# standardise the features. The fitted scaler is kept so the test data can
# be transformed with the same scaler later.
X, y = shuffle(X, y)
X, scaler = preprocess_data(X)
# Class-matrix form of the integer labels, used as the network's target.
Y = np_utils.to_categorical(y)

input_dim = X.shape[1]
output_dim = len(labels)
print('Input dimensions: {}'.format(input_dim))

Loading training data...
Creating training data...
Creating training labels...
Input dimensions: 18


In [62]:
# Fixing hyperparameters
EPOCHS = 1        # training epochs per fit
BATCHES = 128     # mini-batch size passed to model.fit
HN = 39           # value passed as build_model's `hn` argument
RUN_FOLDS = False # set to True to run the cross-validation loop below
nb_folds = 4
# Built even when RUN_FOLDS is False; iterating it yields (train, valid)
# index arrays.
kfolds = KFold(len(y), nb_folds)
av_ll = 0.        # running sum of the per-fold log losses
f = 0             # fold counter used only for the printed header
# Running the model using kfold cross validation
# NOTE(review): build_model is defined in a cell further down in this
# notebook; with RUN_FOLDS = True that cell has to be executed before this
# one, otherwise the loop fails with a NameError on a fresh kernel.
if RUN_FOLDS:
  for train, valid in kfolds:
      print('---' * 20)
      print('Fold', f)
      print('---' * 20)
      f += 1
      X_train = X[train]
      X_valid = X[valid]
      Y_train = Y[train]
      Y_valid = Y[valid]
      # Integer labels of the validation rows, used for the log-loss below.
      y_valid = y[valid]

      print("Building model...")
      # A fresh model is built for every fold.
      model = build_model(input_dim, output_dim, HN)

      print("Training model...")

      model.fit(X_train, Y_train, nb_epoch=EPOCHS, batch_size=BATCHES, validation_data=(X_valid, Y_valid), verbose=0)
      valid_preds = model.predict_proba(X_valid)
      ll = metrics.log_loss(y_valid, valid_preds)
      print("LL:", ll)
      av_ll += ll
  # Mean log loss over all folds.
  print('Average LL:', av_ll / nb_folds)

In [63]:
def build_model(input_dim, output_dim, hn=32, dp=0.5, layers=1):
    '''
    Build and compile a fully-connected (dense) neural network classifier.

    Architecture: an input block (Dense -> PReLU -> Dropout), followed by
    `layers` hidden blocks (Dense -> PReLU -> BatchNormalization -> Dropout),
    followed by a softmax output layer with `output_dim` units.

    Input:
    input_dim - Number of input features
    output_dim - Number of output classes
    hn - Number of units in every hidden Dense layer
    dp - Dropout probability applied after each block
    layers - Number of hidden blocks added after the input block

    Output:
    model - Compiled Sequential model (categorical cross-entropy loss,
            adam optimizer)
    '''
    model = Sequential()

    # Only the first layer needs to be told the input shape; every later
    # layer takes its input size from the layer before it.
    model.add(Dense(hn, input_shape=(input_dim,), init='glorot_uniform'))
    model.add(PReLU())
    model.add(Dropout(dp))

    for _ in range(layers):
        model.add(Dense(hn, init='glorot_uniform'))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dp))

    # The output layer must have one unit per class. It was previously sized
    # with `hn`, which only worked when hn happened to equal output_dim.
    model.add(Dense(output_dim, init='glorot_uniform'))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [65]:
# Train a final model on the full training set and write the class
# probabilities predicted for the test set to a gzipped CSV file.
print("Generating submission...")

model = build_model(input_dim, output_dim, HN)
model.fit(X, Y, nb_epoch=EPOCHS, batch_size=BATCHES, verbose=0)

print('Loading testing data...')
raw_test = get_data('./input/test.csv')
print('Creating testing data...')
# Same feature converters as used for the training data.
X_test = np.array(get_fields(raw_test, data_fields), dtype=np.float32)
del raw_test
# Reuse the scaler fitted on the training data instead of fitting a new one.
X_test, _ = preprocess_data(X_test, scaler)

print('Predicting over testing data...')
preds = model.predict_proba(X_test, verbose=0)

# Text-mode gzip stream: a header row ('Id' plus one column per label),
# then one row per prediction. The Id written is the row's position in
# preds, not a value read from test.csv.
with gzip.open('sf-nn.csv.gz', 'wt') as outf:
  fo = csv.writer(outf, lineterminator='\n')
  fo.writerow(['Id'] + labels)

  for i, pred in enumerate(preds):
    fo.writerow([i] + list(pred))

Generating submission...




Loading testing data...
Creating testing data...
Predicting over testing data...
