<a href="https://colab.research.google.com/github/AlexanderCardarasUCSC/Tabular-Playground-Series---Jun-2021/blob/main/Tabular_Playground_Series_Jun_2021_%5Bacardara%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Code entry for the **Tabular Playground Series - Jun 2021** kaggle competition.

https://www.kaggle.com/c/tabular-playground-series-jun-2021/overview



Inspiration for the code from the keras functional_api guide.

https://keras.io/guides/functional_api/.


In [1]:
# mount google drive to download dataset
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# create a folder to store dataset files
!mkdir /content/dataset

# copy dataset from google drive
!cp /content/gdrive/MyDrive/kaggle/train.csv /content/dataset
!cp /content/gdrive/MyDrive/kaggle/test.csv /content/dataset

mkdir: cannot create directory ‘/content/dataset’: File exists


In [1]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("/content/dataset/train.csv", header=0)
dataset = dataset.to_numpy()
print(dataset.shape)

testset = pd.read_csv("/content/dataset/test.csv", header=0)
testset = testset.to_numpy()
print(testset.shape)

(200000, 77)
(100000, 76)


In [108]:
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

NUMBER_OF_CLASSES = 9

def define_model():
  input_layer = Input(shape=(75,), name="first layer")

  output_layer = Dense(128, activation="relu")(input_layer)
  output_layer = Dense(64, activation="relu")(output_layer)
  output_layer = Dense(10, activation="softmax")(output_layer)
  
  model = Model(inputs=input_layer, outputs=output_layer, name="Simple Classification Model")

  loss = SparseCategoricalCrossentropy(from_logits=True)
  opt = Adam(learning_rate=0.0005, beta_1=0.5)
  model.compile(loss=loss, optimizer=opt, metrics=["accuracy"])
  return model

def string_to_categorical(labels):
  new_labels = []
  for label in labels:
    new_labels.append(int(label[0][6]))
  new_labels = np.asarray(new_labels)
  return to_categorical(new_labels).astype(np.int32)

def generate_training_samples(dataset, n_samples):
  rows = [np.random.randint(0,dataset.shape[0]) for _ in range(n_samples)]
  samples = dataset[rows]

  train_x = samples[:,1:-1]
  train_y = samples[:,-1:]

  train_y = string_to_categorical(train_y)
  return train_x, train_y

def train(dataset, model, n_epochs, batch_size):
  iterations_per_epoch = dataset.shape[0]
  batches_per_epoch = iterations_per_epoch//batch_size
  for epoch in range(n_epochs):
    for batch in range(batches_per_epoch):
        train_x, train_y = generate_training_samples(dataset, batch_size)

        _, error = model.train_on_batch(train_x, train_y)

        # print metrics every once in a while
        if batch % 100 == 0:
          print(">%d %d/%d, %.3f"%(epoch, batch, batches_per_epoch, error))



In [109]:
n_epochs = 100
batch_size = 1

model = define_model()
model.summary()
train(dataset, model, n_epochs, batch_size)

Model: "Simple Classification Model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
first layer (InputLayer)     [(None, 75)]              0         
_________________________________________________________________
dense_24 (Dense)             (None, 128)               9728      
_________________________________________________________________
dense_25 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_26 (Dense)             (None, 10)                650       
Total params: 18,634
Trainable params: 18,634
Non-trainable params: 0
_________________________________________________________________


ValueError: ignored

In [106]:
train_x, train_y = generate_training_samples(dataset, 10)

In [107]:
print(train_x)
print(train_y)


[[0 0 0 14 0 2 0 0 0 5 3 12 2 0 0 0 0 0 0 7 1 0 0 1 1 0 0 1 0 0 0 0 0 2 0
  0 0 0 0 0 0 1 1 1 2 2 1 0 0 0 2 0 0 0 13 0 0 0 0 0 0 0 3 0 0 1 0 0 0 0
  0 0 0 0 0]
 [0 6 0 1 0 2 0 1 2 0 0 0 7 1 4 71 0 0 2 48 6 1 0 0 0 2 1 0 1 20 0 0 0 18
  2 0 0 4 0 4 0 0 27 3 0 0 0 0 0 0 8 0 0 5 0 0 1 0 0 1 0 0 0 2 0 23 0 0 2
  1 0 0 0 2 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 2 0 0 0 0 0 0
  0 3 0 6 0 3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
  0 0 0]
 [0 0 0 1 0 2 1 0 0 1 0 0 2 0 2 0 0 0 2 0 0 0 0 2 2 0 0 1 1 0 1 0 0 1 0 0
  0 2 0 0 0 0 1 2 0 1 0 0 0 0 4 0 1 0 0 1 0 2 0 0 6 1 0 0 0 0 0 0 0 1 0 2
  0 1 0]
 [0 1 4 0 32 0 2 0 1 0 0 2 7 0 0 6 0 0 15 28 1 1 0 0 0 0 0 0 0 2 3 4 3 1
  0 1 0 1 0 0 0 4 0 9 0 0 2 0 1 0 4 0 0 1 0 1 10 1 4 0 2 0 0 0 0 2 0 7 0
  2 0 0 0 0 0]
 [0 0 0 0 0 2 0 0 0 0 6 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0
  0 0 0 2 0 0 0 0 1 0 1 0 0 0 0 0 0 0 3 0 0 0 0 0 0 1 7 0 0 0 0 4 1 0 0 0
  0 0 0]
 [0 2 13 3 0 0 0 1 0 0 0 0 1 1 2 0 1 0 5 0 0 

In [79]:
print(train_y[:,0])
test = train_y[:,0]

['Class_8' 'Class_6' 'Class_8' 'Class_2' 'Class_8' 'Class_9' 'Class_9'
 'Class_2' 'Class_6' 'Class_2']
