In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf

In [116]:
# Read the labelled training split and the unlabelled test split from the Dataset folder.
train, test = pd.read_csv('Dataset/train.csv'), pd.read_csv('Dataset/test.csv')

In [117]:
# Peek at the first rows of the training frame (head() defaults to 5 rows).
print(train.head())

       pet_id           issue_date         listing_date  condition  \
0  ANSL_69903  2016-07-10 00:00:00  2016-09-21 16:25:00        2.0   
1  ANSL_66892  2013-11-21 00:00:00  2018-12-27 17:47:00        1.0   
2  ANSL_69750  2014-09-28 00:00:00  2016-10-19 08:24:00        NaN   
3  ANSL_71623  2016-12-31 00:00:00  2019-01-25 18:30:00        1.0   
4  ANSL_57969  2017-09-28 00:00:00  2017-11-19 09:38:00        2.0   

    color_type  length(m)  height(cm)  X1  X2  breed_category  pet_category  
0  Brown Tabby       0.80        7.78  13   9             0.0             1  
1        White       0.72       14.19  13   9             0.0             2  
2        Brown       0.15       40.90  15   4             2.0             4  
3        White       0.62       17.82   0   1             0.0             2  
4        Black       0.50       11.06  18   4             0.0             1  


In [118]:
# From here on both splits are handled as plain NumPy arrays and indexed by column position.
train, test = np.array(train), np.array(test)

In [119]:
# both train and test are missing a lot of data in the 'condition' column (index 3), so we
# fill in the missing values with an imputer.
#
# 'condition' holds category codes (0.0, 1.0, 2.0, ...), so the column mean
# (~0.8833899867488622) is not a valid value. Impute with the most frequent code instead.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# The arrays have object dtype (mixed strings/numbers), so hand the imputer an explicit
# float copy of the column.
train_condition = train[:, 3:4].astype(float)
test_condition = test[:, 3:4].astype(float)

# Fit on the training split only and reuse the learned value for the test split, so both
# splits are filled in identically (same convention as fitting a scaler on train only).
imputer.fit(train_condition)
train[:, 3:4] = imputer.transform(train_condition)
test[:, 3:4] = imputer.transform(test_condition)


# Sanity checks: column 4 (color_type) is still a raw string, and row 0 is printed in full.
print(type(train[0, 4]))
for row_idx in range(3):
    print(train[row_idx, 4])
print(train[0, :])  # row 2 is the one in the first 5 rows that was missing 'condition'

<class 'str'>
Brown Tabby
White
Brown
['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 0.8 7.78 13 9 0.0 1]


In [122]:
# Next step: encode categorical data. Most categorical columns already hold numeric codes,
# but 'color_type' (column 4) is still a string.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [4])],
    remainder='passthrough',
)
# sparse=False)
# The encoding itself is still disabled: fit_transform on the CT raised errors.
# NOTE(review): when re-enabling, the test split should presumably go through ct.transform
# (not fit_transform) so both splits end up with the same one-hot columns -- confirm.
#train = np.array(ct.fit_transform(train))
#test = np.array(ct.fit_transform(test))

# Show that column 4 is untouched (still raw strings) and print row 0 in full.
print(type(train[0, 4]))
for row_idx in range(3):
    print(train[row_idx, 4])
print(train[0, :])

# This block runs as-is; the errors only appear when trying to fit_transform the CT.

# TODO: might need to convert dates/times to more numeric values

<class 'str'>
Brown Tabby
White
Brown
['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 1.0300208994241908 -1.5107285279864824 1.1610133636188338
 1.2572791543898296 0.0 1]


In [123]:
# With the missing data filled in, standardize the numeric feature columns
# (standardization: zero mean, unit variance per column).
from sklearn.preprocessing import StandardScaler

# Columns standardized here: length[5], height[6], X1[7], X2[8]
# Open questions: condition[3]? and whether standardizing the dates would be helpful.
sc = StandardScaler()

print(train[0, :])
# Learn mean/std from the training split only, then apply the same scaling to both splits.
sc.fit(train[:, 5:9])
train[:, 5:9] = sc.transform(train[:, 5:9])
test[:, 5:9] = sc.transform(test[:, 5:9])
print(train[0, :])

['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 1.0300208994241908 -1.5107285279864824 1.1610133636188338
 1.2572791543898296 0.0 1]
['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 1.0300208994241906 -1.5107285279864828 1.161013363618834
 1.2572791543898296 0.0 1]


In [125]:
# Splitting the cleaned data into inputs and the two targets.
# pet_id[0], issue_date[1], listing_date[2] and color_type[4] are still raw strings, which
# cannot be converted to a numeric tensor for the network, so they are left out of X_train.
# Kept as features: condition[3], length[5], height[6] and the anonymous columns X1[7], X2[8].
# (Whether condition and the anonymous columns actually help is still an open question.)
FEATURE_COLS = [3, 5, 6, 7, 8]
# train has object dtype; cast the selected numeric columns to a real float array.
X_train = train[:, FEATURE_COLS].astype('float32')
y_trainB = train[:, 9]   # breed_category
y_trainP = train[:, 10]  # pet_category
print(y_trainB)
print(y_trainP) # this checks out

[0.0 0.0 2.0 ... 1.0 1.0 1.0]
[1 2 4 ... 1 2 2]


In [109]:
# The data should now be ready for model building.
# Plan: use two different models -- one to predict pet_category, and a second one that
# takes pet_category into account to predict breed_category.

In [110]:
# here are our options
# Logistic Regression -> both outputs have more than two labels, so plain binary logistic
#                        regression is not usable (a multinomial / one-vs-rest variant would be)
# KNN ->
# Support Vector Machine ->
# Naive Bayes -> 
# Decision Tree ->
# Random Forest ->
# ANN -> 

In [14]:
# model 1: predicts pet_category, because that result will have a direct influence on
# breed_category.
# Dense is just output = activation(dot(input, kernel) + bias). No explicit input layer is
# declared; it is handled automatically.
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=6, activation='relu'),
    # There are 4 categories, so the output layer has 4 neurons (one per class, one-hot style).
    # sigmoid gives the probability of a binary output; for more than two categories use softmax.
    tf.keras.layers.Dense(units=4, activation='softmax'),
])

['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 1.0300208994241908 -1.5107285279864824 13 9]


In [None]:
# compiling model
# categorical_crossentropy scores the softmax output against one-hot encoded targets.
ann.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [126]:
# training model
# NOTE: the compile cell has to be executed before this one; otherwise Keras raises
# "You must compile a model before training/testing".
#
# The model is compiled with categorical_crossentropy and ends in a 4-unit softmax, so it
# needs one-hot targets with one column per class. y_trainP holds raw class ids (e.g. 1, 2, 4),
# so map each distinct id to a contiguous index and one-hot encode that index.
# pet_classes[i] is the original id behind output neuron i (needed to decode predictions).
pet_classes, pet_class_idx = np.unique(y_trainP.astype(int), return_inverse=True)
y_trainP_onehot = np.eye(len(pet_classes), dtype='float32')[pet_class_idx]
ann.fit(X_train, y_trainP_onehot, batch_size=32, epochs=50)

Instructions for updating:
Colocations handled automatically by placer.


RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.