In [196]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import tensorflow as tf

In [197]:
# Load the raw train and test splits from the local Dataset directory.
csv_paths = ('Dataset/train.csv', 'Dataset/test.csv')
train, test = (pd.read_csv(csv_path) for csv_path in csv_paths)

In [198]:
# Peek at the first five rows to check the column layout.
print(train.head(n=5))

       pet_id           issue_date         listing_date  condition  \
0  ANSL_69903  2016-07-10 00:00:00  2016-09-21 16:25:00        2.0   
1  ANSL_66892  2013-11-21 00:00:00  2018-12-27 17:47:00        1.0   
2  ANSL_69750  2014-09-28 00:00:00  2016-10-19 08:24:00        NaN   
3  ANSL_71623  2016-12-31 00:00:00  2019-01-25 18:30:00        1.0   
4  ANSL_57969  2017-09-28 00:00:00  2017-11-19 09:38:00        2.0   

    color_type  length(m)  height(cm)  X1  X2  breed_category  pet_category  
0  Brown Tabby       0.80        7.78  13   9             0.0             1  
1        White       0.72       14.19  13   9             0.0             2  
2        Brown       0.15       40.90  15   4             2.0             4  
3        White       0.62       17.82   0   1             0.0             2  
4        Black       0.50       11.06  18   4             0.0             1  


In [199]:
# Convert both DataFrames to plain NumPy arrays so that the following cells can
# address columns by position.
train, test = (np.array(frame) for frame in (train, test))

In [200]:
# Both train and test are missing a lot of data in the 'condition' column (index 3),
# so the gaps are filled in with an imputer.
#
# The imputer is fitted on the training split only and the learned statistic is then
# reused for the test split. Fitting a second time on test would fill the two splits
# with different values and let test-set statistics leak into preprocessing.

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
train[:,3:4] = imputer.fit_transform(train[:,3:4])
test[:,3:4] = imputer.transform(test[:,3:4])

# NOTE: 'condition' is coded as 0.0, 1.0, 2.0, etc., so the mean (about 0.88 on the
# training data) is not itself a valid category.
# TODO: consider strategy='most_frequent' if this column ends up being used as a feature.

'''
print(type(train[0,4]))
print(len(train[0,:]))
print(train[0,4])
print(train[1,4])
print(train[2,4])
print(train[0,:]) # the one in the first 5 missing 'condition[2]'
'''

"\nprint(type(train[0,4]))\nprint(len(train[0,:]))\nprint(train[0,4])\nprint(train[1,4])\nprint(train[2,4])\nprint(train[0,:]) # the one in the first 5 missing 'condition[2]'\n"

In [201]:
# With the missing data filled in, standardize the numeric feature columns
# (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

# Columns of the raw arrays that get standardized:
#   length[5], height[6], X1[7], X2[8]
# The scaler is fitted on train and the same fitted scaler is applied to test.
# 'condition'[3] and the id/date columns [0..2] are not scaled; they are dropped by
# the slices below.
# not sure if standardizing the dates would be helpful
numeric_cols = slice(5, 9)

print(train[0,:])
print('=======================')
train[:, numeric_cols] = sc.fit_transform(train[:, numeric_cols])
test[:, numeric_cols] = sc.transform(test[:, numeric_cols])

# Keep color_type[4] plus the four scaled columns; train additionally keeps its
# two target columns (breed_category[9], pet_category[10]).
train, test = train[:, 4:11], test[:, 4:9]
print(train[0,:])

['ANSL_69903' '2016-07-10 00:00:00' '2016-09-21 16:25:00' 2.0
 'Brown Tabby' 0.8 7.78 13 9 0.0 1]
['Brown Tabby' 1.0300208994241908 -1.5107285279864824 1.1610133636188338
 1.2572791543898296 0.0 1]


In [202]:
# The next step is to encode categorical data. Most categorical columns are already
# numeric, but 'color_type' (column 0 of the sliced arrays) is still a string.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#print(train[0,:])
# handle_unknown='ignore': a colour that appears only in the test split is encoded as
# an all-zero one-hot row instead of raising during transform.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [0])],
                       remainder='passthrough',
                       sparse_threshold=0)
train = np.array(ct.fit_transform(train))
# Reuse the encoder fitted on train. Re-fitting on test would derive the one-hot
# columns from the test colours alone, so column positions (and possibly the number
# of columns) would no longer line up with the features the model is trained on.
test = np.array(ct.transform(test))


print(len(train[0,:]))
print(train[0,:])
# Spot-check the non-one-hot tail of the first row (columns 56..61).
for col in range(56, 62):
    print(train[0, col])

#this block is correct, getting weird errors when I try to fit_transform the CT

# might need to convert dates/times to more numeric values

62
[0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 0.0 0.0 1.0300208994241908 -1.5107285279864824 1.1610133636188338
 1.2572791543898296 0.0 1]
1.0300208994241908
-1.5107285279864824
1.1610133636188338
1.2572791543898296
0.0
1


In [204]:
# Splitting the cleaned data into input/output.
# Per the printed row above, `train` has 62 columns here: the one-hot color_type
# columns first, then the four scaled numeric columns, then breed_category [60]
# and pet_category [61].
# don't need pet_id, issue_date, listing_date, condition?
# what about anonymous columns?

# `train` is an object-dtype array (it was built from mixed string/number columns),
# so the features are cast to a real float dtype before being handed to Keras.
X_train = train[:,0:60].astype('float32')
y_trainB = train[:,60]
y_trainP = train[:,61]
print(y_trainB)
print(y_trainP) # this checks out

# One encoder per target, so each fitted mapping (classes_) stays available for
# inverse_transform later. Re-fitting a single encoder on the second target would
# discard the mapping learned for the first.
le_breed = LabelEncoder()
le_pet = LabelEncoder()
le = le_pet  # backward-compatible alias: `le` still ends up fitted on pet_category
y_trainB = le_breed.fit_transform(y_trainB)
y_trainP = le_pet.fit_transform(y_trainP)
print(y_trainB)
print(y_trainP)

# Distinct encoded labels in order of first appearance (dict keys keep insertion order).
pets = list(dict.fromkeys(y_trainP))
breeds = list(dict.fromkeys(y_trainB))

print(pets)
print(breeds)

[0.0 0.0 2.0 ... 1.0 1.0 1.0]
[1 2 4 ... 1 2 2]
[0 0 2 ... 1 1 1]
[1 2 3 ... 1 2 2]
[1, 2, 3, 0]
[0, 2, 1]


In [173]:
# Should be ready to start building our model
# I'm thinking i'm going to use two different models, one to predict the pet_category, and one
# to take pet_category into account and predict breed_category

In [174]:
# here are our options
# Logistic Regression -> plain binary logistic regression doesn't fit (both outputs have more than two labels); would need the multinomial / one-vs-rest variant
# KNN ->
# Support Vector Machine ->
# Naive Bayes -> 
# Decision Tree ->
# Random Forest ->
# ANN -> 

In [205]:
# Model 1: predicts pet_category. It comes first because its result will have a
# direct influence on the breed_category prediction.
# The input layer is handled automatically, so only the Dense layers are listed.
# Dense is just output = activation(dot(input, kernel) + bias).
hidden_units = 6
annP = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=hidden_units, activation='relu'),
    tf.keras.layers.Dense(units=hidden_units, activation='relu'),
    # There are 4 pet categories (0, 1, 2, 3), hence 4 output neurons, one per class.
    # sigmoid gives the probability of a binary output; for more than two
    # categories softmax is used instead.
    tf.keras.layers.Dense(units=4, activation='softmax'),
])

In [206]:
# compiling model
# y_trainP holds integer class ids (the output of LabelEncoder), not one-hot rows.
# 'categorical_crossentropy' expects one-hot targets matching the 4 softmax outputs,
# so the sparse variant, which takes integer class ids directly, is used instead.
annP.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [207]:
# Train the pet_category model: 50 epochs over the training data, in batches of 32.
annP.fit(x=X_train, y=y_trainP, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f947fc20b38>

In [None]:
# so these results are ass (label encoding the output helped), to fix them im thinking:
# 1. check all numbers and indexes
# 2. try to fix the sparse matrix situation


In [None]:
# Disabled draft of the second model (breed_category), kept as a bare string so it
# does not execute.
# NOTE(review): before re-enabling, check three things in this draft:
#   - the 3-unit output layer uses 'relu'; a multi-class output probably wants
#     'softmax', as in annP;
#   - fit() is called with y_trainP, but this model is presumably meant to learn
#     y_trainB - confirm;
#   - 'categorical_crossentropy' expects one-hot targets, whereas the targets built
#     earlier are integer class ids - confirm which form is intended.
'''
annB = tf.keras.models.Sequential()
annB.add(tf.keras.layers.Dense(units=6, activation='relu'))
annB.add(tf.keras.layers.Dense(units=6, activation='relu'))
annB.add(tf.keras.layers.Dense(units=3, activation='relu')) #(breed = 0.0, 0.1, 0.2)
annB.compile(optimizer='adam', loss='categorical_crossentropy', metrics= ['accuracy'])
annB.fit(X_train, y_trainP, batch_size=32, epochs=50)
'''