# To start we will import a few of the packages we will need

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [None]:
from glob import glob # this will help us download the data in order to visualize it

In [None]:
#These functions will be vital in making the CNN
import tensorflow as tf 
from tensorflow import keras
from keras import models
from keras import layers

In [None]:
#For some image processing
import keras.preprocessing.image as kpi

In [None]:
#This will be important for processing the metadata
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# With our packages imported, we now need to get the data into the notebook

In [None]:
# Start by loading the training metadata table from the shared data folder
data_location = '/fs/ess/PAS2038/PHYSICS5680_OSU/student_data/armitage'
Meta_name = f'{data_location}/train.csv'
# The first line of the CSV supplies the column names
Meta = pd.read_csv(Meta_name, header=0)

In [None]:
# Then the image data: collect the path of every .jpg in the train folder
image_location = f'{data_location}/train/*.jpg'
images = glob(image_location)

In [None]:
# Add a column of random integers drawn from [0, 1000), one per row.
# NOTE(review): presumably a noise baseline for the correlation table — confirm.
Meta['Random'] = np.random.randint(0,1000,size = len(Meta))

In [None]:
# To test we want to consolidate these two into a single data frame.
# glob() returns paths in arbitrary order, so assigning `images` to the frame
# positionally can pair a row (and its Pawpularity) with the wrong picture.
# Instead each row looks up its own image by Id.
# Assumes every image file is named "<Id>.jpg" — the check below fails loudly
# if that does not hold, rather than silently mislabelling images.
import os

path_by_id = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in images
}
Meta['Path'] = Meta['Id'].astype(str).map(path_by_id)

rows_without_image = Meta['Path'].isna()
if rows_without_image.any():
    raise FileNotFoundError(
        f"{int(rows_without_image.sum())} metadata rows have no matching image "
        f"under {image_location}"
    )

# Show the first Id and its path so the pairing can be checked by eye
print(Meta['Id'][0])
Meta['Path'][0]

In [None]:
from sklearn.utils import shuffle
# Shuffle the rows with a fixed seed so the order is reproducible between runs.
# NOTE(review): the shuffled frame appears to keep the original index labels,
# so label-based lookups such as Meta_shuffled[col][i] follow the ORIGINAL row
# order, not the shuffled one — confirm before indexing by integer.
Meta_shuffled = shuffle(Meta, random_state=1)

In [None]:
# Display the first rows to check that the columns loaded as expected
Meta.head()

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds text columns (Id, Path); recent pandas versions raise
# when corr() meets non-numeric data, so restrict to numeric/boolean columns
# explicitly. This works the same on old and new pandas.
Meta.select_dtypes(include=['number', 'bool']).corr()

# With the data imported, let's visualize it a bit to make sure it is working

In [None]:
# Draw the first three globbed images next to each other in a 1x3 grid
for slot in (1, 2, 3):
    plt.subplot(1, 3, slot)
    show_img = plt.imread(images[slot - 1])
    plt.imshow(show_img)

# Excellent, now to resize the images

In [None]:
# As we can see, each of the images above has a different size, so every image
# is resized to a common 64x64 shape before it is fed to the network.
from PIL import Image

reshaped_images = []

# Iterate over the Path column itself so the images come out in the SHUFFLED
# row order. Looking paths up with Meta_shuffled['Path'][i] is label-based and
# returns them in the original order, which would misalign the images with
# the targets taken from Meta_shuffled later on.
for image_path in Meta_shuffled['Path']:
    # The context manager closes the file handle once the resized copy exists
    with Image.open(image_path) as pillow_image:
        # Force three channels so every array matches the CNN's (64, 64, 3) input
        reshaped_images.append(pillow_image.convert('RGB').resize((64, 64)))

In [None]:
# Convert each resized image to a numpy array and scale its pixel values by 1/256
image_array = [np.array(picture) / 256 for picture in reshaped_images]


# Perfect, looks like it was installed, so let's split the data into train and test sets. We also needed to resize these images to be able to work well with them

In [None]:
from sklearn.model_selection import train_test_split

# Target: the Pawpularity score. Features: the columns at positions 1..12.
Y_Meta = Meta_shuffled['Pawpularity']
X_Meta = Meta_shuffled.iloc[:, 1:13]

# Both splits use the same settings (ordered 80/20 cut, no reshuffling) so the
# metadata rows and the images are divided at the same position.
split_options = {'test_size': 0.2, 'shuffle': False}
X_train, X_test, Y_train, Y_test = train_test_split(
    X_Meta, Y_Meta.values, **split_options
)
image_train, image_test, iy_train, iy_test = train_test_split(
    image_array, Y_Meta.values, **split_options
)

In [None]:
# Double-check the split: show the first six training images, each titled
# with the target value stored at the same position
for slot in range(1, 7):
    plt.subplot(1, 6, slot)
    plt.imshow(image_train[slot - 1])
    plt.title(iy_train[slot - 1])

In [None]:
# Stack the per-image arrays of each split into a single numpy array
image_test, image_train = np.array(image_test), np.array(image_train)
print(type(image_test))


In [None]:
# Wrap every target value in its own one-element row so the targets become
# column-shaped arrays
y_test = np.array([[target] for target in Y_test])
y_train = np.array([[target] for target in Y_train])
print(y_test[0])
print(y_train.shape)

# Now we can start building our models

In [None]:
# Starting with the CNN: three convolutional stages followed by a small dense
# head that outputs one Pawpularity estimate per image.
CNN = models.Sequential()

# First convolutional layer; the input is a 64x64 image with 3 channels
CNN.add(layers.Conv2D(30, (5, 5), activation='relu', input_shape=(64, 64, 3)))

# Batch normalisation after the convolution
CNN.add(layers.BatchNormalization())

# To limit overfitting, drop 20% of the activations during training
CNN.add(layers.Dropout(0.2))

# Pool: halve the spatial resolution
CNN.add(layers.MaxPooling2D((2, 2)))

# Normalise again before the next convolution
CNN.add(layers.BatchNormalization())

# Second convolutional layer
CNN.add(layers.Conv2D(25, (5, 5), activation='relu'))

# Another 20% dropout
CNN.add(layers.Dropout(0.2))

# Batch normalisation
CNN.add(layers.BatchNormalization())

# Pool
CNN.add(layers.MaxPooling2D((2, 2)))

# Batch normalisation
CNN.add(layers.BatchNormalization())

# Third convolutional layer
CNN.add(layers.Conv2D(32, (3, 3), activation='relu'))

# Flatten and connect to a dense regression head - just like an FCN
CNN.add(layers.Flatten())
CNN.add(layers.Dense(64, activation='relu'))
CNN.add(layers.Dense(32, activation='relu'))
# Linear output for regression: a ReLU here has zero gradient whenever the
# pre-activation is negative, which can leave the network stuck predicting 0.
CNN.add(layers.Dense(1, activation='linear'))

# Compile the model.
# Mean squared error is the loss; RMSE is tracked because that is the metric
# for the contest.
CNN.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[keras.metrics.RootMeanSquaredError()],
)

# Stop once val_loss has not improved for 10 epochs, and roll back to the best
# epoch's weights so the model saved below is not the one from 10 epochs later.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    )
]

# summary() prints the table itself and returns None, so it is not wrapped in print()
CNN.summary()

# NOTE(review): the split test set doubles as the validation set here, so the
# early-stopping decision has seen the data later used for evaluation.
CNN_results = CNN.fit(
    image_train, iy_train,
    epochs=100,
    batch_size=256,
    callbacks=callbacks,  # Early stopping
    validation_data=(image_test, iy_test),
)

CNN.save('CNN_Pets')

CNN.summary()


In [None]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'

# One row per epoch, plus a 1-based iteration column for the x axis
df_cnn = pd.DataFrame(CNN_results.history)
df_cnn['iteration'] = df_cnn.index + 1

# RMSE curves for the training and validation data
fig = px.line(
    df_cnn,
    x='iteration',
    y=['root_mean_squared_error', 'val_root_mean_squared_error'],
    title='RMSE vs Iteration',
)

# Replace the long metric names with short labels in the legend and hover text
newnames = {'root_mean_squared_error':'RMSE', 'val_root_mean_squared_error': 'Val_RMSE'}


def _relabel(trace):
    """Swap a trace's metric name for its short label everywhere it is shown."""
    label = newnames[trace.name]
    return trace.update(
        name=label,
        legendgroup=label,
        hovertemplate=trace.hovertemplate.replace(trace.name, label),
    )


fig.for_each_trace(_relabel)
fig.show()


# Loss curves for the training and validation data
fig = px.line(df_cnn, x='iteration', y=['loss', 'val_loss'], title='Loss vs Iteration')
fig.show()



In [None]:
# Display the raw per-epoch values recorded by fit() (the data plotted from df_cnn)
CNN_results.history 


# This next cell will check the model against the given test data or our split test set.

In [None]:
# Load the test metadata table from the same data folder
Meta_name_test = f'{data_location}/test.csv'
Meta_test = pd.read_csv(Meta_name_test, header=0)

# Collect the path of every .jpg in the test folder
test_image_location = f'{data_location}/test/*.jpg'
test_images = glob(test_image_location)

In [None]:
# Draw the first three test images next to each other in a 1x3 grid
for slot in (1, 2, 3):
    plt.subplot(1, 3, slot)
    show_test_img = plt.imread(test_images[slot - 1])
    plt.imshow(show_test_img)

In [None]:
# Resize every test image to the 64x64 shape the network was trained on
reshaped_test_images = []

for test_image_path in test_images:
    # The context manager closes the file handle once the resized copy exists
    with Image.open(test_image_path) as pillow_test_image:
        # Force three channels so every array matches the CNN's (64, 64, 3) input
        reshaped_test_images.append(
            pillow_test_image.convert('RGB').resize((64, 64))
        )

In [None]:
# Convert each resized test image to a numpy array and scale its pixels by 1/256,
# the same scaling applied to the training images
test_image_array = [np.array(picture) / 256 for picture in reshaped_test_images]

In [None]:
# Stack the list of per-image arrays into a single numpy array
test_image_array = np.array(test_image_array)

In [None]:
# Run the trained model on the split test images and display the predictions.
# NOTE(review): test_image_array is prepared above but never passed to the
# model; if this cell was meant to score the provided test folder, the
# argument should presumably be test_image_array — confirm intent.
CNN_predict = CNN.predict(image_test)
CNN_predict

In [None]:
# Predictions for the split test set, kept for the comparison with the real scores.
# The name is misspelled ("perdict"), but that is harmless: it is a separate
# variable, so CNN_predict is not overwritten.
CNN_perdict = CNN.predict(image_test)
CNN_perdict

In [None]:
# Put the predictions in a data frame; its single column is labelled 0, so
# copy it into a column with a readable name for plotting
perdict = pd.DataFrame(CNN_perdict)
perdict['Pawpularity'] = perdict[0]

In [None]:
# Add the true targets of the split test set next to the predictions
perdict['Real Pawpularity'] = iy_test

In [None]:
# Overlay the predicted and the real score distributions in a single histogram
fig = px.histogram(
    perdict,
    x=['Pawpularity', 'Real Pawpularity'],
    barmode="overlay",
    title='Pawpularity distribution',
)
fig.show()