In [None]:
# This file is for the sole purpose of training and/or
# recrating the model and is not to be used in the production
# process because it would reload and recreate all the modules
# all the time, hence, reducing performance

In [74]:
from PIL import Image
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, hamming_loss, r2_score
import os
import pandas
from matplotlib import pyplot, image
import numpy

In [75]:
current_dir = os.path.dirname(os.path.abspath(__name__))
score_file_path = os.path.join(current_dir, 'score_file2.csv')

In [76]:
data = pandas.read_csv(score_file_path)
columns = ['name', 'score', 'match']
df = pandas.DataFrame(data=data, columns=columns)

In [77]:
# Put the image path in the dataframe
def create_path(name):
    image_path = os.path.join(current_dir, 'media', 'data', 'valid', name)
    return image_path

df['path'] = df['name'].apply(create_path)
df[:1]

Unnamed: 0,name,score,match,path
0,fw1.jpg,3,1,c:\Users\Pende\Documents\myapps\bots\tinder_sw...


In [95]:
# Normalize images from range 0-255 to
# range 0-1 in order to facilitate the
# learning process by the neural network
test_image = Image.open(df['path'][1])
pixels = numpy.asarray(test_image)
# Confirm that the image range is 0-255
print(f'Initial values: {pixels.dtype}, Min: {pixels.min()}, Max: {pixels.max()}')

# Convert from intergers to floats
pixels = pixels.astype('float32')

# Normalize pixels to range 0, 1
pixels /= 255.0
print(f'After normarlization the min value is {pixels.min()} and the max {pixels.max()}')

# Once normalized we need to center the pixels
# around zero or the mean. This creates a good
# distribution around the mean
mean = pixels.mean()
pixels = pixels - mean
# Confirm the centering
print(f'Mean {pixels.mean()}, Min: {pixels.min()}, Max: {pixels.max()}')

Initial values: uint8, Min: 0, Max: 255
After normarlization the min value is 0.0 and the max 1.0
Mean 5.954041881750527e-08, Min: -0.4042842984199524, Max: 0.5957157015800476


In [59]:
without_name = df[['score', 'match']]
without_name.groupby(['match']).count()

Unnamed: 0_level_0,score
match,Unnamed: 1_level_1
0,36
1,64


In [60]:
without_name.groupby(['score']).count()

Unnamed: 0_level_0,match
score,Unnamed: 1_level_1
1,14
2,30
3,56


In [62]:
matched = df[df['match'] == 1]
not_matched = df[df['match'] == 0]

matched.head()

Unnamed: 0,name,score,match
0,fw1.jpg,3,1
5,fw13.jpg,3,1
6,fw14.jpg,2,1
7,fw15.jpg,3,1
8,fw16.jpg,3,1


In [56]:
X = df[['match']]
y = df['score']

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [41]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [43]:
observation = [[1]]
prediction = model.predict(observation)
print('For ... the prediction is %s' % prediction)

For ... the prediction is [2.79591837]


In [45]:
predictions = model.predict(X_train)
print(predictions[:5])

[1.80769231 2.79591837 2.79591837 2.79591837 1.80769231]


In [49]:
# accuracy = accuracy_score(X_train, predictions)
# print('Accuracy is %s' % accuracy)

# score = model.score(X, y)
# print('Model score is %s' % score)

r2 = r2_score(X_train, predictions)
print('R² score is %s' % r2)

# hamming = hamming_loss(X_train, predictions)
# print('Hamming loss is %s' % hamming)

R² score is -13.305476145294485


In [50]:
# Saving the model
import pickle

with open('cnn_model.sav', 'wb') as m:
    pickle.dump(model, m)