In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
import seaborn as sns

import warnings

# Suppress all warnings raised from here on.
warnings.filterwarnings("ignore")

# Notebook display settings: at most 10 rows per frame, floats with one decimal.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)

In [None]:
# Divisor that brings raw vote counts into a small numeric range, and the
# rating cut-off above which a row is labelled as the positive class.
VOTES_SCALE = 1_000_000
RATING_THRESHOLD = 7.6

# The file is parsed as tab-separated text; long column headers get short names.
df = pd.read_csv('./../data/imdbtop1000/imdb_data.csv', sep='\t')
df = df.rename(columns={'User Votes': 'Votes',
                        'Imdb Rating': 'Rating',
                        'Gross(in Million Dollars)': 'Earnings',
                        'Runtime(Minutes)': 'Runtime'})

# Build the modelling frame as a new object with .assign instead of writing
# columns into a selection of `df` (the pattern behind pandas'
# SettingWithCopyWarning).
# Normalising the input feature to a small range avoids very large values in
# the calculations; the rating becomes a 0.0 / 1.0 label.
dataframe = df[['Votes', 'Rating']].assign(
    Votes=lambda frame: frame['Votes'] / VOTES_SCALE,
    Rating=lambda frame: (frame['Rating'] > RATING_THRESHOLD).astype(float),
)
dataframe.describe()

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed keeps the 85/15 split, and every evaluation that depends on it,
# identical across kernel restarts.
train, test = train_test_split(dataframe, test_size=0.15, random_state=42)

In [None]:
from tensorflow.keras import models, layers, optimizers, losses

In [None]:
# The only model input is the numeric "Votes" column.
votes = tf.feature_column.numeric_column("Votes")
feature_columns = [votes]

# Input layer built from the feature columns; it is passed to build_model.
feature_layer = layers.DenseFeatures(feature_columns)

In [None]:
def build_model(lr, feature_layer, metric):
    """Build and compile a single-unit sigmoid classifier.

    Args:
        lr: Learning rate handed to the SGD optimizer.
        feature_layer: Layer added first to the Sequential stack.
        metric: Metrics passed through to ``model.compile``.

    Returns:
        The compiled Sequential model.
    """
    model = models.Sequential()

    model.add(feature_layer)
    model.add(layers.Dense(units=1, input_shape=(1,), activation=tf.sigmoid))

    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
    model.compile(optimizer=optimizers.SGD(learning_rate=lr),
                  loss=losses.BinaryCrossentropy(),
                  metrics=metric)

    return model

In [None]:
def train_model(model, dataset, label_name, epochs, batch_size=None, shuffle=True):
    """Fit ``model`` on ``dataset`` using the ``label_name`` entry as the target.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        dataset: Mapping-like object whose ``items()`` yields (name, values) pairs.
        label_name: Name of the entry that holds the labels.
        epochs: Forwarded to ``model.fit``.
        batch_size: Forwarded to ``model.fit``.
        shuffle: Forwarded to ``model.fit``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    columns = {}
    for column_name, column_values in dataset.items():
        columns[column_name] = np.array(column_values)

    # Take the target out of the inputs so only feature columns are fed as x.
    target = np.array(columns.pop(label_name))

    return model.fit(
        x=columns,
        y=target,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=shuffle,
    )

In [None]:
# Hyperparameters for the baseline run.
learning_rate = 0.1
epochs = 300
batch_size = 100
label_name = "Rating"
classification_threshold = 0.5

METRIC = [tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                         threshold=classification_threshold)]

model = build_model(learning_rate, feature_layer, METRIC)

# Fit on the training split only: the held-out `test` rows are used for
# evaluation afterwards, so they must not be seen during training.
history = train_model(model=model, dataset=train, label_name=label_name,
                      epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
# Inspect the parameters the fitted model currently holds.
learned_weights = model.get_weights()
print(learned_weights)

In [None]:
# Keep the per-epoch table under its own name so `history` still refers to the
# object returned by training and this cell can be re-run without error.
history_df = pd.DataFrame(history.history)
LOSS = history_df["loss"]
ACCURACY = history_df["accuracy"]

In [None]:
# Training curves: loss on the left panel, accuracy on the right panel.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11, 5))

ax0.plot(LOSS, label='LOSS')
ax0.set_xlabel('epochs')
ax0.set_ylabel('loss')
# Each panel gets its own legend; a single plt.legend() call only labels the
# axes that were active last, so the loss label was never shown.
ax0.legend()

ax1.plot(ACCURACY, label='ACCURACY')
ax1.set_xlabel('epochs')
ax1.set_ylabel('accuracy')
ax1.legend()

plt.show()

### Model Evaluation

In [None]:
# Separate the held-out split into the label vector and the feature dict.
label = np.array(test[label_name])
features = {
    name: np.array(value)
    for name, value in test.items()
    if name != label_name
}

model.evaluate(x=features, y=label, batch_size=batch_size)

### Prediction

In [None]:
# Rebuild the feature dict and label vector from the held-out split.
features = {}
for name, value in test.items():
    features[name] = np.array(value)
label = np.array(features.pop(label_name))

# Model outputs for every held-out row.
pred = model.predict(x=features, batch_size=batch_size)

In [None]:
# Binarise the model outputs with the same cut-off the accuracy metric was
# configured with, instead of a separately hard-coded 0.5.
pred = (pred > classification_threshold).astype(float)

In [None]:
# Overlay the true labels and the model's predictions against the Votes feature.
fig, ax = plt.subplots()
votes_axis = features['Votes']
sns.scatterplot(x=votes_axis, y=label, label='original', alpha=0.5, ax=ax)
sns.scatterplot(x=votes_axis, y=pred.reshape(-1), label='predicted', alpha=0.5, ax=ax)
ax.legend()
plt.show()

### Experimenting with more evaluation metrics

In [None]:
# Hyperparameters for the run that also tracks precision and recall.
learning_rate = 0.1
epochs = 300
batch_size = 100
label_name = "Rating"
classification_threshold = 0.5

# All three metrics share the same decision threshold.
METRIC = [tf.keras.metrics.BinaryAccuracy(name='accuracy',
                                         threshold=classification_threshold),
          tf.keras.metrics.Precision(name='precision',
                                     thresholds=classification_threshold),
          tf.keras.metrics.Recall(name='recall',
                                  thresholds=classification_threshold)]

model = build_model(learning_rate, feature_layer, METRIC)

# Fit on the training split only so the held-out `test` rows stay unseen by
# the model.
history = train_model(model=model, dataset=train, label_name=label_name,
                      epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
# Keep the per-epoch table under its own name so `history` still refers to the
# object returned by training and this cell can be re-run without error.
history_df = pd.DataFrame(history.history)
LOSS = history_df["loss"]
ACCURACY = history_df["accuracy"]
PRECISION = history_df["precision"]
RECALL = history_df["recall"]

In [None]:
# Training curves: loss on the left panel, classification metrics on the right.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11, 5))

ax0.plot(LOSS, label='LOSS')
ax0.set_xlabel('epochs')
ax0.set_ylabel('loss')
# Each panel gets its own legend; a single plt.legend() call only labels the
# axes that were active last, so the loss label was never shown.
ax0.legend()

ax1.plot(ACCURACY, label='ACCURACY')
ax1.plot(PRECISION, label='PRECISION')
ax1.plot(RECALL, label='RECALL')
ax1.set_xlabel('epochs')
# The panel shows three different metrics, so the axis is not just "accuracy".
ax1.set_ylabel('metric value')
ax1.legend()

plt.show()

In [None]:
# Hyperparameters for the run that tracks AUC.
learning_rate = 0.1
epochs = 300
batch_size = 100
label_name = "Rating"

METRIC = [tf.keras.metrics.AUC(num_thresholds=100,
                               name='auc')]

model = build_model(learning_rate, feature_layer, METRIC)

# Fit on the training split only so the held-out `test` rows stay unseen by
# the model.
history = train_model(model=model, dataset=train, label_name=label_name,
                      epochs=epochs, batch_size=batch_size, shuffle=True)

In [None]:
# Keep the per-epoch table under its own name so `history` still refers to the
# object returned by training and this cell can be re-run without error.
history_df = pd.DataFrame(history.history)
LOSS = history_df["loss"]
AUC = history_df["auc"]

In [None]:
# Training curves: loss on the left panel, AUC on the right panel.
fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(11, 5))

ax0.plot(LOSS, label='LOSS')
ax0.set_xlabel('epochs')
ax0.set_ylabel('loss')
# Each panel gets its own legend; a single plt.legend() call only labels the
# axes that were active last, so the loss label was never shown.
ax0.legend()

ax1.plot(AUC, label='AUC')
ax1.set_xlabel('epochs')
# The plotted series is AUC, so label the axis accordingly.
ax1.set_ylabel('auc')
ax1.legend()

plt.show()