Keras neural network training for classifying questions (1) and answers (0)

In [None]:
#Reference: https://github.com/douglasdcm/chatbot_for_movies/tree/master/notebooks

In [None]:
!pip install tensorflow



In [None]:
import bz2
import itertools
import math
import os
import random
import re
import string
from os import listdir
from os.path import isfile, join

import gensim
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping # Changed import statement to use tensorflow.keras
from tensorflow.keras.layers import Dense, Activation, Dropout # Changed import statement to use tensorflow.keras
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential # Changed import statement to use tensorflow.keras
from tensorflow.keras.models import load_model # Changed import statement to use tensorflow.keras
from tensorflow.keras.optimizers import SGD # Changed import statement to use tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer # Changed import statement to use tensorflow.keras

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Widen the notebook cells to the full browser width.
# display/HTML are imported from IPython.display, the public location;
# IPython.core.display is a deprecated import path for them.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Mount Google Drive at /content/drive; the corpus file is read from that path.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:


# Load the tab-separated corpus. quoting=3 is csv.QUOTE_NONE, so quote
# characters in the text are read as ordinary characters, not field delimiters.
messages = pd.read_csv('/content/drive/MyDrive/movie-corpus/movie_lines_pre_processed_keras.tsv', delimiter="\t", quoting=3, encoding='ISO-8859-2')

In [None]:
# Assign the seven column names used by the rest of the notebook.
# NOTE(review): in the head() output, 'msg_pre_processed' shows original-case
# text with punctuation while 'msg_2' shows lowercased, punctuation-free text;
# confirm these two labels are not swapped, since 'msg_pre_processed' is the
# column the model is trained on.
messages.columns = ['msg_line', 'user_id', 'movie_id', 'msg', 'msg_pre_processed', 'msg_2', 'target']

In [None]:
messages.head()

Unnamed: 0,msg_line,user_id,movie_id,msg,msg_pre_processed,msg_2,target
0,L50,u3,m0,No.,You might wanna think about it,no,0
1,L51,u0,m0,You might wanna think about it,talk more about it,you might wanna think about it,0
2,L59,u9,m0,I missed you.,It says here you exposed yourself to a group o...,i missed you,0
3,L60,u8,m0,It says here you exposed yourself to a group o...,It was a bratwurst. I was eating lunch.,it say here you exposed yourself to a group of...,0
4,L61,u9,m0,It was a bratwurst. I was eating lunch.,With the teeth of your zipper?,it wa a bratwurst i wa eating lunch,0


Processing for deep learning

In [None]:
# Bounds of the sample drawn from `messages`; they are used as the slice [i:n]
# when building the train/test split.
i = 0
n = 20000

In [None]:
# Stratified split of the [i:n] sample: 33% goes to the test set, and both the
# text and the labels are cast to str.
X_train, X_test, y_train, y_test = train_test_split(
    messages['msg_pre_processed'][i:n].astype(str),
    messages['target'][i:n].astype(str),
    test_size=0.33,
    stratify=messages['target'][i:n],
    random_state=42,
)

In [None]:
#dataframe with sample X and y
df_small = pd.DataFrame()

In [None]:
df_small['msg_pre_processed'] = X_train

In [None]:

df_small['target'] = y_train

In [None]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
17249,can you explain it better?,0
10205,- and for my feelings in particular! For my pl...,0
16875,maybe...,0
16512,It's for my complexion.,1
965,maybe...,0


In [None]:
df_small.shape

(13400, 2)

In [None]:
tokenizer = Tokenizer()
# Build the word index from the training split only (X_test is not used here).
tokenizer.fit_on_texts(X_train)

In [None]:
import pickle

# Persist the fitted tokenizer to tokenizer.pickle in the working directory.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
X_train

Unnamed: 0,msg_pre_processed
17249,can you explain it better?
10205,- and for my feelings in particular! For my pl...
16875,maybe...
16512,It's for my complexion.
965,maybe...
...,...
6797,maybe...
11892,talk more about it
17980,He'll tell me when he gets home.
17017,Well probably.


In [None]:
y_train

Unnamed: 0,target
17249,0
10205,0
16875,0
16512,1
965,0
...,...
6797,0
11892,0
17980,1
17017,1


In [None]:
# Encode the training messages as a 2-D matrix with one row per message.
# NOTE(review): no `mode` is passed, so the Tokenizer default applies
# (documented as 'binary', i.e. word-presence flags) — confirm.
X_train_token = tokenizer.texts_to_matrix(X_train)

In [None]:
X_train_token

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
X_train_token.shape

(13400, 8749)

In [None]:
# Unpack the encoded matrix shape: num_rows is the number of training messages,
# num_cols is the feature width that the network's first layer takes as input.
num_rows, num_cols = X_train_token.shape

In [None]:
classes = set(df_small['target'])
classes

{'0', '1'}

In [None]:
df_small['target'] = df_small['target'].astype('int')

In [None]:
df_small.head()

Unnamed: 0,msg_pre_processed,target
17249,can you explain it better?,0
10205,- and for my feelings in particular! For my pl...,0
16875,maybe...,0
16512,It's for my complexion.,1
965,maybe...,0


Search for the best parameters

Training the model with fixed parameters

In [None]:
# Feed-forward binary classifier over the encoded message matrix:
#   Input(num_cols) -> Dense(20, relu) -> Dropout(0.5)
#                   -> Dense(10, relu) -> Dropout(0.5)
#                   -> Dense(1, sigmoid)
# The single sigmoid unit emits a score in (0, 1) for target == 1.
model = Sequential()
# An explicit Input layer replaces `input_dim=` on the first Dense layer; Keras
# warns against passing input_shape/input_dim to a layer in a Sequential model.
model.add(Input(shape=(num_cols,)))
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the model for a two-class problem: binary cross-entropy loss, with
# accuracy reported as the training metric.

# Optimizer: stochastic gradient descent with Nesterov momentum
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True) # learning rate 0.01, momentum 0.9
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [None]:
# Callbacks:
#  - EarlyStopping ends training once val_accuracy has not improved for 10 epochs.
#  - ModelCheckpoint writes a .keras file only when val_accuracy improves
#    (save_best_only=True), instead of one file per epoch. Files keep the
#    'model.<val_accuracy>-<epoch>.keras' naming, so among saved checkpoints a
#    later epoch always means a strictly better validation accuracy.
callbacks = [
    EarlyStopping(monitor='val_accuracy', patience=10, verbose=0),
    ModelCheckpoint(
        filepath='model.{val_accuracy:.2f}-{epoch:02d}.keras',
        monitor='val_accuracy',  # ModelCheckpoint defaults to 'val_loss'
        mode='max',
        save_best_only=True,
    ),
]

# Fit for up to 500 epochs; validation_split=0.3 holds out 30% of the training
# rows for the val_* metrics that the callbacks monitor.
hist = model.fit(X_train_token, df_small['target'], epochs=500, validation_split=0.3, batch_size=20, verbose=1, callbacks=callbacks)

print("model created")

Epoch 1/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6811 - loss: 0.6349 - val_accuracy: 0.6801 - val_loss: 0.6200
Epoch 2/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6762 - loss: 0.6238 - val_accuracy: 0.6801 - val_loss: 0.6161
Epoch 3/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6668 - loss: 0.6266 - val_accuracy: 0.6803 - val_loss: 0.6114
Epoch 4/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.6779 - loss: 0.6153 - val_accuracy: 0.6794 - val_loss: 0.6085
Epoch 5/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.6925 - loss: 0.6025 - val_accuracy: 0.6803 - val_loss: 0.6129
Epoch 6/500
[1m469/469[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6877 - loss: 0.6044 - val_accuracy: 0.6796 - val_loss: 0.6100
Epoch 7/500
[1m469/46

In [None]:
# Collect the checkpoint files written during training from the working directory.
# The whole file name must match 'model.<val_accuracy>-<epoch>.keras'; a plain
# substring test for 'model.' would also accept unrelated files such as
# 'chatbot_model.h5'.
mypath = os.getcwd()
checkpoint_name = re.compile(r'model\.\d+\.\d+-\d+\.keras')
onlyfiles = [
    f for f in listdir(mypath)
    if isfile(join(mypath, f)) and checkpoint_name.fullmatch(f)
]

In [None]:
# Order checkpoints best-first by the numeric (val_accuracy, epoch) embedded in
# the file name. A plain reverse string sort misorders epochs once they reach
# three digits (as strings, '99' sorts above '100').
def _checkpoint_rank(filename):
    """Return (val_accuracy, epoch) parsed from a checkpoint name; other names rank last."""
    parsed = re.fullmatch(r'model\.(\d+\.\d+)-(\d+)\.keras', filename)
    if parsed is None:
        return (-1.0, -1)
    return (float(parsed.group(1)), int(parsed.group(2)))

onlyfiles.sort(key=_checkpoint_rank, reverse=True)

In [None]:
onlyfiles

['model.0.69-18.keras',
 'model.0.69-13.keras',
 'model.0.68-23.keras',
 'model.0.68-22.keras',
 'model.0.68-21.keras',
 'model.0.68-20.keras',
 'model.0.68-19.keras',
 'model.0.68-17.keras',
 'model.0.68-16.keras',
 'model.0.68-15.keras',
 'model.0.68-14.keras',
 'model.0.68-12.keras',
 'model.0.68-11.keras',
 'model.0.68-10.keras',
 'model.0.68-09.keras',
 'model.0.68-08.keras',
 'model.0.68-07.keras',
 'model.0.68-06.keras',
 'model.0.68-05.keras',
 'model.0.68-04.keras',
 'model.0.68-03.keras',
 'model.0.68-02.keras',
 'model.0.68-01.keras']

In [None]:
print(onlyfiles[0])
model = load_model(onlyfiles[0])

model.0.69-18.keras


In [None]:
lemmatizer = WordNetLemmatizer()


def pre_processing_text(corpus):
    """Normalise one message into a space-separated string of lemmatized tokens.

    Steps, in order: strip HTML-like tags, drop every character that is not an
    ASCII letter, digit or whitespace, collapse runs of spaces, lowercase,
    tokenize, lemmatize each token, and discard tokens found in
    ``string.punctuation``. Stopwords are kept on purpose: the original author
    noted that removing them made the model worse.

    Parameters
    ----------
    corpus : object
        The message to clean; it is converted with ``str()`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    text = str(corpus)

    # Regex clean-up passes, applied in this exact order:
    # HTML tags, non-alphanumeric characters, duplicated spaces.
    for pattern, replacement in (
        (r'<.*?>', ''),
        (r'[^a-z A-Z 0-9 \s]', ''),
        (r' +', ' '),
    ):
        text = re.sub(pattern, replacement, text)

    # Lowercase, then tokenize.
    tokens = re.findall(r"\w+(?:'\w+)?|[^\w\s]", text.lower())

    # Lemmatize every token, then drop punctuation tokens.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    kept = [lemma for lemma in lemmas if lemma not in string.punctuation]

    return ' '.join(kept)

In [None]:
msg_raw = 'I heard you are a student. Is it right?'

In [None]:
msg = pre_processing_text(msg_raw)

In [None]:
p = tokenizer.texts_to_matrix([msg])

In [None]:
p.shape

(1, 8749)

In [None]:
res = model.predict(p)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step


In [None]:
res

array([[0.28862616]], dtype=float32)

In [None]:
# Encode the held-out test messages with the tokenizer fitted on the training split
X_test_token = tokenizer.texts_to_matrix(X_test)

In [None]:
y_pred = model.predict(X_test_token)

[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
# The test labels are strings at this point; convert them to ints for the metrics.
y_test_int = list(map(int, y_test))

In [None]:
f1_score(y_test_int, y_pred.round())

0.32350201800683015

In [None]:
# Accuracy, precision, recall and F1 of the rounded predictions on the test split.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Round the sigmoid scores to hard labels once and reuse them for every metric.
y_pred_label = y_pred.round()

accuracy = accuracy_score(y_test_int, y_pred_label)
precision = precision_score(y_test_int, y_pred_label)
recall = recall_score(y_test_int, y_pred_label)
f1 = f1_score(y_test_int, y_pred_label)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.6698484848484848
Precision: 0.48107109879963067
Recall: 0.24368568755846587
F1-score: 0.32350201800683015
