# ANNs, Keras

Iris dataset

In [None]:
import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()

In [None]:
type(iris)

In [None]:
# print out description of dataset
print(iris.DESCR)

In [None]:
# petal and sepal measures as features
iris.feature_names

In [None]:
# 3 target labels
iris.target_names

In [None]:
# iris.data is a numpy array with the four measurements in a row
X = iris.data
# iris.target is a sorted numpy array
y = iris.target

In [None]:
# 150 elements
y.shape

In [None]:
# next step: one-hot encoding 
# class 0 --> [1,0,0]
# class 1 --> [0,1,0]
# class 2 --> [0,0,1]

from keras.utils import to_categorical

# transforms each element in y to a 3-dim vector with '1' in the index position corresponding to the element (e.g. [..., 0, ...] -> [1,0,0])
y = to_categorical(y)

In [None]:
# now it's 150x3
y.shape

In [None]:
y

In [None]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

# data has been shuffled and randomised
y_train

In [None]:
# for neural networks it is good to scale or standardise data
from sklearn.preprocessing import MinMaxScaler

scaler_object = MinMaxScaler()
# rescales every feature to the range [0, 1]: (value - min) / (max - min), with min and max learned from the training data
scaler_object.fit(X_train)

In [None]:
scaled_X_train = scaler_object.transform(X_train)
scaled_X_test = scaler_object.transform(X_test)

In [None]:
# build NN with keras
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
# two hidden layers of 8 neurons each, activation function is rectified linear unit (ReLU);
# only the first layer declares the input size (the 4 iris features) - every later
# layer takes its input size from the output of the layer before it
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(8, activation='relu'))
# output layer: one neuron per class, softmax turns the outputs into class probabilities
model.add(Dense(3, activation='softmax'))
# categorical cross-entropy goes with the one-hot encoded labels
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
model.fit(scaled_X_train,y_train,epochs=150,verbose=2)

In [None]:
# model.predict(scaled_X_test) returns the class probabilities for every test sample;
# taking the argmax over the last axis turns them into predicted class indices
# (`Sequential.predict_classes` did this for softmax outputs, but it was removed in TensorFlow 2.6)
model.predict(scaled_X_test).argmax(axis=-1)

In [None]:
# argmax over the softmax output replaces `predict_classes` (removed in TensorFlow 2.6)
predictions = model.predict(scaled_X_test).argmax(axis=-1)

In [None]:
# this returns the actual class indices (argmax undoes the one-hot encoding)
y_test_classes = y_test.argmax(axis=1)

In [None]:
# now we have predictions and original y_test values in the same format
# we can compare them
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# each section is a heading followed by the metric computed on (true classes, predicted classes)
report_sections = (
    ("Confusion matrix:", confusion_matrix),
    ("Classification report:", classification_report),
    ("Accuracy:", accuracy_score),
)
for position, (heading, metric) in enumerate(report_sections):
    # sections are separated by print("\n"); nothing is printed before the first one
    if position:
        print("\n")
    print(heading)
    print(metric(y_test_classes, predictions))

In [None]:
# this saves the model, all weights, etc... 
# (overwrites)
model.save('mymodel.h5')

In [None]:
# ... so you can load the model again
from keras.models import load_model 

new_model = load_model('mymodel.h5')

# RNNs
LSTM and text generation

In [None]:
# read in text
def read_file(filepath, encoding='utf-8'):
    """Return the entire contents of a text file as one string.

    Args:
        filepath: path of the file to read.
        encoding: text encoding used to decode the file. It is passed to
            ``open`` explicitly so the result does not depend on the
            platform's default locale encoding.

    Returns:
        The decoded file contents.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
read_file('../../pythongyak/UPDATED_NLP_COURSE/06-Deep-Learning/moby_dick_four_chapters.txt')

In [None]:
import spacy

# we only want tokenisation now, so the heavier pipeline components are switched off;
# the model is loaded by its package name because the 'en' shortcut link
# no longer exists in spaCy v3
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger', 'ner'])
# in case of working with a large text, this might be handy:
# it sets the maximum number of characters accepted in a single call
nlp.max_length = 1198623

In [None]:
# clean text (get rid of some punctuation)
def separate_punctuation(doc_text):
    """Tokenise ``doc_text`` with the module-level ``nlp`` pipeline and return the kept tokens, lower-cased.

    A token is dropped when its text occurs as a substring of the filter
    string below (punctuation characters, spaces, tabs and newline runs).
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        # substring test on the filter string, not a per-character set lookup
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [None]:
moby_dick = read_file('../../pythongyak/UPDATED_NLP_COURSE/06-Deep-Learning/moby_dick_four_chapters.txt')
tokens = separate_punctuation(moby_dick)

In [None]:
tokens

In [None]:
len(tokens)

In [None]:
# pass in 25 words, and have the NN predict word #26:
# every training sequence holds the 25 context words plus the word to predict
train_len = 25 + 1

# slide a window of train_len tokens over the text, one token at a time;
# the upper bound is len(tokens) + 1 so that the final window - the one that
# ends on the very last token - is included as well
text_sequences = [
    tokens[end - train_len:end]
    for end in range(train_len, len(tokens) + 1)
]

In [None]:
# first sequence
' '.join(text_sequences[0])

In [None]:
# tokenise text and create sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [None]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
# number sequences replaced the word sequences, each nr is an id for a word
sequences[0]

In [None]:
# id : word pairs
tokenizer.index_word

In [None]:
# word counts
tokenizer.word_counts

In [None]:
# nr of tokens
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

In [None]:
# sequences is now just a list, we can cast it as a numpy array
import numpy as np

sequences = np.array(sequences)
sequences

In [None]:
# split data into
# X features -- first n words of sequence
# y labels -- n+1 word
from keras.utils import to_categorical

# values from all rows, grab everything but the last column (label column)
X = sequences[:,:-1]

In [None]:
# values from last column
y = sequences[:,-1]

In [None]:
y = to_categorical(y,num_classes=vocabulary_size+1)

In [None]:
# set seq_len to '25'
seq_len = X.shape[1]

In [None]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding

def create_model(vocabulary_size,seq_len):
    """Build, compile and summarise the next-word prediction network.

    Args:
        vocabulary_size: size of the embedding's input vocabulary and number
            of units in the softmax output layer.
        seq_len: length of the input sequences; also used as the embedding
            output dimension, and doubled for the LSTM unit count.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side effect.
    """
    # nr of LSTM neurons (a multiple of seq_len)
    lstm_units = seq_len * 2

    network = Sequential([
        # input dimension = vocabulary size, output dimension = seq_len, input length = seq_len
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # the first LSTM returns the full sequence so the second LSTM can consume it
        LSTM(lstm_units, return_sequences=True),
        LSTM(lstm_units),
        # intermediate fully connected layer with 50 units
        Dense(50, activation='relu'),
        # one output per vocabulary index; softmax turns them into probabilities
        Dense(vocabulary_size, activation='softmax'),
    ])

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    network.summary()

    return network

In [None]:
model = create_model(vocabulary_size+1,seq_len)

In [None]:
from pickle import dump,load

# train model with features, labels, how many seqs you pass in at a time, (should be at least 200), output report
model.fit(X,y,batch_size=128,epochs=5,verbose=1)

In [None]:
# save model and tokenizer
model.save('my_great_whale_model.h5')
# the context manager closes the file once the tokenizer has been pickled,
# instead of leaving an open handle behind
with open('my_little_tokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
# NOW, generate new text

In [None]:
from keras.preprocessing.sequence import pad_sequences

# seed_text: some text to start on
# num_gen_words: number of words to generate
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate ``num_gen_words`` words that continue ``seed_text``.

    Args:
        model: trained network; ``model.predict`` is called on one padded
            index sequence per generated word.
        tokenizer: fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seq_len: number of word indices fed to the model for each prediction.
        seed_text: some text to start on.
        num_gen_words: number of words to generate.

    Returns:
        The generated words joined by single spaces (the seed text itself
        is not part of the result).

    Raises:
        KeyError: if a predicted index has no entry in ``tokenizer.index_word``.
    """
    output_text = []
    # running text: the seed plus every word generated so far
    input_text = seed_text

    for _ in range(num_gen_words):
        # encode the running text as a sequence of word indices
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # pad a text that is too short; truncating='pre' drops the oldest
        # words of a text that is too long, so the model always sees the
        # most recent seq_len indices
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # index with the highest predicted probability; argmax over the
        # output replaces `predict_classes`, which was removed in TensorFlow 2.6
        pred_word_index = model.predict(pad_encoded, verbose=0).argmax(axis=-1)[0]
        # actual most likely next (predicted) word
        pred_word = tokenizer.index_word[pred_word_index]
        # extend the running text so the new word is part of the next input
        input_text += ' ' + pred_word
        output_text.append(pred_word)

    return ' '.join(output_text)

In [None]:
# generate seed sequence

# select one yourself
text_sequences[0]

In [None]:
# or select one randomly
import random

# fixed seed so the same sequence is picked on every run
random.seed(101)
# randint includes BOTH endpoints, so the upper bound has to be the last
# valid index; len(text_sequences) itself would raise an IndexError below
random_pick = random.randint(0, len(text_sequences) - 1)

random_seed_text = text_sequences[random_pick]
random_seed_text

In [None]:
seed_text = ' '.join(random_seed_text)
seed_text

In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

In [None]:
from keras.models import load_model

model = load_model('../../pythongyak/UPDATED_NLP_COURSE/06-Deep-Learning/epochBIG.h5')
# NOTE: unpickling can execute code stored in the file - only load tokenizer files from a trusted source
# the context manager closes the file once the tokenizer has been read
with open('../../pythongyak/UPDATED_NLP_COURSE/06-Deep-Learning/epochBIG','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)


In [None]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)