# POEMS AUTHOR CLASSIFIER BASED ON ARTIFICIAL NEURAL NETWORKS

## Data preprocessing phase

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
from __future__ import print_function
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load the tab-separated dataset (first row is the header). Malformed rows are
# skipped with a warning instead of aborting the load.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; its
# replacement is `on_bad_lines` ('warn' matches the old error_bad_lines=False
# behaviour with its default warning). Try the new keyword first and fall back
# to the old one so the cell runs on either side of that change.
try:
    df = pd.read_csv("/kaggle/input/dataset.csv", header=0, on_bad_lines='warn', delimiter='\t')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`.
    df = pd.read_csv("/kaggle/input/dataset.csv", header=0, error_bad_lines=False, delimiter='\t')
# Any results you write to the current directory are saved as output.

In [0]:
import collections
# Count how many rows each author has in the dataset.
count = collections.Counter(df['author'].values)

# Each heading literal already ends in '\n'; combined with sep='\n' this leaves
# one blank line between the heading and the value printed under it.
print('Most common authors:\n', count.most_common(10), sep='\n')
print('\nTotal number of authors:\n', len(count), sep='\n')
#print(len(df['author'].unique()))

In [0]:
import nltk
# Renumber the rows 0..n-1, discarding the old index.
df = df.reset_index(drop=True)

# Characters that clean_text turns into a space.
# Raw string: '\[', '\]' and '\|' are invalid escape sequences in a normal
# string literal and trigger a warning on recent Python versions.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is NOT a digit, lowercase letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
# English stop words from nltk (the stopwords corpus must be available locally).
STOPWORDS = set(nltk.corpus.stopwords.words('english'))

# len() of every raw stanza, computed once and reused for all three statistics.
# NOTE(review): this is a character count assuming the column holds strings.
stanza_lengths = df.stanzas.map(len)

MAX_SEQUENCE_LENGTH = stanza_lengths.max()
print("MAX_SEQUENCE_LENGTH = ", MAX_SEQUENCE_LENGTH)

MIN_SEQUENCE_LENGTH = stanza_lengths.min()
print("MIN_SEQUENCE_LENGTH = ", MIN_SEQUENCE_LENGTH)

MEAN_SEQUENCE_LENGTH = stanza_lengths.mean()
print("MEAN_SEQUENCE_LENGTH = ", MEAN_SEQUENCE_LENGTH)
def clean_text(text):
    """
        Normalise one piece of text for tokenisation.

        Steps, in order: lowercase the text, replace every character matched
        by REPLACE_BY_SPACE_RE with a space, delete every character matched
        by BAD_SYMBOLS_RE, then drop the words found in STOPWORDS.

        text: a string

        return: the cleaned string, remaining words joined by single spaces
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    # split() with no argument also collapses runs of whitespace.
    kept_words = [word for word in stripped.split() if word not in STOPWORDS]
    return ' '.join(kept_words)
# Normalise every stanza with clean_text.
df['stanzas'] = df['stanzas'].apply(clean_text)
# Remove every run of digits. `regex=True` must be explicit: since pandas 2.0
# `Series.str.replace` treats the pattern as a literal string by default, which
# would look for the text "\d+" and leave the digits in place. The raw string
# avoids the invalid '\d' escape in a normal literal.
df['stanzas'] = df['stanzas'].str.replace(r'\d+', '', regex=True)

In [0]:
import matplotlib.pyplot as plt
# One-column table holding the character length of every stanza.
frame = dict(len=df['stanzas'].astype(str).map(len))
result = pd.DataFrame(frame)

# Histogram of those lengths; only values inside `range` are binned.
hist = result.plot.hist(
    figsize=(20, 10),
    bins=30,
    range=[1, 2725],
    color='DarkGreen',
)
hist.set_title('Stanzas length frequency', fontsize=30)
hist.set_xlabel('# of chars in stanza', fontsize=20)
hist.set_ylabel('Frequency', fontsize=20)
hist.legend(prop={'size': 20})

In [0]:
# Turn cells equal to the string '? ?' into NaN, then discard every row that
# contains a missing value.
df = df.replace('? ?', np.nan)
df = df.dropna()

# Number of remaining rows per author.
value_counts = df['author'].value_counts()
print(value_counts)

In [0]:
# Authors represented by fewer than 500 rows.
to_remove = value_counts.loc[value_counts < 500].index
# Keep only the rows whose author is not in to_remove.
df = df.loc[~df['author'].isin(to_remove)]
df

## Neural network models

In [0]:
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.preprocessing import sequence
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D, LSTM, Conv1D, GlobalMaxPooling1D, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 40000
# Number of tokens kept per stanza: pad_sequences pads or truncates every
# sequence to this length. This replaces any MAX_SEQUENCE_LENGTH set earlier.
MAX_SEQUENCE_LENGTH = 200
# Size of each word-embedding vector.
EMBEDDING_DIM = 250

# Fit the vocabulary on the stanzas. Raw string: '\]' is an invalid escape in a
# normal literal; the filter characters themselves are unchanged.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['stanzas'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Stanzas -> integer sequences -> fixed-width matrix.
X = tokenizer.texts_to_sequences(df['stanzas'].values)
X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# One-hot encode the author labels. The dtype is explicit because pandas >= 2.0
# returns boolean dummies by default, while the models expect numeric targets.
Y = pd.get_dummies(df['author'], dtype='float32').values
print('Shape of label tensor:', Y.shape)

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.30, random_state = 42)
print("Shape of training set = ", X_train.shape,Y_train.shape)
print("Shape of test set = ", X_test.shape,Y_test.shape)

### 1st model: bidirectional LSTM

In [0]:
# First network: a Sequential stack that starts with a trainable embedding
# (vocabulary size MAX_NB_WORDS, vector size EMBEDDING_DIM) over the padded
# sequences in X.
model = Sequential()
model.add(
    Embedding(
        input_dim=MAX_NB_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=X.shape[1],
    )
)
#model.add(Dropout(0.2))

In [0]:
# Recurrent encoder: an LSTM with 64 units run in both directions.
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
# One output unit per label column in Y_train. Each stanza has exactly one
# author and the model is trained with categorical_crossentropy, so the output
# must be a probability distribution over authors: use softmax (as the plain
# LSTM model does) rather than independent sigmoids.
model.add(Dense(Y_train.shape[1], activation='softmax'))

In [0]:
# Training setup: Adam optimiser, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

epochs = 4
batch_size = 32

# The test split is also used as validation data during training.
# Alternative (disabled): add
#   callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
# to stop once val_loss stops improving.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, Y_test),
)

# Store the trained weights on disk.
model.save_weights("model1.h5")

In [0]:
# Put the per-epoch metrics recorded by fit() into a DataFrame.
hist_df = pd.DataFrame(data=history.history)

hist_json_file = 'history1.json'
hist_csv_file = 'history1.csv'

# Write the same table twice: once as JSON, once as CSV.
with open(hist_json_file, 'w') as f:
    hist_df.to_json(f)

with open(hist_csv_file, 'w') as f:
    hist_df.to_csv(f)

In [0]:
# Plot training & validation accuracy values
plt.figure()  # explicit new figure so the two plots never share axes
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# Save BEFORE show(): once a figure has been shown, pyplot may start a new,
# empty current figure, so saving afterwards can write a blank image.
plt.savefig('accuracy.png')
plt.show()

# Plot training & validation loss values
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('loss.png')
plt.show()

### 2nd model: standard LSTM

In [0]:
# Second network: embedding layer followed by 20% dropout.
model3 = Sequential()
model3.add(
    Embedding(
        input_dim=MAX_NB_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=X.shape[1],
    )
)
model3.add(Dropout(rate=0.2))

In [0]:
# Single LSTM layer (100 units) with dropout on both its inputs and its
# recurrent connections.
model3.add(LSTM(units=100, dropout=0.2, recurrent_dropout=0.2))
# Softmax output with one unit per label column in Y_train.
model3.add(Dense(units=Y_train.shape[1], activation='softmax'))

In [0]:
# Training setup: Adam optimiser, categorical cross-entropy loss, accuracy metric.
model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model3.summary()

epochs = 3
batch_size = 64

# The test split is also used as validation data during training.
# Alternative (disabled): add
#   callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
# to stop once val_loss stops improving.
history3 = model3.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, Y_test),
)

# Store the trained weights on disk.
model3.save_weights("model3.h5")

In [0]:
# Put the per-epoch metrics recorded by fit() into a DataFrame.
hist_df = pd.DataFrame(data=history3.history)

hist_json_file = 'history3.json'
hist_csv_file = 'history3.csv'

# Write the same table twice: once as JSON, once as CSV.
with open(hist_json_file, 'w') as f:
    hist_df.to_json(f)

with open(hist_csv_file, 'w') as f:
    hist_df.to_csv(f)

In [0]:
# Plot training & validation accuracy values
plt.figure()  # explicit new figure so the two plots never share axes
plt.plot(history3.history['accuracy'])
plt.plot(history3.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# Save BEFORE show(): once a figure has been shown, pyplot may start a new,
# empty current figure, so saving afterwards can write a blank image.
# The file name carries the model suffix so it does not overwrite the
# 'accuracy.png' written for the first model.
plt.savefig('accuracy3.png')
plt.show()

# Plot training & validation loss values
plt.figure()
plt.plot(history3.history['loss'])
plt.plot(history3.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# Suffixed for the same reason: avoid clobbering the first model's 'loss.png'.
plt.savefig('loss3.png')
plt.show()

### 3rd model: CNN

In [0]:
# Third network: embedding layer followed by 20% dropout.
model2 = Sequential()
model2.add(
    Embedding(
        input_dim=MAX_NB_WORDS,
        output_dim=EMBEDDING_DIM,
        input_length=X.shape[1],
    )
)
model2.add(Dropout(rate=0.2))

In [0]:
# Convolution hyper-parameters.
filters = 250       # number of convolution filters
kernel_size = 3     # each filter spans 3 consecutive positions
hidden_dims = 250   # width of the dense hidden layer

# 1-D convolution over the embedded sequence, then keep each filter's maximum
# activation across the whole sequence.
model2.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model2.add(GlobalMaxPooling1D())
# Dense hidden layer with dropout and ReLU.
model2.add(Dense(hidden_dims))
model2.add(Dropout(0.2))
model2.add(Activation('relu'))
# One output unit per label column in Y_test. The model is trained with
# categorical_crossentropy on one-hot labels, so the output must be a
# probability distribution over authors: softmax, not independent sigmoids.
model2.add(Dense(Y_test.shape[1]))
model2.add(Activation('softmax'))

In [0]:
# Training setup: Adam optimiser, categorical cross-entropy loss, accuracy metric.
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()
epochs = 3
batch_size = 32

# The test split is also used as validation data during training.
# NOTE(review): with patience=3 and only 3 epochs, this EarlyStopping callback
# cannot stop training early; raise `epochs` for it to have any effect.
history2 = model2.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, Y_test), callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
# Save THIS model's weights. The previous code called model.save_weights(...),
# which stored the first (bidirectional LSTM) model under the name model2.h5.
model2.save_weights("model2.h5")

In [0]:
# Put the per-epoch metrics recorded by fit() into a DataFrame.
hist_df = pd.DataFrame(data=history2.history)

hist_json_file = 'history2.json'
hist_csv_file = 'history2.csv'

# Write the same table twice: once as JSON, once as CSV.
with open(hist_json_file, 'w') as f:
    hist_df.to_json(f)

with open(hist_csv_file, 'w') as f:
    hist_df.to_csv(f)

In [0]:
# Plot training & validation accuracy values
plt.figure()  # explicit new figure so the two plots never share axes
plt.plot(history2.history['accuracy'])
plt.plot(history2.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# Save BEFORE show(): once a figure has been shown, pyplot may start a new,
# empty current figure, so saving afterwards can write a blank image.
plt.savefig('accuracy2.png')
plt.show()

# Plot training & validation loss values
plt.figure()
plt.plot(history2.history['loss'])
plt.plot(history2.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('loss2.png')
plt.show()