In [32]:
from tensorflow import keras
import tensorflow as tf

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import re
import csv
import tqdm.notebook as tq
import glob
import os

## Read Dataset

In [4]:
def read_file( file_name, n_classes = None ):
  """Read a labelled text file and return its texts and one-hot labels.

  Each non-blank line is split on ','. The first field is parsed as an
  integer class label; the remaining fields are joined with single spaces
  to form the text of that sample.

  Args:
    file_name: path of the file to read.
    n_classes: width of the one-hot matrix. When None (the default) it is
      inferred as ``max(label) + 1`` from this file alone, so two files
      whose highest label differs produce matrices of different width.
      Pass the same value for every split to keep them aligned.

  Returns:
    A tuple ``(x, y)`` where ``x`` is a list of strings and ``y`` is a
    float array of shape ``(len(x), n_classes)`` with a single 1 per row.

  Raises:
    ValueError: if a label is not an integer, is negative, or does not
      fit in ``n_classes`` columns.
  """
  print('reading' , file_name )

  # Context manager so the handle is released even if parsing fails below.
  with open( file_name ,'r') as f:
    lines = f.readlines()

  x = []
  y = []
  for line in tq.tqdm( lines ):
    line = line.rstrip()
    if not line:
      # Blank lines (typically a trailing newline) carry no sample.
      continue
    fields = line.split(',')
    y.append( int( fields[0] ) )
    x.append( ' '.join( fields[1:] ) )

  # Convert the integer labels to a one-hot matrix.
  y = np.array( y, dtype = int )
  if n_classes is None:
    n_classes = int( y.max() ) + 1 if y.size else 0

  # Negative labels would silently wrap around to the last columns, and
  # labels >= n_classes would index out of bounds; reject both up front.
  if y.size and ( y.min() < 0 or y.max() >= n_classes ):
    raise ValueError(
      'labels in %s must lie in [0, %d); found range [%d, %d]'
      % ( file_name, n_classes, y.min(), y.max() ) )

  one_hot = np.zeros( ( y.size, n_classes ) )
  one_hot[ np.arange( y.size ), y ] = 1
  return x, one_hot

## Change Paths Accordingly

In [91]:
# Single place to point the notebook at a different data folder: the
# train/test paths below are derived from it rather than repeated.
current_dir = '/content/drive/My Drive/LSTM-deneme/'

train_path = os.path.join( current_dir, 'train.csv' )
test_path  = os.path.join( current_dir, 'test.csv' )

In [103]:
train_x, train_y = read_file( train_path )
test_x, test_y  = read_file( test_path  )

# Each label matrix may have been sized from the labels of its own file,
# so the two widths can differ when one split lacks the highest class.
# Zero-pad the narrower one so both match the model's output layer.
n_class = max( train_y.shape[1], test_y.shape[1] )

if train_y.shape[1] < n_class:
    train_y = np.pad( train_y, ( (0, 0), (0, n_class - train_y.shape[1]) ) )
if test_y.shape[1] < n_class:
    test_y = np.pad( test_y, ( (0, 0), (0, n_class - test_y.shape[1]) ) )

reading /content/drive/My Drive/LSTM-deneme/train.csv


HBox(children=(FloatProgress(value=0.0, max=574075.0), HTML(value='')))


reading /content/drive/My Drive/LSTM-deneme/test.csv


HBox(children=(FloatProgress(value=0.0, max=180309.0), HTML(value='')))




## Split Train and Validation Datasets

In [104]:
# Number of leading samples kept for training; the remainder is validation.
n_train = 500000

# This cell overwrites train_x / train_y, so running it a second time (or
# on a dataset with at most n_train rows) would leave nothing for
# validation. Fail loudly instead of training with an empty validation set.
if len( train_x ) <= n_train:
    raise ValueError(
        'cannot hold out a validation set: only %d samples available for '
        'n_train=%d (reload the data if this cell was already run)'
        % ( len( train_x ), n_train ) )

val_x = train_x[ n_train: ]
val_y = train_y[ n_train:, : ]

train_x = train_x[ : n_train ]
train_y = train_y[ : n_train, : ]


## Preprocessing

In [105]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap: the tokenizer only emits ids below this value
# (id 0 is reserved for padding).
max_num_words = 50

# Fixed number of tokens per sample after padding/truncation.
# NOTE(review): 50 keeps the input shape this notebook had before; tune it
# to the token-length distribution of the data. If the model sizes its
# Embedding vocabulary from train_x.shape[1], keep max_len >= max_num_words.
max_len = 50

# create the tokenizer
t = Tokenizer( num_words = max_num_words )

# fit the tokenizer on the training documents only
t.fit_on_texts( train_x )


def _encode_texts( texts ):
    """Turn raw texts into fixed-length, order-preserving sequences of token ids."""
    return pad_sequences( t.texts_to_sequences( texts ), maxlen = max_len )


# An Embedding + LSTM stack consumes ordered token ids. The previous
# texts_to_matrix(mode='count') call produced bag-of-words counts, which
# discard word order and are not valid embedding indices.
train_x = _encode_texts( train_x )
val_x   = _encode_texts( val_x )
test_x  = _encode_texts( test_x )

print( "shape of x: " , train_x.shape )
print( train_y.shape )

shape of x:  (500000, 50)
(500000, 10)


## Define Model


In [114]:
# Hyper-parameters
embed_dim = 128    # size of each token embedding vector
lstm_out = 64      # number of LSTM units
batch_size = 512
lr = 0.001         # Adam learning rate

model = Sequential()
# Embedding's first argument is the number of distinct ids it can look up,
# i.e. the tokenizer's vocabulary cap, not the width of the input; the
# input width is passed separately as input_length.
model.add( Embedding( max_num_words, embed_dim, input_length = train_x.shape[1] ) )
model.add( LSTM( lstm_out ) )
model.add( Dense( n_class ,activation='softmax') )

# Use the lr constant defined above instead of a second hard-coded value.
opt = keras.optimizers.Adam( learning_rate = lr )

model.compile( loss = 'categorical_crossentropy', optimizer=opt ,metrics = ['accuracy'] )

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 50, 128)           6400      
_________________________________________________________________
lstm_12 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_12 (Dense)             (None, 10)                650       
Total params: 56,458
Trainable params: 56,458
Non-trainable params: 0
_________________________________________________________________
None


## Define Checkpoint

In [115]:
experiments = glob.glob( os.path.join( current_dir, 'experiment*' ) )

# Number the new run one past the highest existing "experiment-<n>" suffix.
# Counting the directories instead would reuse an existing name (and
# overwrite its contents) as soon as an earlier experiment was deleted.
_suffix_matches = (
    re.search( r'experiment-(\d+)$', os.path.basename( os.path.normpath( path ) ) )
    for path in experiments
)
_used_numbers = [ int( match.group(1) ) for match in _suffix_matches if match ]

experiment_name = 'experiment-' + str( max( _used_numbers, default = 0 ) + 1 )
experiment_dir = os.path.join( current_dir, experiment_name )

os.makedirs( experiment_dir, exist_ok = True )

# Keep a plain-text copy of the architecture next to the checkpoints.
with open( os.path.join( experiment_dir, 'model.txt' ), 'w' ) as fh:
    model.summary( print_fn = lambda line: fh.write( line + '\n' ) )

# No monitor/save_best_only is set, so a checkpoint is written after every
# epoch; the file name embeds the epoch number and the training accuracy.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = os.path.join( experiment_dir, 'model.{epoch:02d}-acc:{accuracy:.3f}' )
)

## Train Model !

In [117]:
# Train for 10 epochs with per-epoch shuffling, checkpointing through the
# callback defined above and reporting metrics on the held-out validation split.
model.fit(
    train_x,
    train_y,
    validation_data = ( val_x, val_y ),
    epochs = 10,
    batch_size = batch_size,
    shuffle = True,
    verbose = 1,
    callbacks = [ model_checkpoint_callback ],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f352c3f0588>

## Evaluate Model

In [118]:
# Score the trained model on the test split; `results` holds what
# model.evaluate returns for the compiled loss and metrics.
results = model.evaluate(
    test_x,
    test_y,
    verbose = 1,
    batch_size = batch_size,
)

