In [None]:
from tensorflow import keras
import tensorflow as tf

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.models import load_model
from keras.layers import SimpleRNN

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

import re
import csv
import tqdm.notebook as tq
import glob
import os

## Read Dataset

In [None]:
def read_file( file_name, n_class = None ):
  """Load a labelled text file and return its texts and one-hot labels.

  Every non-blank line is expected to look like ``label,field,field,...``:
  the first comma-separated field is an integer class id and the remaining
  fields are joined with single spaces to form the text.

  Args:
    file_name: Path of the file to read.
    n_class: Optional number of columns of the one-hot matrix. When omitted
      it is inferred as ``max(label) + 1`` from this file alone, so two files
      that do not contain the same highest label produce matrices of
      different widths; pass the same value for every split to avoid that.

  Returns:
    A tuple ``(x, y)``: ``x`` is a list of strings and ``y`` is a float array
    of shape ``(len(x), n_class)`` with a single 1 in each row.

  Raises:
    ValueError: If the first field of a non-blank line is not an integer.
    IndexError: If a label is not smaller than an explicitly given ``n_class``.
  """
  print('reading' , file_name )

  # The context manager closes the handle even when parsing fails part-way.
  with open( file_name ,'r') as f:
    lines = f.readlines()

  x = []
  y = []
  for line in tq.tqdm( lines ):
    line = line.rstrip()
    if not line:
      # Blank lines (e.g. a trailing newline at the end of the file) hold no sample.
      continue
    fields = line.split(',')
    y.append( int( fields[0] ) )
    x.append( ' '.join( fields[1:] ) )

  # Convert the integer labels to one-hot rows.
  y = np.array( y, dtype = int )
  if n_class is None:
    # An empty file has no maximum label; fall back to zero columns.
    n_class = int( y.max() ) + 1 if y.size else 0
  one_hot = np.zeros( ( y.size, n_class ) )
  one_hot[ np.arange( y.size ), y ] = 1
  return x, one_hot

## Change Paths Accordingly

In [None]:
# Folder on Google Drive that holds the dataset; edit this line only, the file
# paths below are derived from it.
current_dir = '/content/drive/My Drive/LSTM-deneme/'
train_path = current_dir + 'train.csv'
test_path  = current_dir + 'test.csv'

# The data is read from Drive right after this cell, so Drive must already be
# mounted at this point on a fresh runtime. Mounting an already-mounted Drive
# only prints a notice, so the check below is safe to re-run.
if not os.path.isdir( current_dir ):
  try:
    from google.colab import drive
    drive.mount('/content/drive')
  except ImportError:
    # Outside Colab there is nothing to mount; the folder must exist locally.
    print('google.colab is not available; expecting', current_dir, 'to be reachable locally')

In [None]:
# Parse both splits into raw texts and one-hot label matrices.
train_x, train_y = read_file( file_name = train_path )
test_x, test_y = read_file( file_name = test_path )

# The label matrix has one column per class.
n_class = train_y.shape[-1]

reading /content/drive/My Drive/LSTM-deneme/train.csv


HBox(children=(FloatProgress(value=0.0, max=574075.0), HTML(value='')))


reading /content/drive/My Drive/LSTM-deneme/test.csv


HBox(children=(FloatProgress(value=0.0, max=180309.0), HTML(value='')))




## Split Train and Validation Datasets

In [None]:
# Mount Google Drive so that files under /content/drive become readable.
# NOTE(review): the dataset paths used by the earlier loading cell already point
# into /content/drive, so on a fresh runtime this cell has to be executed before
# that one — consider moving it to the top of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Number of leading samples kept for training; everything after them is held
# out as the validation set.
n_train = 500000

# Both right-hand sides are evaluated before any name is rebound, so the
# validation slices are still taken from the untruncated training data.
train_x, val_x = train_x[ : n_train ], train_x[ n_train : ]
train_y, val_y = train_y[ : n_train, : ], train_y[ n_train :, : ]

## Preprocessing

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Upper bound on the vocabulary size; also used as the Embedding input size.
max_num_words = 10000

# Build the vocabulary from the training texts only, so nothing from the
# validation or test texts leaks into it.
t = Tokenizer( num_words = max_num_words )
t.fit_on_texts( train_x )

# Replace every text by its list of word indices.
train_x = t.texts_to_sequences( train_x )
val_x   = t.texts_to_sequences( val_x )
test_x  = t.texts_to_sequences( test_x )

# The training set decides the sequence length. Validation and test data are
# padded (or truncated) to that same length; without an explicit maxlen each
# split would be padded to its own longest sequence and could end up with a
# width different from the one the model is built for.
train_x = pad_sequences( train_x )
max_len = train_x.shape[1]
val_x  = pad_sequences( val_x, maxlen = max_len )
test_x = pad_sequences( test_x, maxlen = max_len )

## Define Model


In [None]:
# Hyper-parameters.
embed_dim = 128    # size of each word embedding vector
lstm_out = 64      # number of recurrent units (the layer below is a SimpleRNN despite the name)
batch_size = 1024  # used by the training cell
lr = 0.01          # Adam learning rate

# Embedding -> single recurrent layer -> softmax over the classes.
model = Sequential()
model.add( Embedding( max_num_words, embed_dim, input_length = train_x.shape[1] ) )
model.add( SimpleRNN( lstm_out ) )
model.add( Dense( n_class ,activation='softmax') )

# Labels are one-hot encoded, hence categorical cross-entropy.
opt = keras.optimizers.Adam( learning_rate=lr )
model.compile( loss = 'categorical_crossentropy', optimizer=opt ,metrics = ['accuracy'] )

# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 34, 128)           1280000   
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 64)                12352     
_________________________________________________________________
dense_6 (Dense)              (None, 10)                650       
Total params: 1,293,002
Trainable params: 1,293,002
Non-trainable params: 0
_________________________________________________________________
None


## Train Model !

In [None]:
!rm -r rnn_best
checkpoint_filepath = 'rnn_best'

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

model.fit( train_x, train_y, shuffle = True ,epochs=10, batch_size = batch_size , verbose=1 ,callbacks=[model_checkpoint_callback], validation_data=( val_x, val_y ) )

Epoch 1/10
INFO:tensorflow:Assets written to: rnn_best/assets
Epoch 2/10
INFO:tensorflow:Assets written to: rnn_best/assets
Epoch 3/10
INFO:tensorflow:Assets written to: rnn_best/assets
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f344b909048>

In [None]:
# Replace the in-memory model by the checkpoint saved under "rnn_best". The
# training cell writes that checkpoint only when val_accuracy improves, so the
# evaluation below uses the best epoch rather than the last one.
model = keras.models.load_model("rnn_best")

## Evaluate Model

In [None]:
# Score the reloaded model on the test split. With the loss and the single
# 'accuracy' metric given to compile(), `results` should hold the test loss
# followed by the test accuracy.
results = model.evaluate(
    x = test_x,
    y = test_y,
    verbose = 1,
)

