In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/dataset-4/train.csv
/kaggle/input/dataset-4/test.csv
/kaggle/input/dataset-4/dataset of intent.csv
/kaggle/input/atis-airlinetravelinformationsystem/atis_intents_train.csv
/kaggle/input/atis-airlinetravelinformationsystem/atis_intents.csv
/kaggle/input/atis-airlinetravelinformationsystem/atis_intents_test.csv


# **Read Data**

In [2]:
import pandas as pd
train_data= pd.read_csv('//kaggle/input/dataset-4/train.csv',
                       names= ["target", "text"])

test_data= pd.read_csv('/kaggle/input/dataset-4/test.csv',
                       names= ["target", "text"])

train_data

Unnamed: 0,target,text
0,classs,question
1,Code,Code example of All access controls must fail ...
2,Code,Code example of Screen scraping data harvest ?
3,Description,Elaborate Accessible non parsed dynamic scripts ?
4,Code,Code of Insecure transmission of session cookies
...,...,...
4777,Description,What can you tell me about Absolute session ti...
4778,Solution,How to get secured against Access control patt...
4779,Description,Elaborate Session cookies without the Secure f...
4780,Solution,How to mitigate Cryptographic modules should o...


# **Data Preprocessing**

**Count data based on label**

In [3]:
train_data.groupby("target").count()

Unnamed: 0_level_0,text
target,Unnamed: 1_level_1
Code,609
Description,2235
Solution,1829
classs,1
security_control,108


**Resample training data**

In [4]:
# Resample was done merely by copy data where target are atis_flight_time and atis_quantity. 
# More proper resampling method can be used if preferred
train_data= train_data.append(train_data.loc[train_data.target.isin(["atis_flight_time", "atis_quantity"]), :])

## **Target One Hot Encoding**

In [5]:
from sklearn.preprocessing import OneHotEncoder as OHE

y_encoder= OHE().fit(np.array(train_data.target).reshape(-1,1))

In [6]:
ytr_encoded= y_encoder.transform(np.array(train_data.target).reshape(-1,1)).toarray()
yts_encoded= y_encoder.transform(np.array(test_data.target).reshape(-1,1)).toarray()

## **Text Preprocessing With NLTK and Tensorflow**

In [7]:
import nltk

**Convert text to lowercase**

In [8]:
train_data["lower_text"]= train_data.text.map(lambda x: x.lower())
test_data["lower_text"]= test_data.text.map(lambda x: x.lower())

**Word Tokenize**

In [9]:
from nltk import word_tokenize

train_data["tokenized"]= train_data.lower_text.map(word_tokenize)
test_data["tokenized"]= test_data.lower_text.map(word_tokenize)

**Remove Stop Words**

In [10]:
from nltk.corpus import stopwords
from string import punctuation

def remove_stop(strings, stop_list):
    classed= [s for s in strings if s not in stop_list]
    return classed

stop= stopwords.words("english")
stop_punc= list(set(punctuation))+ stop

train_data["selected"]= train_data.tokenized.map(lambda df: remove_stop(df, stop_punc))
test_data["selected"]= test_data.tokenized.map(lambda df: remove_stop(df, stop_punc))

**Stemming**

In [11]:
from nltk.stem import PorterStemmer

def normalize(text):
    return " ".join(text)

stemmer= PorterStemmer()

train_data["stemmed"]= train_data.selected.map(lambda xs: [stemmer.stem(x) for x in xs])
train_data["normalized"]= train_data.stemmed.apply(normalize)

test_data["stemmed"]= test_data.selected.map(lambda xs: [stemmer.stem(x) for x in xs])
test_data["normalized"]= test_data.stemmed.apply(normalize)

**Tokenize with tensorflow**

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer= Tokenizer(num_words= 10000)
tokenizer.fit_on_texts(train_data.normalized)

tokenized_train= tokenizer.texts_to_sequences(train_data.normalized)
tokenized_test= tokenizer.texts_to_sequences(test_data.normalized)

In [13]:
tokenizer.word_index.keys().__len__()

438

**Pad Text**

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_padded= pad_sequences(tokenized_train, maxlen= 20, padding= "pre")
test_padded= pad_sequences(tokenized_test, maxlen= 20, padding= "pre")

In [15]:
train_padded.shape

(4782, 20)

**Create X Matrix (samples, steps, wordlist)**

In [16]:
#this function transform final processed text (columns padded) into 3D matrix (samples, steps, unique_words)
#matrix contents one hot encoded words. Encoding was done for each step and based on unique words

def transform_x(data, tokenizer):
    output_shape= [data.shape[0],
                  data.shape[1],
                  tokenizer.word_index.keys().__len__()]
    results= np.zeros(output_shape)
    
    for i in range(data.shape[0]):
        for ii in range(data.shape[1]):
            results[i, ii, data[i,ii]-1]= 1
    return results

xtr_transformed= transform_x(train_padded, tokenizer)
xts_transformed= transform_x(test_padded, tokenizer)

# **LSTM Modelling**

In [17]:
from tensorflow.keras.layers import Dense, LSTM, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy as CC
from tensorflow.keras.activations import relu, softmax
from tensorflow.keras.initializers import he_uniform, glorot_uniform
from tensorflow.keras.metrics import AUC
from tensorflow.keras import Model
from tensorflow.keras.regularizers import l2


class LSTMModel(object):
    
    def build_model(self, input_dim, output_shape, steps, dropout_rate, kernel_regularizer, bias_regularizer):
        input_layer= Input(shape= (steps, input_dim))
        
        #make lstm_layer
        lstm= LSTM(units= steps)(input_layer)
        dense_1= Dense(output_shape, kernel_initializer= he_uniform(),
                       bias_initializer= "zeros", 
                       kernel_regularizer= l2(l= kernel_regularizer),
                       bias_regularizer= l2(l= bias_regularizer))(lstm)
        x= BatchNormalization()(dense_1)
        x= relu(x)
        x= Dropout(rate= dropout_rate)(x)
        o= Dense(output_shape, kernel_initializer= glorot_uniform(),
                 bias_initializer= "zeros", 
                 kernel_regularizer= l2(l= kernel_regularizer), 
                 bias_regularizer= l2(l= bias_regularizer))(dense_1)
        o= BatchNormalization()(o)
        output= softmax(o, axis= 1)
        
        loss= CC()
        metrics= AUC()
        optimizer= Adam()
        self.model= Model(inputs= [input_layer], outputs= [output])
        self.model.compile(optimizer= optimizer, loss= loss, metrics= [metrics])
        
        
    def train(self, x, y, validation_split, epochs):
        self.model.fit(x, y, validation_split= validation_split, epochs= epochs)
        
    def predict(self, x):
        return self.model.predict(x)

**Build Model**

In [18]:
steps= xtr_transformed.shape[1]
dim= xtr_transformed.shape[2]
output_shape= ytr_encoded.shape[1]

model= LSTMModel()
model.build_model(input_dim= dim,
                  output_shape= output_shape,
                  steps= steps, 
                  dropout_rate= 0.5, 
                  bias_regularizer= 0.3, 
                  kernel_regularizer= 0.3)

In [19]:
model.train(xtr_transformed, ytr_encoded,
           0.2, 10)

Train on 3825 samples, validate on 957 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Evaluation**

Train

In [20]:
from sklearn.metrics import classification_report

prediction= y_encoder.inverse_transform(model.predict(xtr_transformed))
print(classification_report(train_data.target, prediction))

                  precision    recall  f1-score   support

            Code       1.00      1.00      1.00       609
     Description       0.99      0.99      0.99      2235
        Solution       0.97      1.00      0.99      1829
          classs       0.00      0.00      0.00         1
security_control       1.00      0.46      0.63       108

        accuracy                           0.99      4782
       macro avg       0.79      0.69      0.72      4782
    weighted avg       0.99      0.99      0.98      4782



  _warn_prf(average, modifier, msg_start, len(result))


Test

In [21]:
from sklearn.metrics import classification_report

prediction_test= y_encoder.inverse_transform(model.predict(xts_transformed))
print(classification_report(test_data.target, prediction_test))

                  precision    recall  f1-score   support

            Code       1.00      1.00      1.00       153
     Description       0.99      1.00      0.99       559
        Solution       0.97      1.00      0.99       458
          classs       0.00      0.00      0.00         1
security_control       1.00      0.48      0.65        27

        accuracy                           0.99      1198
       macro avg       0.79      0.70      0.73      1198
    weighted avg       0.99      0.99      0.98      1198



  _warn_prf(average, modifier, msg_start, len(result))
