A neural network consist of cnn layer (Kim,2014) and 4 fully connected layers.

Source: https://github.com/jojonki/cnn-for-sentence-classification





In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/sharif/DeepLearning/ipython(guide)')

In [3]:
import numpy as np
import codecs
import os
import random
import pandas
from keras import backend as K
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense, Lambda, Permute, Dropout
from keras.layers import Conv2D, MaxPooling1D
from keras.optimizers import SGD
import ast
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import gensim
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [4]:
limit_number = 750
data = pandas.read_csv('../Data/limited_to_'+str(limit_number)+'.csv',index_col=0,converters={'body': eval})
data = data.dropna().reset_index(drop=True)
X = data["body"].values.tolist()
y = pandas.read_csv('../Data/limited_to_'+str(limit_number)+'.csv')
labels = []
tag=[]
for item in y['tag']:
  labels += [i for i in re.sub('\"|\[|\]|\'| |=','',item.lower()).split(",") if i!='' and i!=' ']
  tag.append([i for i in re.sub('\"|\[|\]|\'| |=','',item.lower()).split(",") if i!='' and i!=' '])
labels = list(set(labels))
mlb = MultiLabelBinarizer()
Y=mlb.fit_transform(tag)

In [5]:
len(labels)

78

In [6]:
sentence_maxlen = max(map(len, (d for d in X)))
print('sentence maxlen', sentence_maxlen)

sentence maxlen 300


In [7]:
freq_dist = pandas.read_csv('../Data/FreqDist_sorted.csv',index_col=False)
vocab=[]
for item in freq_dist["word"]:
  try:
    word=re.sub(r"[\u200c-\u200f]","",item.replace(" ",""))
    vocab.append(word)
  except:
    pass
  
print(vocab[10])

زبان


In [8]:
vocab = sorted(vocab)
vocab_size = len(vocab)

In [9]:
print('vocab size', len(vocab))
w2i = {w:i for i,w in enumerate(vocab)}
# i2w = {i:w for i,w in enumerate(vocab)}
print(w2i["زبان"])

vocab size 225345
129280


In [10]:
def vectorize(data, sentence_maxlen, w2i):
    vec_data = []
    
    for d in data:
       
        vec = [w2i[w] for w in d if w in w2i]
        pad_len = max(0, sentence_maxlen - len(vec))
        vec += [0] * pad_len
        vec_data.append(vec)
        # print(d)
        
    vec_data = np.array(vec_data)
    
    return vec_data

vecX = vectorize(X, sentence_maxlen, w2i)
vecY=Y

In [11]:
X_train, X_test, y_train, y_test = train_test_split(vecX, vecY, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
print('train: ', X_train.shape , '\ntest: ', X_test.shape , '\nval: ', X_val.shape ,"\ny_tain:",y_train.shape )
# print(vecX[0])

train:  (12935, 300) 
test:  (4312, 300) 
val:  (4312, 300) 
y_tain: (12935, 78)


In [12]:
embd_dim = 300


# ***If the word2vec model is not generated before, we should run the next block.***

In [13]:
# embed_model = gensim.models.Word2Vec(X, size=embd_dim, window=5, min_count=5)
# embed_model.save('word2vec_model')

# ***Otherwise, we can run the next block.***

In [14]:
embed_model=gensim.models.Word2Vec.load('word2vec_model')

In [15]:
word2vec_embd_w = np.zeros((vocab_size, embd_dim))
for word, i in w2i.items():
  if word in embed_model.wv.vocab:
        embedding_vector =embed_model[word]
  
        # words not found in embedding index will be all-zeros.
        word2vec_embd_w[i] = embedding_vector

  after removing the cwd from sys.path.


In [16]:
from keras.layers import LSTM
def Net(vocab_size, embd_size, sentence_maxlen, glove_embd_w):
    sentence = Input((sentence_maxlen,), name='SentenceInput')
    
    # embedding
    embd_layer = Embedding(input_dim=vocab_size, 
                           output_dim=embd_size, 
                           weights=[word2vec_embd_w], 
                           trainable=False,
                           name='shared_embd')
    embd_sentence = embd_layer(sentence)
    embd_sentence = Permute((2,1))(embd_sentence)
    embd_sentence = Lambda(lambda x: K.expand_dims(x, -1))(embd_sentence)
    
    # cnn
    cnn = Conv2D(1, 
                 kernel_size=(5, sentence_maxlen),
                 activation='relu')(embd_sentence)
    cnn =  Lambda(lambda x: K.sum(x, axis=3))(cnn)
    cnn = MaxPooling1D(3)(cnn)
    cnn = Lambda(lambda x: K.sum(x, axis=2))(cnn)
    
    hidden1=Dense(400,activation="relu")(cnn)
    hidden2=Dense(300,activation="relu")(hidden1)
    hidden3=Dense(200,activation="relu")(hidden2)
    hidden4=Dense(150,activation="relu")(hidden3)
    out = Dense(len(labels), activation='sigmoid')(hidden4)
    
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model = Model(inputs=sentence, outputs=out, name='sentence_claccification')
    model.compile(optimizer=sgd, loss='binary_crossentropy',metrics=["accuracy", "binary_accuracy",
    "categorical_accuracy",])
    return model

model = Net(vocab_size, embd_dim, sentence_maxlen,word2vec_embd_w)
print(model.summary())


Model: "sentence_claccification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
SentenceInput (InputLayer)   [(None, 300)]             0         
_________________________________________________________________
shared_embd (Embedding)      (None, 300, 300)          67603500  
_________________________________________________________________
permute (Permute)            (None, 300, 300)          0         
_________________________________________________________________
lambda (Lambda)              (None, 300, 300, 1)       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 296, 1, 1)         1501      
_________________________________________________________________
lambda_1 (Lambda)            (None, 296, 1)            0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 98, 1) 

In [None]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5) # Model stop training after 5 epoch where validation loss didnt decrease
mc = ModelCheckpoint('best_cnn_4fc.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) #You save model weight at the epoch where validation loss is minimal
model.fit(X_train, y_train, batch_size=32,epochs=250,verbose=1,validation_data=(X_val, y_val),callbacks=[es,mc])#you can run for 1000 epoch btw model will stop after 5 epoch without better validation loss

Epoch 1/250

Epoch 00001: val_loss improved from inf to 0.11000, saving model to best_cnn_4fc.h5
Epoch 2/250

Epoch 00002: val_loss improved from 0.11000 to 0.10752, saving model to best_cnn_4fc.h5
Epoch 3/250

Epoch 00003: val_loss improved from 0.10752 to 0.10438, saving model to best_cnn_4fc.h5
Epoch 4/250

Epoch 00004: val_loss improved from 0.10438 to 0.09951, saving model to best_cnn_4fc.h5
Epoch 5/250

Epoch 00005: val_loss improved from 0.09951 to 0.09384, saving model to best_cnn_4fc.h5
Epoch 6/250

Epoch 00006: val_loss improved from 0.09384 to 0.08898, saving model to best_cnn_4fc.h5
Epoch 7/250

Epoch 00007: val_loss improved from 0.08898 to 0.08549, saving model to best_cnn_4fc.h5
Epoch 8/250

Epoch 00008: val_loss improved from 0.08549 to 0.08243, saving model to best_cnn_4fc.h5
Epoch 9/250

Epoch 00009: val_loss improved from 0.08243 to 0.07998, saving model to best_cnn_4fc.h5
Epoch 10/250

Epoch 00010: val_loss improved from 0.07998 to 0.07861, saving model to best_cnn_

<tensorflow.python.keras.callbacks.History at 0x7f0392988630>

# ***If the model is generated before:***

In [17]:
model = load_model('best_cnn_4fc_with_binary.h5')
# model.save('best_cnn_4fc_with_binary.h5')

In [18]:
pred=model.predict(X_test)
# For evaluation: If the probability > 0.5 you can say that it belong to the class.

In [19]:
print(pred[0])#example

[1.28691900e-05 1.35993958e-03 8.81868327e-05 3.08060407e-06
 1.21343424e-08 2.15166278e-06 5.58876200e-05 6.13067168e-05
 6.20292553e-07 2.01195478e-04 5.23518429e-06 7.95147004e-11
 6.75920386e-09 1.72682011e-08 9.49173398e-08 7.38362019e-07
 3.91908571e-07 1.81894898e-02 1.27643347e-04 3.57568264e-04
 2.41607428e-04 6.11127052e-06 1.60775457e-07 1.02964044e-03
 5.03659248e-04 1.49760067e-01 7.94296375e-06 1.69244927e-07
 3.16571482e-06 2.19076872e-04 1.47283077e-04 1.31869912e-02
 2.23560278e-06 1.95117846e-05 2.65032053e-04 2.10514244e-07
 4.25242570e-06 7.16469913e-07 6.15581084e-05 4.05709716e-06
 6.52370930e-01 3.81792379e-05 9.69902612e-05 2.00494384e-07
 2.13137269e-03 4.44282705e-06 4.90285075e-08 3.38226557e-04
 1.03130937e-03 1.24948610e-06 3.26928102e-08 1.00508532e-06
 5.09917736e-04 1.78045312e-08 2.34452413e-09 1.22595538e-05
 1.09079480e-03 2.48280941e-07 1.48249567e-02 1.37757834e-05
 7.76499510e-04 3.41236591e-03 2.26378441e-04 6.20567153e-05
 2.09715245e-06 3.802425

In [64]:
y_pred=[]
measure = 9 * (np.mean(pred[0]) + 0.5*np.sqrt(np.var(pred[0])))
for l in pred:
  temp=[]
  for value in l:
    if value>= measure:
      temp.append(1)
    else:
      temp.append(0)
  y_pred.append(temp)


In [65]:
measure

0.456586841493845

In [66]:
from sklearn.metrics import classification_report,accuracy_score

print("accuracy=",accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy= 0.3587662337662338
              precision    recall  f1-score   support

           0       0.53      0.50      0.51       103
           1       0.45      0.32      0.37       151
           2       0.50      0.45      0.47       141
           3       0.42      0.19      0.26        26
           4       0.96      0.91      0.93        97
           5       1.00      0.07      0.13        14
           6       0.49      0.41      0.44       137
           7       0.83      0.28      0.42        36
           8       0.30      0.14      0.19        22
           9       0.70      0.13      0.22       122
          10       0.63      0.38      0.47       152
          11       0.96      0.96      0.96        77
          12       0.99      0.98      0.98       129
          13       0.50      0.14      0.22        14
          14       0.83      0.28      0.42        18
          15       0.71      0.30      0.42       148
          16       0.72      0.60      0.65       12

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
