In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.chdir('/content/drive/MyDrive/sharif/DeepLearning/ipython(guide)')

In [5]:
import numpy as np
import codecs
import os
import random
import pandas
from keras import backend as K
from keras.models import Model
from keras.layers.embeddings import Embedding
from keras.layers import Input, Dense, Lambda, Permute, Dropout
from keras.layers import Conv2D, MaxPooling1D , Flatten , Softmax
from keras.optimizers import SGD
import ast
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import gensim
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [6]:
limit_number = 750
data = pandas.read_csv('../Data/limited_to_'+str(limit_number)+'.csv',index_col=0,converters={'body': eval})
data = data.dropna().reset_index(drop=True)
X = data["body"].values.tolist()
y = pandas.read_csv('../Data/limited_to_'+str(limit_number)+'.csv')
labels = []
tag=[]
for item in y['tag']:
  labels += [i for i in re.sub('\"|\[|\]|\'| |=','',item.lower()).split(",") if i!='' and i!=' ']
  tag.append([i for i in re.sub('\"|\[|\]|\'| |=','',item.lower()).split(",") if i!='' and i!=' '])
labels = list(set(labels))
mlb = MultiLabelBinarizer()
Y=mlb.fit_transform(tag)

In [7]:
len(labels)

78

In [8]:
sentence_maxlen = max(map(len, (d for d in X)))
print('sentence maxlen', sentence_maxlen)

sentence maxlen 300


In [9]:
freq_dist = pandas.read_csv('../Data/FreqDist_sorted.csv',index_col=False)
vocab=[]
for item in freq_dist["word"]:
  try:
    word=re.sub(r"[\u200c-\u200f]","",item.replace(" ",""))
    vocab.append(word)
  except:
    pass
  
print(vocab[10])

زبان


In [10]:
vocab = sorted(vocab)
vocab_size = len(vocab)

In [11]:
print('vocab size', len(vocab))
w2i = {w:i for i,w in enumerate(vocab)}
# i2w = {i:w for i,w in enumerate(vocab)}
print(w2i["زبان"])

vocab size 225345
129280


In [12]:
def vectorize(data, sentence_maxlen, w2i):
    vec_data = []
    
    for d in data:
       
        vec = [w2i[w] for w in d if w in w2i]
        pad_len = max(0, sentence_maxlen - len(vec))
        vec += [0] * pad_len
        vec_data.append(vec)
        # print(d)
        
    vec_data = np.array(vec_data)
    
    return vec_data

vecX = vectorize(X, sentence_maxlen, w2i)
vecY=Y

In [13]:
X_train, X_test, y_train, y_test = train_test_split(vecX, vecY, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
print('train: ', X_train.shape , '\ntest: ', X_test.shape , '\nval: ', X_val.shape ,"\ny_tain:",y_train.shape )
# print(vecX[0])

train:  (12935, 300) 
test:  (4312, 300) 
val:  (4312, 300) 
y_tain: (12935, 78)


In [14]:
embd_dim = 300


# ***If the word2vec model is not generated before, we should run the next block.***

In [15]:
# embed_model = gensim.models.Word2Vec(X, size=embd_dim, window=5, min_count=5)
# embed_model.save('word2vec_model')

# ***Otherwise, we can run the next block.***

In [16]:
embed_model=gensim.models.Word2Vec.load('word2vec_model')

In [17]:
word2vec_embd_w = np.zeros((vocab_size, embd_dim))
for word, i in w2i.items():
  if word in embed_model.wv.vocab:
        embedding_vector =embed_model[word]
  
        # words not found in embedding index will be all-zeros.
        word2vec_embd_w[i] = embedding_vector

  after removing the cwd from sys.path.


In [18]:
def Net(vocab_size, embd_size, sentence_maxlen, glove_embd_w):
    sentence = Input((sentence_maxlen,), name='SentenceInput')
    
    # embedding
    embd_layer = Embedding(input_dim=vocab_size, 
                           output_dim=embd_size, 
                           weights=[word2vec_embd_w], 
                           trainable=False,
                           name='shared_embd')
    embd_sentence = embd_layer(sentence)
    embd_sentence = Permute((2,1))(embd_sentence)
    embd_sentence = Lambda(lambda x: K.expand_dims(x, -1))(embd_sentence)
    
    # cnn

    cnn_1 = Conv2D(1,name='cnn_1', 
                 kernel_size=(5, sentence_maxlen),
                 activation='relu')(embd_sentence)

    cnn_2 =  Lambda(lambda x: K.sum(x, axis=3),name='cnn_2')(cnn_1)
    max_pool_1 = MaxPooling1D(3,name='max_pool_1')(cnn_2)
    # bn_1 = BatchNormalization(name='bn_1')(max_pool_1)
    cnn_3 = Lambda(lambda x: K.sum(x, axis=2),name='cnn_3')(max_pool_1)
    # bn_2 = BatchNormalization(name='bn_2')(cnn_3)

    flatten = Flatten(name='flatten')(cnn_3)
    dense_1 = Dense(400,activation="relu",name='dense_1')(flatten)
    dense_2 = Dense(300,activation="relu",name='dense_2')(dense_1)
    dense_3 = Dense(200,activation="relu",name='dense_3')(dense_2)
    dense_4 = Dense(len(labels),activation="sigmoid",name='dense_4')(dense_3)
    out = Softmax(name='Soft_max_out')(dense_4)
    

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model = Model(inputs=sentence, outputs=out, name='sentence_claccification')
    model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=["accuracy"]) 
    return model


model = Net(vocab_size, embd_dim, sentence_maxlen,word2vec_embd_w)
print(model.summary())


Model: "sentence_claccification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
SentenceInput (InputLayer)   [(None, 300)]             0         
_________________________________________________________________
shared_embd (Embedding)      (None, 300, 300)          67603500  
_________________________________________________________________
permute (Permute)            (None, 300, 300)          0         
_________________________________________________________________
lambda (Lambda)              (None, 300, 300, 1)       0         
_________________________________________________________________
cnn_1 (Conv2D)               (None, 296, 1, 1)         1501      
_________________________________________________________________
cnn_2 (Lambda)               (None, 296, 1)            0         
_________________________________________________________________
max_pool_1 (MaxPooling1D)    (None, 98, 1) 

In [19]:
# model.fit(X_train, y_train,
#             batch_size=32,
#             epochs=5,
#             validation_data=(X_val, y_val))

In [None]:
# model = Net(vocab_size, embd_dim, sentence_maxlen,word2vec_embd_w)

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50) # Model stop training after 50 epoch where validation loss didnt decrease
mc = ModelCheckpoint('best_cnn_4fc.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) #You save model weight at the epoch where validation loss is minimal
model.fit(X_train, y_train, batch_size=32,epochs=20,verbose=1,validation_data=(X_val, y_val),callbacks=[es,mc])#you can run for 1000 epoch btw model will stop after 50 epoch without better validation loss


Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.10815, saving model to best_cnn_4fc.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.10815 to 0.10787, saving model to best_cnn_4fc.h5
Epoch 3/20

Epoch 00003: val_loss improved from 0.10787 to 0.10765, saving model to best_cnn_4fc.h5
Epoch 4/20

Epoch 00004: val_loss improved from 0.10765 to 0.10743, saving model to best_cnn_4fc.h5
Epoch 5/20

Epoch 00005: val_loss improved from 0.10743 to 0.10724, saving model to best_cnn_4fc.h5
Epoch 6/20

Epoch 00006: val_loss improved from 0.10724 to 0.10706, saving model to best_cnn_4fc.h5
Epoch 7/20

Epoch 00007: val_loss improved from 0.10706 to 0.10693, saving model to best_cnn_4fc.h5
Epoch 8/20

Epoch 00008: val_loss improved from 0.10693 to 0.10677, saving model to best_cnn_4fc.h5
Epoch 9/20

Epoch 00009: val_loss improved from 0.10677 to 0.10662, saving model to best_cnn_4fc.h5
Epoch 10/20

Epoch 00010: val_loss improved from 0.10662 to 0.10654, saving model to best_cnn_4fc.h5
Epo

<tensorflow.python.keras.callbacks.History at 0x7ff6d04985f8>

In [20]:
# model.save('CNN_2_just_accuracy.h5')

# from keras.models import load_model
model = load_model('CNN_2_just_accuracy.h5')

Evaluation

In [21]:
pred=model.predict(X_test)
# For evaluation: If the probability > 0.5 you can say that it belong to the class.

In [22]:
print(pred[0])#example

[0.01066499 0.01066513 0.01066499 0.01066629 0.0106651  0.01066499
 0.01066499 0.01066507 0.01066499 0.01066519 0.01066499 0.01067633
 0.01066506 0.01066532 0.01066499 0.01066868 0.01066499 0.02899032
 0.01069469 0.01066737 0.0222855  0.01086349 0.01066499 0.01434
 0.010876   0.02647874 0.01066499 0.01068291 0.010665   0.02890162
 0.01066499 0.01614735 0.01066606 0.01066557 0.01066499 0.01066499
 0.01066499 0.01067968 0.010665   0.01066499 0.02898837 0.01070401
 0.01066499 0.01066504 0.02717279 0.01066499 0.01066499 0.0108489
 0.01069822 0.01067162 0.01066499 0.01066499 0.01066499 0.01066534
 0.0106772  0.01066499 0.01073705 0.01083923 0.02172851 0.01066914
 0.02899045 0.01066499 0.01074179 0.01070234 0.01068918 0.01066502
 0.01066499 0.01523297 0.01067473 0.01066499 0.01066504 0.02898942
 0.01712363 0.01066499 0.01066499 0.01075928 0.01080705 0.01066747]


In [37]:
y_pred=[]
for l in pred:
  temp=[]
  for value in l:
    if value>= np.mean(pred[0]) + .75*np.sqrt(np.var(pred[0])):
      temp.append(1)
    else:
      temp.append(0)
  y_pred.append(temp)

In [38]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.08      0.72      0.15       111
           1       0.11      0.82      0.20       153
           2       0.15      0.88      0.26       144
           3       0.00      0.00      0.00        17
           4       0.33      0.85      0.47        65
           5       0.00      0.00      0.00        13
           6       0.11      0.83      0.19       150
           7       0.15      0.24      0.18        34
           8       0.00      0.00      0.00        24
           9       0.06      0.70      0.11       102
          10       0.09      0.82      0.16       141
          11       0.26      1.00      0.42        82
          12       0.64      0.96      0.77       139
          13       0.00      0.00      0.00        21
          14       0.00      0.00      0.00        19
          15       0.12      0.73      0.20       169
          16       0.13      0.85      0.23       131
          17       0.10    

  _warn_prf(average, modifier, msg_start, len(result))
