# CNN with word embeddings

### Importing prerequisite libraries

In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from imblearn.over_sampling import SMOTE
import numpy as np

### Loading datasets and dropping nulls

In [7]:
data  = pd.read_csv('data_raw.csv',sep=',',names=['Msg','Tag'])

In [8]:
data_x=data["Msg"]
data_y=data["Tag"]

In [9]:
vocab_size = 10000

In [10]:
data_x_n = data_x.to_numpy()

### Finding max sentence length

In [11]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [12]:
sen_len = max_sen_length(data_x_n)

### One hot encoding

In [13]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

### Padding seuqnces to the same length

In [14]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [53]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 9354
 6098 3888  619 8431 1096  619 8431 4473  619 7475 7341]


### Cnn model

In [15]:
embedding_vector_features = 100

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 34, 32)            320000    
_________________________________________________________________
conv1d (Conv1D)              (None, 34, 32)            3104      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 17, 32)            0         
_________________________________________________________________
flatten (Flatten)            (None, 544)               0         
_________________________________________________________________
dense (Dense)                (None, 250)               136250    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 753       
Total params: 460,107
Trainable params: 460,107
Non-trainable params: 0
__________________________________________________

### Sampling

In [31]:
print(Counter(data_y))
oversample = SMOTE()
x, y = oversample.fit_sample(embed_repr, data_y)
print(Counter(y))

Counter({1: 19190, 2: 4163, 0: 1430})
Counter({1: 19190, 2: 19190, 0: 19190})


In [18]:
#x, y = embed_repr, data_y

### Train-test split

In [32]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

### Model fitting

In [33]:
model.fit(x_train,pd.get_dummies(y_train), validation_data=(x_test,pd.get_dummies(y_test)),epochs=10,batch_size=64)

Train on 46056 samples, validate on 11514 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f51e8181160>

## Prediction

In [34]:
pred = model.predict(x_test)
y_pred = list()
for i in range(len(pred)):
    y_pred.append(np.argmax(pred[i]))

## Model accuracies

### Without Sampling

In [25]:
print("Confusion matrix :\n",confusion_matrix(y_test, y_pred))
print("Accuracy score   : ",accuracy_score(y_test, y_pred))
print("F1 score         : ",f1_score(y_test, y_pred, average='weighted'))
print("Recall           : ",recall_score(y_test, y_pred, average = 'macro'))
print("Precision        : ",precision_score(y_test, y_pred, average='macro'))

Confusion matrix :
 [[  67  172   28]
 [ 146 3589  109]
 [   8  231  607]]
Accuracy score   :  0.8599959653015937
F1 score         :  0.8554506921619972
Recall           :  0.6340310902064002
Precision        :  0.672691910686876


### With Sampling

In [35]:
print("Confusion matrix :\n",confusion_matrix(y_test, y_pred))
print("Accuracy score   : ",accuracy_score(y_test, y_pred))
print("F1 score         : ",f1_score(y_test, y_pred, average='weighted'))
print("Recall           : ",recall_score(y_test, y_pred, average = 'macro'))
print("Precision        : ",precision_score(y_test, y_pred, average='macro'))

Confusion matrix :
 [[1970  209 1654]
 [  79 3697   86]
 [1377  166 2276]]
Accuracy score   :  0.689855827688032
F1 score         :  0.6859800472073684
Recall           :  0.6890670963361982
Precision        :  0.683218441363547
