# CNN with word embeddings

### Importing prerequisite libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
from imblearn.over_sampling import SMOTE
import numpy as np



### Loading datasets and dropping nulls

In [3]:
data  = pd.read_csv('data_raw.csv',sep=',',names=['Msg','Tag'])

In [4]:
data_x=data["Msg"]
data_y=data["Tag"]

In [5]:
vocab_size = 10000

In [6]:
data_x_n = data_x.to_numpy()

### Finding max sentence length

In [7]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [8]:
sen_len = max_sen_length(data_x_n)

### One hot encoding

In [9]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

### Padding seuqnces to the same length

In [10]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [11]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0 6626 4466 2142 1395 6340 4574 2991 8765
 6367   44  432 8155 2742 8654]


### Cnn model

In [12]:
embedding_vector_features = 100

In [33]:
# model = Sequential()
# model.add(Embedding(vocab_size, 32, input_length=sen_len))
# model.add(Conv1D(32, 3, padding='same', activation='relu'))
# model.add(MaxPooling1D())
# model.add(Flatten())
# model.add(Dense(250, activation='relu'))
# model.add(Dense(3, activation='softmax'))
# model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(64, 2, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [34]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 34, 32)            320000    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 34, 32)            3104      
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 17, 32)            0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 17, 64)            4160      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 8, 64)             0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 64)               

### Sampling

In [35]:
print(Counter(data_y))
oversample = SMOTE()
x, y = oversample.fit_sample(embed_repr, data_y)
print(Counter(y))

Counter({1: 19190, 2: 4163, 0: 1430})




Counter({1: 19190, 2: 19190, 0: 19190})


### Train-test split

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

### Model fitting

In [37]:
model.fit(x_train,pd.get_dummies(y_train), validation_data=(x_test,pd.get_dummies(y_test)),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f0e78f55be0>

## Prediction

In [38]:
pred = model.predict(x_test)
y_pred = list()
for i in range(len(pred)):
    y_pred.append(np.argmax(pred[i]))

## Model accuracies

In [39]:
print("Confusion matrix :\n",confusion_matrix(y_test, y_pred))
print("Accuracy score   : ",accuracy_score(y_test, y_pred))
print("F1 score         : ",f1_score(y_test, y_pred, average='weighted'))
print("Recall           : ",recall_score(y_test, y_pred, average = 'macro'))
print("Precision        : ",precision_score(y_test, y_pred, average='macro'))

Confusion matrix :
 [[1942  235 1656]
 [ 101 3588  173]
 [1428  160 2231]]
Accuracy score   :  0.674048983845753
F1 score         :  0.6716751286712531
Recall           :  0.6732964661231109
Precision        :  0.6699429506310465
