# CNN with word embeddings

### Importing prerequisite libraries

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Loading datasets and dropping nulls

In [3]:
data  = pd.read_csv('data_raw.csv',sep=',',names=['Msg','Tag'])

In [44]:
data_x=data["Msg"]
data_y=data["Tag"]

In [45]:
vocab_size = 10000

In [46]:
data_x_n = data_x.to_numpy()

array(['theDT thingNN disgustJJ whiteJJ womanNN groidJJ whiteJJ womanNN dragVBZ whiteJJ childNN filthNN',
       'americanJJ actNN likeIN knowJJ talkNN',
       'alsoRB intrestRB checkVB webpagNN infoJJ europeanJJ americanJJ townNN buildNN',
       ..., 'NoDT truthNN 88whiteCD powerwhitNN victorywhitNN pride88NN',
       '4CD cyclindNN motorcyclNN historNN vehiclNN laurinJJ klementNN 18991903CD youtubNN historNN vehiclNN torpedoNN 1909CD youtubNN historNN vehiclNN torpedoNN 1909CD httpthekneeslidercomimages2012rightsidejpgNN handlebarNN cameraNN mountNN rideNN videoNN',
       'IPRP thoughtVBD IPRP leavVBP noteJJ wishJJ southernJJ gentlemenNNS ladiVBP happiNN robertNN edwardJJ leeNN dayNN'],
      dtype=object)

### Finding max sentence length

In [47]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [48]:
sen_len = max_sen_length(data_x_n)

### One hot encoding

In [49]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]

### Padding seuqnces to the same length

In [50]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

In [53]:
print(embed_repr[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0 9354
 6098 3888  619 8431 1096  619 8431 4473  619 7475 7341]


### Cnn model

In [56]:
embedding_vector_features = 100

In [57]:
model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy'])

In [58]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 151, 32)           320000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 151, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 75, 32)            0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 2400)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 250)               600250    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 251       
Total params: 923,605
Trainable params: 923,605
Non-trainable params: 0
________________________________________________

### Sampling

In [None]:
#print(Counter(data_raw_y))
#oversample_raw = SMOTE()
#x, y = oversample_raw.fit_sample(embed_repr, data_y)
#print(Counter(y_raw))

In [None]:
x, y = oversample_raw.fit_sample(embed_repr, data_y)

### Train-test split

In [63]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

### Model fitting

In [64]:
model.fit(x_train,pd.get_dummies(y_train), validation_data=(x_test,pd.get_dummies(y_test)),epochs=10,batch_size=64)

Train on 8750 samples, validate on 2188 samples
Epoch 1/10
8750/8750 - 5s - loss: 0.3820 - acc: 0.8649 - val_loss: 0.3360 - val_acc: 0.8720
Epoch 2/10
8750/8750 - 5s - loss: 0.2299 - acc: 0.9109 - val_loss: 0.3694 - val_acc: 0.8757
Epoch 3/10
8750/8750 - 5s - loss: 0.0881 - acc: 0.9703 - val_loss: 0.4745 - val_acc: 0.8629
Epoch 4/10
8750/8750 - 4s - loss: 0.0315 - acc: 0.9917 - val_loss: 0.6163 - val_acc: 0.8551
Epoch 5/10
8750/8750 - 5s - loss: 0.0158 - acc: 0.9965 - val_loss: 0.7554 - val_acc: 0.8510
Epoch 6/10
8750/8750 - 4s - loss: 0.0090 - acc: 0.9978 - val_loss: 0.8908 - val_acc: 0.8624
Epoch 7/10
8750/8750 - 5s - loss: 0.0071 - acc: 0.9985 - val_loss: 0.9552 - val_acc: 0.8615
Epoch 8/10
8750/8750 - 5s - loss: 0.0056 - acc: 0.9989 - val_loss: 0.9203 - val_acc: 0.8464
Epoch 9/10
8750/8750 - 4s - loss: 0.0052 - acc: 0.9989 - val_loss: 1.1380 - val_acc: 0.8661
Epoch 10/10
8750/8750 - 4s - loss: 0.0036 - acc: 0.9993 - val_loss: 1.0315 - val_acc: 0.8304


<tensorflow.python.keras.callbacks.History at 0x7f4bcb41ce10>

## Prediction

In [None]:
pred = model_raw.predict(x_test)
y_pred = list()
for i in range(len(pred)):
    y_pred.append(np.argmax(pred[i]))

## Model accuracies

### Without Sampling

In [None]:
print("Confusion matrix :\n",confusion_matrix(y_test, y_pred))
print("Accuracy score   : ",accuracy_score(y_test, y_pred))
print("F1 score         : ",f1_score(y_test, y_pred, average='weighted'))
print("Recall           : ",recall_score(y_test, y_pred, average = 'macro'))
print("Precision        : ",precision_score(y_test, y_pred, average='macro'))

### With Sampling

In [None]:
print("Confusion matrix :\n",confusion_matrix(y_test, y_pred))
print("Accuracy score   : ",accuracy_score(y_test, y_pred))
print("F1 score         : ",f1_score(y_test, y_pred, average='weighted'))
print("Recall           : ",recall_score(y_test, y_pred, average = 'macro'))
print("Precision        : ",precision_score(y_test, y_pred, average='macro'))