# CNN with word embeddings

### Importing prerequisite libraries

In [74]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, Embedding, LSTM, GRU, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
import pandas  as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

### Loading datasets and dropping nulls

In [73]:
data = pd.read_csv('data_raw.csv',sep=',',names=['Msg','Tag'])

data.head()

Unnamed: 0,Msg,Tag
0,jaydillz my babies pussy is too tight today t...,1
1,seymourblanco they game is over fuck yall bit...,1
2,can you let me stretch that pussy out or nahhh,1
3,don t mind that twinkies are gonna be gone ver...,2
4,cnt nobody be mad at who he choose to be with ...,1


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     24783 non-null  object
 1   Tag     24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 387.4+ KB


In [76]:
data.dropna(inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24783 entries, 0 to 24782
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Msg     24783 non-null  object
 1   Tag     24783 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 580.9+ KB


In [77]:
data.head()

Unnamed: 0,Msg,Tag
0,jaydillz my babies pussy is too tight today t...,1
1,seymourblanco they game is over fuck yall bit...,1
2,can you let me stretch that pussy out or nahhh,1
3,don t mind that twinkies are gonna be gone ver...,2
4,cnt nobody be mad at who he choose to be with ...,1


In [78]:
data_x=data["Msg"]
data_y=data["Tag"]


In [79]:
vocab_size = 10000

In [80]:
data_x_n = data_x.to_numpy()
data_x_n



array([' jaydillz my babies pussy is too tight today t co if k v ro',
       ' seymourblanco they game is over fuck yall bitches amp yall attitudes t co rlrnybfedt ',
       'can you let me stretch that pussy out or nahhh', ...,
       'need some hispanic pussy',
       ' o mygotti you have a girlfriend stop asking these hoes to be your bestfriend ',
       ' kingtunchi jd told me i m to player to be with one bitch'],
      dtype=object)

### Finding max sentence length

In [81]:
def max_sen_length(data):
    mx_len = 0
    for sen in data:
        words = sen.split()
        if len(words) > mx_len:
            mx_len = len(words)
    return mx_len

In [82]:
sen_len = max_sen_length(data_x_n)



### One hot encoding

In [83]:
onehot_enc = [one_hot(sen, vocab_size) for sen in data_x_n]



### Padding seuqnces to the same length

In [84]:
embed_repr = pad_sequences(onehot_enc, padding='pre', maxlen=sen_len)

### Cnn model

In [85]:
embedding_vector_features = 100

In [86]:
# model = Sequential()
# model.add(Embedding(vocab_size, 32, input_length=sen_len))
# model.add(Conv1D(32, 3, padding='same', activation='relu'))
# model.add(MaxPooling1D())
# model.add(Flatten())
# model.add(Dense(250, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

model = Sequential()
model.add(Embedding(vocab_size, 32, input_length=sen_len))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Conv1D(64, 2, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [87]:
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 34, 32)            320000    
_________________________________________________________________
conv1d_17 (Conv1D)           (None, 34, 32)            3104      
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 17, 32)            0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 17, 64)            4160      
_________________________________________________________________
max_pooling1d_18 (MaxPooling (None, 8, 64)             0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 64)               

### Train-test split

In [88]:
x_train, x_test, y_train, y_test = train_test_split(embed_repr, data_y, test_size=0.2, random_state=4)


### Model fitting

In [89]:
model.fit(x_train,y_train, validation_data=(x_test,y_test),epochs=10, verbose=2)

Epoch 1/10
620/620 - 5s - loss: -3.2943e+07 - accuracy: 0.7731 - val_loss: -2.6382e+08 - val_accuracy: 0.7755
Epoch 2/10
620/620 - 5s - loss: -4.9260e+09 - accuracy: 0.7740 - val_loss: -1.8062e+10 - val_accuracy: 0.7755
Epoch 3/10
620/620 - 5s - loss: -7.1606e+10 - accuracy: 0.7740 - val_loss: -1.7834e+11 - val_accuracy: 0.7755
Epoch 4/10
620/620 - 5s - loss: -4.1312e+11 - accuracy: 0.7740 - val_loss: -8.1465e+11 - val_accuracy: 0.7755
Epoch 5/10
620/620 - 5s - loss: -1.4628e+12 - accuracy: 0.7740 - val_loss: -2.4851e+12 - val_accuracy: 0.7755
Epoch 6/10
620/620 - 5s - loss: -3.8796e+12 - accuracy: 0.7740 - val_loss: -6.0817e+12 - val_accuracy: 0.7755
Epoch 7/10
620/620 - 5s - loss: -8.4095e+12 - accuracy: 0.7740 - val_loss: -1.2507e+13 - val_accuracy: 0.7755
Epoch 8/10
620/620 - 5s - loss: -1.6550e+13 - accuracy: 0.7740 - val_loss: -2.3120e+13 - val_accuracy: 0.7755
Epoch 9/10
620/620 - 5s - loss: -2.9074e+13 - accuracy: 0.7740 - val_loss: -3.9434e+13 - val_accuracy: 0.7755
Epoch 10/1

<tensorflow.python.keras.callbacks.History at 0x7f0e53880780>

## Model accuracies

In [90]:
y_pred = model.predict_classes(x_test)

In [91]:
confusion_matrix(y_test, y_pred)

array([[   0,  267,    0],
       [   0, 3844,    0],
       [   0,  846,    0]])

In [98]:
accuracy_score(y_test, y_pred)

0.7754690336897317

In [99]:
f1_score(y_test, y_pred, average="macro")

0.2911790326856797

In [100]:
precision_score(y_test, y_pred, average="macro")

  _warn_prf(average, modifier, msg_start, len(result))


0.25848967789657723

In [101]:
recall_score(y_test, y_pred, average="macro")

0.3333333333333333