### Problem Statement

  - Create a simple neural network using TensorFlow/Keras for a binary classification problem with textual data.


### Loading the Dataset

In [4]:
import pandas as pd

df = pd.read_csv('spam.csv')
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

### Balancing the Dataset and Downsampling


In [6]:
df_spam = df[df['Category']== 'spam']
df_ham = df[df['Category']== 'ham']

In [7]:
df_ham_downsampled = df_ham.sample(df_spam.shape[0])

df_ham_downsampled.shape

(747, 2)

In [8]:
df_balance = pd.concat([df_spam, df_ham_downsampled])

df_balance.shape

(1494, 2)

### Label Encoding of Target

In [9]:
df_balance['spam'] = df_balance['Category'].apply(lambda x: 1 if x=='spam' else 0)
df_balance.tail(5)

Unnamed: 0,Category,Message,spam
5327,ham,Wishing you a wonderful week.,0
4663,ham,Ladies first and genus second k .,0
4050,ham,Yeah that's the impression I got,0
4973,ham,I'm fine. Hope you are good. Do take care.,0
1682,ham,Y lei?,0


### Tokenzization and Vectorization

In [11]:
X = df_balance['Message']
y = df_balance['spam']

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=100)

In [22]:
X[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,   14,  181,   16,   13,    3,  345,  460,    1,
         70,  923,  783,  298,  924, 1145,  323, 1146,   27,  923,    1,
        925,    1,  143,  181,  512,  424,   22,  157,   68,  265,  158,
       1477], dtype=int32)

### Splitting the Dataset

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)

print(y_train.shape, y_test.shape)

(1195, 100) (299, 100)
(1195,) (299,)


### Model Building

- Architcture

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

- Compilation and Training

In [16]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [19]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model Testing and Predictions

In [18]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Loss: 13.431641578674316, Accuracy: 0.7290970087051392


In [27]:
sample_text = ["Congratulations! You've won a free ticket to Bahamas. Claim now!"]
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=100)

predicted_label = model.predict(sample_pad)

predicted_label = (predicted_label > 0.5).astype("int32")

print(predicted_label)

[[0]]


<hr><hr>