In [14]:
import numpy as np 
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Sequential

In [5]:
# Segment data 
# Need both train and test csv
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train_set = train['text']
test_set = test['text']
train_targets = train['target']

In [6]:
# Split data randomly
X_train, X_test, y_train, y_test = train_test_split(train_set, train_targets, random_state=0)

In [7]:
# Visualize data
print(f"The training dataset contains {len(X_train)} messages.")
print(f"The test dataset contains {len(X_test)} messages.")

The training dataset contains 5709 messages.
The test dataset contains 1904 messages.
There are 2 features in the data, either a 0 or 1


In [8]:
# Create a count vectorizer and bag of words representaitons 
counter = CountVectorizer(stop_words = 'english', ngram_range = (1,3))
X_train_bow = counter.fit_transform(X_train).toarray()
X_test_bow = counter.fit_transform(X_test).toarray()

In [9]:
# Visualize features, need the input dimension for NN
print(f'The training set has: {X_train_bow.shape[0]:5d} samples and {X_train_bow.shape[1]} features.')
print(f'The test set has:     {X_test_bow.shape[0]:5d} samples and {X_test_bow.shape[1]} features.')

The training set has:  5709 samples and 97185 features.
The test set has:      1904 samples and 36518 features.


In [18]:
# Haven't spent much time adjusting parameters, this was a first guess but it worked really well
model = Sequential()
model.add(Dense(1000, input_dim=97185, activation='relu'))
model.add(Dense(750, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'sigmoid'))

SyntaxError: ignored

In [16]:
# 98% Accuracy with 5 epochs
n_epochs = 5
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train_bow, y_train, batch_size=32, epochs=n_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Model using test input dim, could combine it into one model with variable input but having two models is nice for faster training on this set.
model = Sequential()
model.add(Dense(1000, input_dim=36518, activation='relu'))
model.add(Dense(750, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(500, activation='relu'))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'sigmoid'))

In [20]:
# 99% Accuracy with 5 epochs
n_epochs = 5
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_test_bow, y_test, batch_size=32, epochs=n_epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
