In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
import re
import string

In [2]:
# Load the training and test splits from the local Dataset folder.
# The paths are relative, so the notebook must be run from the project root.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
)

In [3]:
# View a snapshot of the data (uncomment the next line to display the first rows)
# train.head()

In [5]:
# Define NLTK preprocessing functions
# Fetch every NLTK resource that preprocess_text relies on, not only the
# WordNet data: word_tokenize needs the Punkt tokenizer models and
# stopwords.words('english') needs the stopwords corpus. Without them a fresh
# environment fails with a LookupError on the first call.
# Newer NLTK releases look up 'punkt_tab' instead of 'punkt', so both are
# requested; nltk.download skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)
# Cache for the NLTK helpers so they are built once instead of once per row.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use only."""
    if not _PREPROCESS_CACHE:
        # Build both objects before touching the cache so a failed corpus
        # lookup never leaves a half-initialised cache behind.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_CACHE["stop_words"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess_text(text):
    """Tokenize, drop English stop words, lemmatize and re-join a piece of text.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the ``NaN`` pandas
        uses for a missing cell) is treated as missing text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces, or an empty
        string when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN when this is used with Series.apply;
    # word_tokenize would raise on them, so map them to empty text instead.
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _preprocess_resources()

    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words (case-insensitive match; original casing is kept)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    # Lemmatize the remaining tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join the tokens back into a string
    return " ".join(lemmatized_tokens)

# Clean the raw text of both frames; the cleaned version is stored in a new
# column next to the original one.
train["preprocessed_text"] = train["text"].apply(preprocess_text)
test["preprocessed_text"] = test["text"].apply(preprocess_text)

# Hold out 20% of the labelled rows for validation. The fixed random_state
# keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train["preprocessed_text"],
    train["label"],
    test_size=0.2,
    random_state=42,
)

# Binary bag-of-words limited to the 5000 most frequent terms.
# CountVectorizer has no `sparse` option: it always returns a SciPy sparse
# matrix. dtype=np.float32 replaces the default int64 so the dense arrays
# built below take half the memory and already match Keras' default float type.
vectorizer = CountVectorizer(
    binary=True,
    stop_words='english',
    max_features=5000,
    dtype=np.float32,
)

# Learn the vocabulary from the training split only ...
X_train_counts = vectorizer.fit_transform(X_train)

# ... and reuse that vocabulary for the validation split
X_val_counts = vectorizer.transform(X_val)

# Densify and append a trailing channel axis:
# (samples, features) -> (samples, features, 1), the layout Conv1D consumes
X_train_array = np.expand_dims(X_train_counts.toarray(), axis=-1)
X_val_array = np.expand_dims(X_val_counts.toarray(), axis=-1)

# Convert the label Series to plain numpy arrays
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\caleb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\caleb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [14]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then
# encode both splits with that same mapping.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_val = le.transform(y_val)

[0 0 0 ... 0 0 0]


In [None]:
# Shape of one sample: everything after the leading sample axis
input_shape = X_train_array.shape[1:]

# One softmax unit per distinct label seen in the training split
num_classes = np.unique(y_train).size

# CNN: two Conv1D/MaxPooling1D stages, then a dense classifier head
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer-encoded labels
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 10 epochs, reporting validation metrics after each epoch
model.fit(
    X_train_array,
    y_train,
    validation_data=(X_val_array, y_val),
    epochs=10,
    batch_size=64,
)


Epoch 1/10

In [None]:
# Build the test features with the vectorizer fitted on the training split.
# The original cell read an undefined name `test_x` (NameError on a fresh
# kernel); presumably it held exactly these vectorized test texts from a cell
# that has since been removed -- TODO confirm.
X_test = vectorizer.transform(test["preprocessed_text"]).toarray()
# Append the channel axis so the layout matches the training arrays:
# (samples, features) -> (samples, features, 1)
X_test = np.expand_dims(X_test, axis=-1)
# Encode the test labels with the same LabelEncoder used for training, so the
# integer ids line up with the model's output units. This raises ValueError if
# the test set contains a label the encoder never saw.
y_test = le.transform(test['label'])
# Evaluate the CNN model on the testing data
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)

In [None]:
# Show the test accuracy returned by model.evaluate in the previous cell
print(test_acc)