## Imports

In [None]:
import os
import re
import zipfile

import pandas as pd
import numpy as np

# natural language processing
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

from textblob import TextBlob  # imported to correct text

# from tokenizers import BertWordPieceTokenizer

# machine learning
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
# from tensorflow.keras.callback import ModelCheckpoint

## Extract Data

In [None]:
# make sure the 'data' output folder is there (no error if it already exists)
os.makedirs("data", exist_ok=True)

# name of the zipped data file
file = "nlp-getting-started.zip"

# extract every member of the archive into the 'data' folder
with zipfile.ZipFile(file, mode="r") as archive:
    archive.extractall(path="data")

## Get Data

In [None]:
# load the extracted train and test CSV files into DataFrames
train, test = (
    pd.read_csv("./data/train.csv"),
    pd.read_csv("./data/test.csv"),
)

## Analyze Data

In [None]:
# report the (rows, columns) shape of both data sets, one per line
print(
    f"train shape: {train.shape}",
    f"test shape: {test.shape}",
    sep="\n",
)

In [None]:
# preview the first 20 rows of the training set (displayed as the cell's output)
train.head(20)

In [None]:
# show the data type pandas inferred for each training-set column
train.dtypes

In [None]:
# count missing values per column, first for the train set and then for the test set
print(f"train NA:\n\n{train.isna().sum()}\n\n")
print(f"test NA:\n\n{test.isna().sum()}")

In [None]:
# class balance of the training labels: rows with target == 1 are reported as
# disasters, rows with target == 0 as non-disasters
print(f"disasters: {(train.target == 1).sum()},\nnon-disasters: {(train.target == 0).sum()}")

## Preprocess Text and Create Train/Test Split

In [None]:
# preprocess function
def text_preprocess(doc):
    """Clean, spell-correct, tokenize, and stem every text in ``doc``.

    Each text is lowercased, stripped of ``t.co`` links and ``@mentions``,
    spell-corrected with TextBlob, split into word tokens with NLTK,
    reduced to its purely alphanumeric tokens, and stemmed with the
    Porter stemmer.

    Args:
        doc: Iterable of strings (for example a pandas Series of texts).

    Returns:
        A list with one entry per input text; each entry is the list of
        stemmed tokens for that text, in their original order.
    """
    # Raw strings avoid the invalid-escape warnings that "\/" and "\w" raise
    # in normal string literals, and the dot in "t.co" is escaped so it only
    # matches a literal dot. re.ASCII restricts \w to [A-Za-z0-9_] so that
    # non-ASCII letters directly after a link or handle are not swallowed.
    url_pattern = re.compile(r"(?:https?)?://t\.co/\w*", flags=re.ASCII)
    mention_pattern = re.compile(r"@\w+", flags=re.ASCII)

    # one stemmer instance is enough for the whole document
    stemmer = PorterStemmer()

    preprocessed_doc = []
    for text in doc:

        # make lowercase
        text = text.lower()

        # replace urls and mentions with a space so neighbouring words
        # do not get glued together
        text = url_pattern.sub(" ", text)
        text = mention_pattern.sub(" ", text)

        # correct typos
        text = TextBlob(text).correct().string

        # tokenize by word
        word_tokens = word_tokenize(text)

        # keep only purely alphanumeric tokens (drops punctuation and
        # tokens containing symbols), then stem what is left
        word_tokens = [
            stemmer.stem(word) for word in word_tokens if word.isalnum()
        ]

        preprocessed_doc.append(word_tokens)

    return preprocessed_doc


# run the preprocessing pipeline over the text column of both data sets
preprocessed_train = text_preprocess(train["text"])
preprocess_test = text_preprocess(test["text"])

In [None]:
# labels of the training set
y = train["target"]

# hold out 20% of the preprocessed training texts as a test set
X_intermediate, X_test, y_intermediate, y_test = train_test_split(
    preprocessed_train,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# take 25% of the remaining 80% as a validation set
# (i.e. 60% train / 20% validation / 20% test overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_intermediate,
    y_intermediate,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)