In [221]:
import multiprocessing
import re

import pandas as pd
import sqlalchemy
from pandarallel import pandarallel
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import tqdm

from nltk.tokenize import sent_tokenize
import nltk
# Download the NLTK data packages, one call per package, in a fixed order.
for _resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger', 'omw-1.4'):
    nltk.download(_resource)

import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arielbosano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arielbosano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/arielbosano/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/arielbosano/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [271]:
def load_data(database_filepath):
    """
    Returns the feature and target frames from the messages database

    Args:
        database_filepath::str
            Database filepath to retrieve the data using SQLite

    Returns:
        X::pd.DataFrame
            Single-column dataframe ('message') containing the feature text

        y::pd.DataFrame
            Dataframe holding every column from the fourth one onwards,
            used as the target values for all the observations

    """
    engine = sqlalchemy.create_engine('sqlite:///' + database_filepath)
    try:
        data_frame = pd.read_sql('SELECT * FROM database_messages', engine)
    finally:
        # Release the engine's pooled connections even if the query fails,
        # so the database file is not kept open by this kernel.
        engine.dispose()

    # NOTE(review): the positional slice assumes the first three columns are
    # not targets -- confirm against the `database_messages` table schema.
    return data_frame[['message']], data_frame.iloc[:, 3:]

def tokenize(text):
    """
    Performs a tokenization of the provided text so that feature texts can be
    seen as vectors by our Machine Learning pipeline.

    Args:
        text:str
            message string to split

    Returns
        vector:[str]
            list of lower-cased, lemmatized tokens that are longer than
            two characters
    """

    # Raw string: `\(` and `\)` are regex escapes, not Python string escapes.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Replace every URL in a single pass. Substituting match-by-match (rather
    # than str.replace on each found URL) keeps a short URL from clobbering the
    # prefix of a longer one, e.g. 'http://a.co' inside 'http://a.co/x'.
    text = re.sub(url_regex, 'url_placeholder', text)

    # Divide the text into word tokens
    tokens = nltk.word_tokenize(text)

    # Normalise case/whitespace BEFORE lemmatizing: the WordNet lookup is
    # case-sensitive, so 'Houses' would otherwise be left unlemmatized.
    lem = nltk.stem.WordNetLemmatizer()
    clean_tokens = [lem.lemmatize(token.lower().strip()) for token in tokens]

    # Keep only tokens longer than two characters
    return [token for token in clean_tokens if len(token) > 2]

In [272]:
# Read the messages database into the feature frame X and the target frame y.
X, y = load_data('DisasterResponse.db')

In [285]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [286]:
# Count vectoriser that delegates token splitting to the custom `tokenize` function.
vec = CountVectorizer(tokenizer=tokenize)

In [290]:
# Step 1: fit the vocabulary on the messages and build the raw term-count matrix.
term_counts = vec.fit_transform(X['message'])
# Step 2: re-weight those counts with TF-IDF.
X_clean = TfidfTransformer().fit_transform(term_counts)

In [293]:
# Materialise the TF-IDF matrix as a dense array.
# NOTE(review): dense storage grows with rows x vocabulary size -- confirm it fits in memory.
d = X_clean.toarray()