In [1]:
####################################
#         IMPORT LIBRARIES         #
####################################

import numpy as np      # linear algebra
import pandas as pd     # data processing, CSV file I/O (e.g. pd.read_csv)
import os               # Handle file paths
import string           # String functionality
import re               # Regex
import nltk             # Corpus for stopwords, lemmatizing, etc.
import shapefile as shp # Reading shape files for geography

# Sklearn TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benrc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\benrc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
####################################
#              CLASSES             #
####################################

# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory.

class DisasterTweets:
    """Holds the train and test tweet tables as pandas DataFrames.

    Attributes:
        train_set: DataFrame read from the training CSV.
        test_set:  DataFrame read from the test CSV.
    """

    def __init__(self, trainFile=None, testFile=None, inputDir='/kaggle/input'):
        """Load the two CSVs, either from explicit paths or by discovery.

        Python keeps only the last ``__init__`` defined in a class body, so the
        explicit-path and directory-walking constructors are merged into one.

        Args:
            trainFile: Path (or anything ``pd.read_csv`` accepts) of the
                training CSV. Must be given together with ``testFile``.
            testFile:  Path of the test CSV.
            inputDir:  Directory searched for ``train.csv`` / ``test.csv``
                when no explicit files are given.

        Raises:
            ValueError: If only one of ``trainFile`` / ``testFile`` is given.
        """
        if (trainFile is None) != (testFile is None):
            raise ValueError('trainFile and testFile must be given together')

        if trainFile is None:
            trainFile, testFile = self._discover_csvs(inputDir)

        # A file that could not be discovered becomes an empty frame so that
        # has_empty_data() reports the problem instead of an AttributeError.
        self.train_set = pd.read_csv(trainFile) if trainFile is not None else pd.DataFrame()
        self.test_set = pd.read_csv(testFile) if testFile is not None else pd.DataFrame()


    @staticmethod
    def _discover_csvs(inputDir):
        """Walk ``inputDir`` and return the (train, test) CSV paths, None if missing."""
        found = {'train.csv': None, 'test.csv': None}

        for dirname, _, filenames in os.walk(inputDir):
            for filename in filenames:
                key = filename.lower()
                # Match by name: directory listing order is not guaranteed
                if key in found and found[key] is None:
                    found[key] = os.path.join(dirname, filename)

            if all(path is not None for path in found.values()):
                break

        return found['train.csv'], found['test.csv']


    def has_empty_data(self):
        """Return True if either the train or the test set has no rows."""
        return not (len(self.train_set) and len(self.test_set))

In [None]:
####################################
#         HELPER FUNCTIONS         #
####################################

# Discards all punctuation as classified by string.punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)


# A URL is defined as a string that begins with http or https followed by
# :// and then any set of characters up to a whitespace
def remove_urls(text):
    """Strip every http:// or https:// URL from ``text``.

    A URL runs from the scheme up to the next whitespace; one trailing
    whitespace character is removed with it so no double space is left.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all matched URLs removed.
    """
    # 'https?' makes the 's' optional; the previous 'http?' made the second
    # 't' optional instead, so https links were never matched.
    return re.sub(r'https?://\S+\s?', '', text)


# Remove non-ASCII characters, keeping only those with a char value below 128
def remove_bad_ascii(text):
    """Return ``text`` restricted to characters whose code point is below 128."""
    ascii_chars = [ch for ch in text if ord(ch) < 128]
    return "".join(ascii_chars)


# Removing numbers from the text, might not actually be a good idea.
# Take this into consideration
def remove_numbers(text):
    """Return ``text`` with every character for which ``str.isdigit`` is true removed."""
    non_digits = filter(lambda ch: not ch.isdigit(), text)
    return "".join(non_digits)


def collect_hashtags(text):
    """Return the list of hashtag bodies (the text after each '#') found in ``text``."""
    hashtag_pattern = re.compile(r'#(\S*)\s?')
    return hashtag_pattern.findall(text)


def remove_hashtags(text):
    """Return ``text`` with each hashtag (and one whitespace after it) deleted."""
    hashtag_pattern = re.compile(r'#\S*\s?')
    return hashtag_pattern.sub('', text)


# Run the various sanitization functions
def sanitize_text(text):
    """Run every sanitization helper over ``text`` and return the result.

    Order matters: URLs and hashtags are recognised by their punctuation
    ('://' and '#'), so they must be removed BEFORE punctuation is stripped.
    Stripping punctuation first turns 'http://t.co/x' into 'httptcox' and
    '#fire' into 'fire', which the later steps can no longer match.

    Args:
        text: Raw tweet text.

    Returns:
        The text without URLs, hashtags, non-ASCII characters, punctuation
        and digits.
    """
    clean_text = remove_urls(text)
    clean_text = remove_hashtags(clean_text)
    clean_text = remove_bad_ascii(clean_text)
    clean_text = remove_punctuation(clean_text)
    clean_text = remove_numbers(clean_text)

    return clean_text


def collect_locations(countries_shp, spl_shp):
    """Build one flat list of place names from two shapefiles.

    Args:
        countries_shp: Path of the countries shapefile; field 3 of each
            record is taken as the country name.
        spl_shp: Path of the states/provinces shapefile; field 8 is taken as
            the state name and the first two characters of field 9 as the
            state acronym.
            NOTE(review): the field positions presumably match the Natural
            Earth files used by this notebook; confirm if the data changes.

    Returns:
        A list: country names, then state names, then state acronyms.
    """
    country_records = shp.Reader(countries_shp).records()
    # Read the state records once and reuse them for both columns instead of
    # parsing the file a second time.
    state_records = shp.Reader(spl_shp).records()

    countries = [row[3] for row in country_records]
    states = [row[8] for row in state_records]
    acronyms = [row[9][:2] for row in state_records]

    return countries + states + acronyms  # Compile to one big list


# Collect all the locations stated and cross reference in the shape file,
# looking for real-life places
def _location_tokens(text):
    """Lower-case ``text`` and split it on commas and whitespace, dropping empties."""
    return [token for token in re.split(r'[,\s]+', text.lower()) if token]


def bin_locations(location, location_set):
    """Return ``location`` if it mentions a known place, otherwise ''.

    A place matches when its name appears in ``location`` as a run of whole
    words, compared case-insensitively. Multi-word names such as "New York"
    are therefore matched too, which a single-token comparison cannot do.

    Args:
        location: The user-supplied location; anything that is not a
            non-empty string (e.g. NaN) yields ''.
        location_set: Iterable of known place names.

    Returns:
        The original ``location`` string when a known place is found, else ''.
    """
    # Ensure it is a non-empty string, AKA not NaN
    if not isinstance(location, str) or not location:
        return ''

    # Require at least two consecutive alphabetical characters
    if not re.search(r"[a-zA-Z]{2,}", location):
        return ''

    # Pad with spaces so a name only matches on whole-word boundaries
    haystack = ' ' + ' '.join(_location_tokens(location)) + ' '

    for place in location_set:
        place_tokens = _location_tokens(place)
        if place_tokens and ' ' + ' '.join(place_tokens) + ' ' in haystack:
            return location  # Return the base location, as it appears real

    return ''


# Separates text into an array of tokenized strings
def tokenize(text, pattern=r'\W+'):
    """Split ``text`` into a list of token strings.

    Args:
        text: The string to split.
        pattern: Regular expression used as the separator. Defaults to runs
            of non-word characters.

    Returns:
        The list produced by ``re.split``. It may contain empty strings when
        ``text`` starts or ends with a separator.
    """
    # Raw-string default: '\W' in a normal string is an invalid escape sequence
    return re.split(pattern, text)


# Removes English stopwords from a tokenized list
def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loaded only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(token_text):
    """Drop English stopwords and empty strings from a list of tokens.

    The comparison is case-insensitive so that capitalised stopwords such as
    "The" are removed as well; surviving tokens keep their original case.

    Args:
        token_text: Iterable of token strings.

    Returns:
        A new list containing the non-empty tokens that are not stopwords.
    """
    # Cached set lookup: avoids re-reading the corpus and scanning a list
    # for every single tweet.
    stopwords = _english_stopwords()
    return [word for word in token_text if word and word.lower() not in stopwords]


#  Lemmatize a tokenized list
def lemmatize_tokens(token_text):
    """Return a new list with each token passed through NLTK's WordNet lemmatizer."""
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, token_text))


def compute_tf_idf(tweets):
    """Compute TF-IDF scores for a collection of already-tokenized tweets.

    Args:
        tweets: Iterable of token lists, one list per tweet.

    Returns:
        A ``pd.Series`` with one dict per tweet. Each dict maps every feature
        (token) of the fitted vocabulary to that tweet's TF-IDF score, 0 when
        the token does not occur in the tweet.
    """

    # Identity function: the input is already tokenized, so both the
    # preprocessor and the tokenizer must hand the token list through as is.
    def dummy_func(doc):
        return doc

    vectorizer = TfidfVectorizer(
                analyzer='word',
                tokenizer=dummy_func,
                preprocessor=dummy_func,
                token_pattern=None
    )

    vectors = vectorizer.fit_transform(tweets)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only for old installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = list(vectorizer.get_feature_names())

    # toarray() yields a plain ndarray (todense() returns the legacy matrix type)
    score_rows = vectors.toarray().tolist()

    # Map TF-IDF results to one dictionary per tweet
    tf_idf_list = [dict(zip(feature_names, row)) for row in score_rows]

    return pd.Series(data=tf_idf_list)

In [None]:
####################################
#          GENERATE DATA           #
####################################
# Load both splits from CSV files under data/ (paths are relative to the
# notebook's working directory)
disaster_tweets = DisasterTweets('data/train.csv', 'data/test.csv')

# Stop right away if either table has no rows, rather than failing later
# during preprocessing
if disaster_tweets.has_empty_data():
    raise ValueError('Could not read data CSVs')

In [None]:
####################################
#        PREPROCESSING DATA        #
####################################

# Same DataFrame object as disaster_tweets.test_set, so every column added
# below lands on the test set itself
test_tweets = disaster_tweets.test_set

# Hashtags have to be captured first: sanitizing the text destroys them
test_tweets['hashtags'] = test_tweets['text'].map(collect_hashtags)

# Clean the raw tweet text in place
test_tweets['text'] = test_tweets['text'].map(sanitize_text)

# Keep only the user locations that mention a known country, state or acronym
location_data = collect_locations(
    'data/Countries/ne_110m_admin_0_countries.shp',
    'data/States and Provinces/ne_110m_admin_1_states_provinces_lakes.shp',
)
test_tweets['true_location'] = test_tweets['location'].apply(bin_locations, args=(location_data,))

# Token pipeline, applied in order:
#   1. tokenize         - split the cleaned text into tokens
#   2. remove_stopwords - drop words that carry little information ('the', 'a', 'of')
#   3. lemmatize_tokens - dictionary look-up of the proper root word; stemming
#                         would turn 'Entitling' into 'Entitl', lemmatizing
#                         gives 'Entitle'
test_tweets['tokenized'] = (
    test_tweets['text']
    .map(tokenize)
    .map(remove_stopwords)
    .map(lemmatize_tokens)
)

# Term frequency-inverse document frequency (TF-IDF) per tweet
test_tweets['tf_idf'] = compute_tf_idf(test_tweets['tokenized'])

# Use N-grams to find relationships between words in sentences

# Use Support Vector Machine?

# After training model, we need to do a regularization