Machine learning pipeline (NLP)
1) preprocessing [X]
2) feature engineering []
3) feature selection []
4) vectorization []
5) dataset splitting []
6) model training | model selection []
7) hyper param tuning []

In [28]:
#Imports and declarations
import pandas as pd
import os
import re
import nltk
import string
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook


#This is to enable a fancy user display for .progress_apply()
tqdm_notebook.pandas()


In [40]:
#use this to check the current working directory to do relative links
#print(os.getcwd())

#increase the width of the pandas dataframe display
pd.set_option('display.width', 2000)

#Reading of the CSV file
train_csv_link = "./datasets/train.csv"
test_csv_link = "./datasets/test.csv"

train_csv = pd.read_csv(train_csv_link)
test_csv = pd.read_csv(test_csv_link)

#get the column names 
train_col_index = [index for index in train_csv.columns]
test_col_index = [index for index in test_csv.columns]

#understand the training dataset
print("The training dataset has {} rows and {} columns".format(len(train_csv) , len(train_csv.columns)))
for index in train_col_index:
    print("The number of missing data in {} : {}".format(index,train_csv[index].isnull().sum()))


The training dataset has 7613 rows and 5 columns
The number of missing data in id : 0
The number of missing data in keyword : 61
The number of missing data in location : 2533
The number of missing data in text : 0
The number of missing data in target : 0


In [30]:
#cleaning part of the pipeline (preprocessing)
#we will clean the text labels

#define a function to parse the text into
def remove_punctuation(text):
    clean_text = "".join([char for char in text if char not in string.punctuation])
    return clean_text
#tokenize clean_text
def tokenize_text (text):
    #\W+ refers to the regex for non charcter words [^a-zA-Z0-9_]
    tokens = re.split("\W+",text)
    return tokens
#remove stopwords
def rm_stopwords(text_list):
    stopwords = nltk.corpus.stopwords.words("english")
    clean_list = [word for word in text_list if word not in stopwords]
    return clean_list
#stemming of text_token
def stem_text(text_list):
    #we use porter stemmer for now
    ps = nltk.PorterStemmer()
    stem_list = [ps.stem(word) for word in text_list]
    return stem_list

#produce clean text without punctuations
train_csv["clean_text"] = train_csv["text"].apply(lambda x : remove_punctuation(x))
train_csv["text_token"] = train_csv["clean_text"].apply(lambda x : tokenize_text(x))
train_csv["text_token"] = train_csv["text_token"].apply(lambda x : rm_stopwords(x))
train_csv["text_token"] = train_csv["text_token"].apply(lambda x : stem_text(x))



train_csv.head()

Unnamed: 0,id,keyword,location,text,target,clean_text,text_token
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake Ma...,"[our, deed, reason, earthquak, may, allah, for..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[forest, fire, near, La, rong, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are be...,"[all, resid, ask, shelter, place, notifi, offi..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, peopl, receiv, wildfir, evacu, order, ..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as s...,"[just, got, sent, photo, rubi, alaska, smoke, ..."


feature engineering ideas :
1) number of punctuations [X]
2) number of capital letters [X]
3) presence of links [X]
4) number of words
5) number of hashtags
6) number of weather words / natural disaster / life threat words
7) number of country words
8) number of mispelled words

In [46]:
#count number of punctuations
def count_punc (text):
    count = 0
    for char in text:
        if char in string.punctuation:
            count += 1
    return count

#count the number of capital letters
def count_cap (text):
    count = 0
    for char in text:
        if char.isupper() is True:
            count += 1
    return count

#boolean of links
def link_bool(text):
    if re.search("http[s]?\:\/\/",text) not in [None,"None"]:
        return 1
    return 0


train_csv["punc_count"] = train_csv["text"].apply(lambda x : count_punc(x))
train_csv["capital_count"] = train_csv["text"].apply(lambda x : count_cap(x))
train_csv["link_count"] = train_csv["text"].apply(lambda x : link_bool(x))
train_csv[["text","link_count"]]




Unnamed: 0,text,link_count
0,Our Deeds are the Reason of this #earthquake M...,0
1,Forest fire near La Ronge Sask. Canada,0
2,All residents asked to 'shelter in place' are ...,0
3,"13,000 people receive #wildfires evacuation or...",0
4,Just got sent this photo from Ruby #Alaska as ...,0
5,#RockyFire Update => California Hwy. 20 closed...,0
6,#flood #disaster Heavy rain causes flash flood...,0
7,I'm on top of the hill and I can see a fire in...,0
8,There's an emergency evacuation happening now ...,0
9,I'm afraid that the tornado is coming to our a...,0
