In [2]:
import pandas as pd
import numpy as np
import regex as re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from datetime import date
import time
now = time.time()

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

**Merge all consolidated CSV files into one final DataFrame**

In [12]:
import glob

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting the
# paths keeps the row order of the merged frame identical on every run, which the
# positional lookups and keep='last' de-duplication further down depend on.
consolidated_paths = sorted(glob.glob('../data/consolidate_data/*.csv'))
if not consolidated_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in '../data/consolidate_data/'")

df_final_consolidated = pd.concat(
    [pd.read_csv(csv_path) for csv_path in consolidated_paths],
    ignore_index=True,
)

# 'Unnamed: 0' is presumably the index column saved by an earlier to_csv call;
# errors='ignore' keeps the cell working if a file was written without it.
df_final_consolidated = df_final_consolidated.drop(columns='Unnamed: 0', errors='ignore')
# df_final_consolidated.to_csv('../data/'+'df_final_consolidated'+'.csv' ,index = False, sep = ",") 

In [13]:
df_final_consolidated['yes_disaster'].unique()

array([0, 1])

In [15]:
df_final_consolidated.head()

Unnamed: 0,author,content,description,publishedAt,source,source_id,source_name,title,types,url,urlToImage,yes_disaster
0,A.A. Newton,Large portions of the southeastern United Stat...,Large portions of the southeastern United Stat...,2018-10-09T16:30:00Z,"{'id': None, 'name': 'Lifehacker.com'}",,Lifehacker.com,How to Track Hurricane Michael,landfall,https://lifehacker.com/how-to-track-hurricane-...,https://i.kinja-img.com/gawker-media/image/upl...,0
1,"Brian Kahn on Earther, shared by Tom McKay to ...",Hurricane Michael is on track to make landfall...,Hurricane Michael is on track to make landfall...,2018-10-09T20:10:00Z,"{'id': None, 'name': 'Gizmodo.com'}",,Gizmodo.com,Why Hurricane Michael Could Bring 13 Feet of S...,landfall,https://earther.gizmodo.com/why-hurricane-mich...,https://i.kinja-img.com/gawker-media/image/upl...,0
2,Tom McKay,"On Tuesday, Google announced yet another massi...","On Tuesday, Google announced yet another massi...",2018-07-17T23:27:00Z,"{'id': None, 'name': 'Gizmodo.com'}",,Gizmodo.com,Google Announces Plan to Lay Massive Subsea Ca...,landfall,https://gizmodo.com/google-announces-plan-to-l...,https://i.kinja-img.com/gawker-media/image/upl...,0
3,https://www.facebook.com/bbcnews,A selection of photos from across Africa and A...,A selection of the week's best photos from acr...,2019-03-29T00:58:15Z,"{'id': 'bbc-news', 'name': 'BBC News'}",bbc-news,BBC News,Africa's week in pictures: 22-28 March 2019,landfall,https://www.bbc.co.uk/news/world-africa-47734545,https://ichef.bbci.co.uk/news/1024/branded_new...,0
4,Chloe Bryan,Hurricane Michael made landfall in the Florida...,Hurricane Michael made landfall in the Florida...,2018-10-11T15:35:08Z,"{'id': 'mashable', 'name': 'Mashable'}",mashable,Mashable,Photos show widespread destruction in the afte...,landfall,https://mashable.com/article/hurricane-michael...,https://i.amz.mshcdn.com/VPO6iYdrwgOdpUY9wW3TF...,0


In [16]:
df_final_consolidated['yes_disaster'].value_counts()

0    17504
1     8572
Name: yes_disaster, dtype: int64

In [17]:
# Shape the frame would have if rows sharing the same (content, description) pair
# were collapsed to their last occurrence. The result is only displayed, not
# assigned, so df_final_consolidated itself still contains the duplicates.
df_final_consolidated.drop_duplicates(subset=['content','description'],keep='last').shape

(18292, 12)

In [19]:
df_final_consolidated.description[1]

'Hurricane Michael is on track to make landfall in Florida’s Panhandle Wednesday as one of the fiercest storms the region has ever seen. While Florence drenched the Carolinas with record rainfall and other hurricanes attack with wind, Michael’s main threat wil…'

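Note: The classes are imbalanced (17,504 rows with `yes_disaster = 0` vs. 8,572 rows with `yes_disaster = 1`), so we need a function that produces a balanced dataset by randomly downsampling the majority class.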

In [23]:
def balance_class(df, label_col='yes_disaster', random_state=None):
    '''
    Return a class-balanced copy of `df` by downsampling the larger class.

    The frame is split into the rows labelled 1 and the rows labelled 0. The
    larger group is randomly sampled (without replacement) down to the size of
    the smaller one, and the two groups are concatenated with the 1-rows first
    and a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing a binary (0/1) label column.
    label_col : str, default 'yes_disaster'
        Name of the binary label column.
    random_state : int, numpy RandomState/Generator or None, default None
        Seed forwarded to DataFrame.sample. Pass an integer to make the
        selection reproducible; None keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of 1-rows and 0-rows. `df` itself is
        not modified. If either class is absent, the result is empty.
    '''
    ones = df[df[label_col] == 1]
    zeros = df[df[label_col] == 0]

    # Target size per class: the size of the smaller class. Without this, sampling
    # would raise whenever the 0-rows are fewer than the 1-rows.
    per_class = min(len(ones), len(zeros))

    # The 1-rows are only sampled when they are the majority, so their original
    # order is preserved in the common case where they are the minority.
    if len(ones) > per_class:
        ones = ones.sample(n=per_class, replace=False, random_state=random_state)

    # The 0-rows are always drawn through sample(), as in the original code.
    zeros = zeros.sample(n=per_class, replace=False, random_state=random_state)

    return pd.concat([ones, zeros], ignore_index=True)

In [27]:
# Sanity check: after balancing, both classes should report the same row count.
# Which class-0 rows are kept is chosen at random inside balance_class.
test_df = balance_class(df_final_consolidated)
test_df['yes_disaster'].value_counts()

1    8572
0    8572
Name: yes_disaster, dtype: int64

In [54]:
def tokenizer_lemmatizer (text, frame=None): 
    '''
    Tokenize, lemmatize and clean one text column of a DataFrame, in place.

    Every value of the column is lower-cased, split into tokens, lemmatized
    with WordNet, stripped of all non-letter characters and re-joined into one
    space-separated string. The cleaned strings overwrite the original column.

    Parameters
    ----------
    text : str
        Name of the column to process.
    frame : pandas.DataFrame, optional
        DataFrame that holds the column. When omitted, the notebook-level
        variable `df` is used, which is what one-argument calls rely on.

    Returns
    -------
    None
        The column is overwritten in `frame` and two progress lines are
        printed. Nothing is returned, so a call at the end of a cell does not
        dump the whole corpus as output.
    '''
    if frame is None:
        frame = df  # legacy behaviour: operate on the notebook-level DataFrame

    # Raw string so that \w, \$, \d and \S reach the regex engine untouched
    # instead of being parsed as (invalid) Python string escapes.
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    lemmatizer = WordNetLemmatizer()

    # Presumably lemmatizer/contraction leftovers (e.g. 'was' -> 'wa') that carry
    # no meaning; they are removed word by word below.
    words_not_used = {'wa', 've', 'ha', 'don'}

    # Missing values become empty strings rather than the literal text 'nan'.
    posts = frame[text].fillna('').astype(str).str.lower()
    tokens = [tokenizer.tokenize(post) for post in posts]

    lems = []
    for post in tokens:
        # Lemmatize first, then strip every non-letter character.
        cleaned = (re.sub("[^a-zA-Z]", "", lemmatizer.lemmatize(word)) for word in post)
        # Skip tokens that became empty (numbers, punctuation) so no double
        # spaces are left behind, and skip the unwanted stubs. Filtering words
        # rather than whole posts keeps exactly one output string per input row,
        # so the assignment below can never hit a length mismatch.
        kept = [word for word in cleaned if word and word not in words_not_used]
        lems.append(" ".join(kept))

    frame[text] = lems #overwrite the column with the cleaned text

    print (f'tokenizer processed: {len(tokens)}')
    print (f'lemmatizer processed: {len(lems)}')

In [55]:
# Run the same cleaning pipeline over each free-text column, in this order.
for column_name in ('content', 'description', 'title'):
    tokenizer_lemmatizer(column_name)

tokenizer processed: 8047
lemmatizer processed: 8047


In [13]:
df.description[1]

'ied blast in kashmir km from pulwama terror attack spot no casualty hindustan time an improvised explosive device ied went off in a jammu and kashmir village in awantipora on saturday barely  km from the february  pulwama attack view full coverag'

**Checkpoint**: save to work-in-progress CSV file

In [56]:
# df.to_csv('../data/'+'tokenizer_lemmatizer'+'.csv' ,index = False, sep = ",") 