# Natural Language Processing with Disaster Tweets
## Predict which Tweets are about real disasters and which ones are not

#### by Abhiram Rishi Prattipati

In [1]:
# Necessary libraries and tools

import re

import numpy as np 
import pandas as pd 

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import xgboost
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train_dataframe = pd.read_csv("nlp_data/train.csv")
test_dataframe = pd.read_csv("nlp_data/test.csv")

In [3]:
train_dataframe.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_dataframe.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
tweettoken = TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer=WordNetLemmatizer()
stemmer=PorterStemmer()

In [6]:
# Process the text

def preprocess(t):
    # substitutes characters that are not alphaphetic to space
    tee=re.sub('[^a-zA-Z]'," ",t)
    # converts to lowercase
    tee=tee.lower()
    #tokenizes the tweet
    res=tweettoken.tokenize(tee)
    #removes stopwords
    for i in res:
        if i in stopwords.words('english'):
            res.remove(i)
    rest=[]
    # lemmatizes each word in res
    for k in res:
        rest.append(lemmatizer.lemmatize(k))
    # converts the list into string and spaces each element in list
    ret=" ".join(rest)
    
    return ret
    
    

In [7]:
train_dataframe['processed_text'] = train_dataframe['text'].map(preprocess)

In [8]:
train_dataframe.head()

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed the reason this earthquake may allah forg...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place being notified of...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


### TFIDF 

In [16]:
X = np.array(train_dataframe['processed_text'])
Y = np.array(train_dataframe['target'])

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75, random_state = 42)

In [17]:
tf_idf = TfidfVectorizer(max_features=4000)

In [18]:
X

array(['deed the reason this earthquake may allah forgive u',
       'forest fire near la ronge sask canada',
       'resident asked shelter place being notified officer other evacuation shelter place order expected',
       ..., 'utc km of volcano hawaii http co zdtoyd ebj',
       'police investigating an e bike collided a car little portugal e bike rider suffered serious non life threatening injury',
       'latest home razed northern california wildfire abc news http co ymy rskq'],
      dtype=object)

In [19]:
X_train

array(['dicehateme puppyshogun make sense paper beat rock paper come wood wood be able support obliterate rock',
       'catoinstitute cause federal failure deeply structural they not easily solved http co h xcax jbu',
       'well i chaning ipad screen fucking exploded glass went over place look like job going need new one',
       ...,
       'omron hem c automatic blood pressure monitor standard large bp cuff http co gjbainqwn http co jphgpl c x',
       'official say quarantine in place an alabama home a possible ebola case developing symptom http co rqkk uhey',
       'moved england five year ago today a whirlwind time ha http co easlgea b'],
      dtype=object)

In [20]:
len(X)

7613

In [21]:
len(X_train)

5709

In [22]:
tf_idf.fit_transform(X_train)

<5709x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 52935 stored elements in Compressed Sparse Row format>

In [23]:
tf_idf.transform(X_test)

<1904x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 16557 stored elements in Compressed Sparse Row format>