In [1]:
#Importing useful Libraries
import os

import numpy as np
import pandas as pd

In [2]:
# Load the labelled disaster-tweets CSV into a DataFrame.
# The file location can be overridden with the DISASTER_TWEETS_CSV environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset, the original author's local path is used, so existing runs are unchanged.
DATA_PATH = os.environ.get(
    "DISASTER_TWEETS_CSV",
    r"/Users/alekhdixit/DS/Projects/NLP/Disaster/disaster_tweets.csv",
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# (rows, columns) of the freshly loaded frame
data.shape

(7613, 5)

In [4]:
# Distinct keyword values, NaN included; multi-word keywords appear with '%20'
# between the words (e.g. 'airplane%20accident')
data['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane%20accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew%20up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown%20up', 'body%20bag', 'body%20bagging',
       'body%20bags', 'bomb', 'bombed', 'bombing', 'bridge%20collapse',
       'buildings%20burning', 'buildings%20on%20fire', 'burned',
       'burning', 'burning%20buildings', 'bush%20fires', 'casualties',
       'casualty', 'catastrophe', 'catastrophic', 'chemical%20emergency',
       'cliff%20fall', 'collapse', 'collapsed', 'collide', 'collided',
       'collision', 'crash', 'crashed', 'crush', 'crushed', 'curfew',
       'cyclone', 'damage', 'danger', 'dead', 'death', 'deaths', 'debris',
       'deluge', 'deluged', 'demolish', 'demolished', 'demolition',
       'derail', 'der

# Data Cleaning

In [5]:
# Count the missing entries in every column of the raw frame
data.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [6]:
# Number of distinct location values (nunique skips NaN by default)
data['location'].nunique()

3341

In [7]:
# Drop the location column: it is missing for 2533 tweets and has 3341 distinct
# values, so it is not used in this analysis.
# errors='ignore' together with rebinding (instead of inplace=True) keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises a KeyError.
data = data.drop(columns=['location'], errors='ignore')
data.head()

Unnamed: 0,id,keyword,text,target
0,1,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,Forest fire near La Ronge Sask. Canada,1
2,5,,All residents asked to 'shelter in place' are ...,1
3,6,,"13,000 people receive #wildfires evacuation or...",1
4,7,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
# Re-check missing values per column now that location has been removed
data.isna().sum()

id          0
keyword    61
text        0
target      0
dtype: int64

In [9]:
# Drop the rows whose keyword is missing.
# subset=['keyword'] restricts the check to that column, matching the stated
# intent; a bare dropna() would discard a row for a NaN in any column.
# The original row labels are kept (no index reset).
data = data.dropna(subset=['keyword'])

In [10]:
# Confirm that no column contains missing values any more
data.isna().sum()

id         0
keyword    0
text       0
target     0
dtype: int64

In [11]:
# Number of rows per target label among the remaining tweets
data['target'].value_counts()

0    4323
1    3229
Name: target, dtype: int64

In [12]:
# Number of distinct keywords left once rows with a missing keyword are removed
data['keyword'].nunique()

221

# Removing non-letter characters (including digits) and converting text to lower case

In [13]:
import re

# Lower-case every tweet, replace each run of characters outside a-z (digits,
# punctuation, whitespace, non-ASCII letters) with a single space, and trim the ends.
# NOTE(review): URLs are not removed first, so they survive as fragments such as
# "http t co ..." — confirm whether that is intended.
data['text'] = [
    re.sub(r'[^a-z]+', ' ', tweet.lower()).strip()
    for tweet in data['text']
]
data.head()

Unnamed: 0,id,keyword,text,target
31,48,ablaze,bbcmtd wholesale markets ablaze http t co lhyx...,1
32,49,ablaze,we always try to bring the heavy metal rt http...,0
33,50,ablaze,africanbaze breaking news nigeria flag set abl...,1
34,52,ablaze,crying out for more set me ablaze,0
35,53,ablaze,on plus side look at the sky last night it was...,0


In [14]:
# Spot-check one cleaned tweet, looked up by its row label (45) rather than its position
data.loc[45, 'text']

'i gained followers in the last week you know your stats and grow with http t co tiyulif c'

# Defining features (X) and target (y), then splitting into train and test sets

In [15]:
# Features are the cleaned tweet text; the label is the target column
X, y = data['text'], data['target']

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the tweets for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts built from word trigrams only, with English stop words
# removed and the vocabulary capped at 1000 entries.
# NOTE(review): ngram_range=(3, 3) excludes single words and bigrams — confirm
# that a trigram-only vocabulary is intended.
cv = CountVectorizer(
    ngram_range=(3, 3),
    max_features=1000,
    stop_words='english',
)

# Learn the vocabulary on the training tweets and show the resulting matrix shape
X_train_counts = cv.fit_transform(X_train)
X_train_counts.shape

(5059, 1000)

# Logistic Regression with CountVectorizer

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Trigram count features (1000-term vocabulary, English stop words removed)
# feeding a logistic regression with default settings
text_clf = Pipeline(steps=[
    ('cv', CountVectorizer(ngram_range=(3, 3), max_features=1000, stop_words='english')),
    ('LReg', LogisticRegression()),
])

# Fit the vectorizer and the classifier together on the training split
text_clf.fit(X_train, y_train)

Pipeline(steps=[('cv',
                 CountVectorizer(max_features=1000, ngram_range=(3, 3),
                                 stop_words='english')),
                ('LReg', LogisticRegression())])

In [19]:
from sklearn import metrics

# Label the held-out tweets with the fitted pipeline
predictions = text_clf.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1422   15]
 [ 833  223]]


In [20]:
# Per-class precision, recall and F1 on the held-out tweets
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.63      0.99      0.77      1437
           1       0.94      0.21      0.34      1056

    accuracy                           0.66      2493
   macro avg       0.78      0.60      0.56      2493
weighted avg       0.76      0.66      0.59      2493



In [21]:
# Fraction of held-out tweets that were labelled correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.6598475732049739


# Splitting data into test and train

In [22]:
from sklearn.model_selection import train_test_split

# Same arguments and seed as the earlier split, so this recreates the identical
# 67/33 partition for the models that follow
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=43,
)

# Logistic Regression Model with TfidfVectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# TF-IDF features (vectorizer defaults) feeding a logistic regression with default settings
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LReg', LogisticRegression()),
])

# Fit the vectorizer and the classifier together on the training split
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('LReg', LogisticRegression())])

## Test the classifier and display results



In [24]:
from sklearn import metrics

# Label the held-out tweets with the fitted TF-IDF + logistic regression pipeline
predictions = text_clf.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1247  190]
 [ 309  747]]


In [25]:
# Per-class precision, recall and F1 on the held-out tweets
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1437
           1       0.80      0.71      0.75      1056

    accuracy                           0.80      2493
   macro avg       0.80      0.79      0.79      2493
weighted avg       0.80      0.80      0.80      2493



In [26]:
# Fraction of held-out tweets that were labelled correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.7998395507420778


# Linear SVC Model

In [27]:
from sklearn.svm import LinearSVC

# TF-IDF features (vectorizer defaults) feeding a linear support vector classifier.
# Pipeline and TfidfVectorizer are already imported by the TF-IDF logistic
# regression cell earlier in the notebook.
# random_state is pinned because LinearSVC's solver uses a random number
# generator; without a seed the fitted model, and therefore the reported
# metrics, can vary slightly from run to run. 43 matches the seed used for the
# train/test split.
text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC(random_state=43)),
])

# Fit the vectorizer and the classifier together on the training split
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [28]:
from sklearn import metrics

# Label the held-out tweets with the fitted TF-IDF + LinearSVC pipeline
predictions = text_clf.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

[[1189  248]
 [ 272  784]]


In [29]:
# Per-class precision, recall and F1 on the held-out tweets
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1437
           1       0.76      0.74      0.75      1056

    accuracy                           0.79      2493
   macro avg       0.79      0.78      0.79      2493
weighted avg       0.79      0.79      0.79      2493



In [30]:
# Fraction of held-out tweets that were labelled correctly
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.7914159647011633
