Importing important libraries


In [249]:

import pandas as pd #for taking data from csv file
import nltk
from nltk.corpus import stopwords #for removing stopwords
from nltk.stem import WordNetLemmatizer #for lemmatization
from sklearn.feature_extraction.text import CountVectorizer #for converting text data to numerical data
stops = set(stopwords.words("english")) #set of stopwords
wordnet = WordNetLemmatizer() #object of WordNetLemmatizer
vectorizer =CountVectorizer() #object of TfidfVectorizer
import re #for regular expression

Importing dataset

In [250]:
df = pd.read_csv('train.csv', header = 0 , delimiter = ',')
print(df.head())
print(df.describe())

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  
                 id      target
count   7613.000000  7613.00000
mean    5441.934848     0.42966
std     3137.116090     0.49506
min        1.000000     0.00000
25%     2734.000000     0.00000
50%     5408.000000     0.00000
75%     8146.000000     1.00000
max    10873.000000     1.00000


In [251]:
df.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

Preprocessing only the text column of dataframe

In [252]:
review_text = df['text']
print(review_text[0])
corpus = []
for i in range(len(review_text)):
    review = re.sub(r'https?://\S+|www\.\S+', '', review_text[i])
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if not word in stops]
    review = ' '.join(review)
    corpus.append(review)
corpus

Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


['deed reason earthquake may allah forgive u',
 'forest fire near la ronge sask canada',
 'resident asked shelter place notified officer evacuation shelter place order expected',
 'people receive wildfire evacuation order california',
 'got sent photo ruby alaska smoke wildfire pours school',
 'rockyfire update california hwy closed direction due lake county fire cafire wildfire',
 'flood disaster heavy rain cause flash flooding street manitou colorado spring area',
 'top hill see fire wood',
 'emergency evacuation happening building across street',
 'afraid tornado coming area',
 'three people died heat wave far',
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding',
 'raining flooding florida tampabay tampa day lost count',
 'flood bago myanmar arrived bago',
 'damage school bus multi car crash breaking',
 'man',
 'love fruit',
 'summer lovely',
 'car fast',
 'goooooooaaaaaal',
 'ridiculous',
 'london cool',
 'love skiing',
 'wonderful day',


Splitting the dataset :

In [253]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, df['target'], test_size = 0.2, random_state = 42)


X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

Import the ML Algorithm

In [254]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred

array([0, 0, 1, ..., 1, 1, 0], dtype=int64)

Check Accuracy of model

In [255]:
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.7944845699277742
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       874
           1       0.78      0.73      0.75       649

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



Exporting model and vocabulary of dataset as a file

In [256]:
import joblib
joblib.dump(model, 'Disaster_tweet_model.pkl')
joblib.dump(vectorizer, 'Disaster_tweet_vocabulary.pkl')

['Disaster_tweet_vocabulary.pkl']