# Natural Language Processing with Disaster Tweets
## Predict which Tweets are about real disasters and which ones are not

#### by Abhiram Rishi Prattipati

Reference: 
https://spotintelligence.com/2023/02/22/logistic-regression-text-classification-python/

In [1]:
# Necessary libraries and tools

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import re

import numpy as np 
import pandas as pd 

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [2]:
train_dataframe = pd.read_csv("nlp_data/train.csv")
test_dataframe = pd.read_csv("nlp_data/test.csv")

In [3]:
train_dataframe.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test_dataframe.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Part 1 - With TF-IDF and Logistic Regression

In [5]:
tweettoken = TweetTokenizer(strip_handles=True, reduce_len=True)
lemmatizer=WordNetLemmatizer()
stemmer=PorterStemmer()

In [6]:
def text_preprocess(text):
    # substitutes characters that are not alphaphetic to space
    text =re.sub('[^a-zA-Z]'," ",text)
    # converts to lowercase
    text = text.lower()
    #tokenizes the tweet
    res=tweettoken.tokenize(text)
    #removes stopwords
    for i in res:
        if i in stopwords.words('english'):
            res.remove(i)
    rest=[]
    # lemmatizes each word in res
    for k in res:
        rest.append(lemmatizer.lemmatize(k))
    # converts the list into string and spaces each element in list
    ret=" ".join(rest)
    
    return ret

In [7]:
train_dataframe['processed_text'] = train_dataframe['text'].map(text_preprocess)

In [8]:

train_dataframe.head()

Unnamed: 0,id,keyword,location,text,target,processed_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed the reason this earthquake may allah forg...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place being notified of...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [9]:
x = np.array(train_dataframe['processed_text'])
y = np.array(train_dataframe['target'])

(x_train,x_test,y_train,y_test) = train_test_split(x, y, train_size=0.75, random_state=42)

## TF-IDF

In [10]:
tf_idf = TfidfVectorizer(max_features=4000)

In [11]:
X_train_tfidf_vector = tf_idf.fit_transform(x_train)
X_test_tfidf_vector = tf_idf.transform(x_test)

## Logistic Regression

In [12]:
logReg = LogisticRegression(random_state=42, max_iter=1000)

In [13]:
model = logReg.fit(X_train_tfidf_vector, y_train)

In [14]:
y_pred = model.predict(X_test_tfidf_vector)
y_pred

array([0, 0, 1, ..., 1, 0, 1])

In [15]:
# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8135504201680672


# Submission

In [16]:
submission_dataframe = pd.read_csv("nlp_data/sample_submission.csv")

In [17]:
test_dataframe['processed_text'] = test_dataframe['text'].map(text_preprocess)

In [18]:
x_vect_test = tf_idf.transform(np.array(test_dataframe['processed_text']))

In [19]:
submission_dataframe["target"] = model.predict(x_vect_test)

In [20]:
submission_dataframe.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [21]:
submission_dataframe.to_csv("submission.csv", index=False)