# Text Classification: Natural Language Processing with Disaster Tweets

## Downloading the Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [2]:
# Load the training data into a DataFrame.
# 'train.csv' is expected to be in the current working directory.
train_df = pd.read_csv(
    'train.csv',
    sep=',',       # field separator
    header=0,      # column names are taken from row 0 of the file
    usecols=['id', 'keyword', 'location', 'text', 'target'],
)

# Display the dataframe (the notebook renders the cell's last expression)
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Basic Text Preprocessing

We'll first create `text_target_train_data` and `text_target_test_data`, versions of the training and testing datasets that keep only the tweet text (plus the `target` label for the training set) and drop the `id`, `keyword`, and `location` columns.

In [3]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
# Metadata columns that are not used as model inputs
drop_cols = ['id', 'keyword', 'location']

# `drop` returns a new frame, so `train_df` itself is left untouched.
text_target_train_data = train_df.drop(columns=drop_cols)

In [5]:
# Display the reduced dataframe (only the `text` and `target` columns remain).
text_target_train_data

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...
7608,Two giant cranes holding a bridge collapse int...,1
7609,@aria_ahrary @TheTawniest The out of control w...,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,Police investigating after an e-bike collided ...,1


Now, we use NLTK (the Natural Language Toolkit) to remove stopwords and, optionally, stem words, and regular expressions to delete common Twitter-specific elements (such as user tags and links).

In [6]:
import nltk   # tokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import string
import re   # regular expressions

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Resources shared by the preprocessing functions below.
#
# The stopword corpus is looked up through `nltk.corpus` rather than through
# the imported name `stopwords`, because this cell rebinds `stopwords` to the
# word collection itself. Going through `nltk.corpus` keeps the cell safe to
# re-run (the rebound name has no `.words` method).
# A set is used so that `word in stopwords` is a constant-time lookup.
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'\w+')   # keeps runs of word characters only

# Given Twitter data, we also want to delete user tags and links
tags = r'@\w*'                          # @mentions
html = r'<.*?>'                         # HTML/XML-style tags (non-greedy)
urls = r'https?://\S+|www\.\S+'         # http(s):// links and bare www. links
non_ascii_pattern = r'[^\x00-\x7F]'     # any character outside the ASCII range

In [8]:
def remove_punctuation(text):
    """Return `text` with every character of `string.punctuation` deleted.

    Characters are removed outright (not replaced by a space), so tokens
    joined by punctuation are merged, e.g. "e-bike" becomes "ebike".
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [9]:
def preprocess_text(sentence, stem=False):
    """Clean one raw tweet and return its remaining tokens.

    Steps, in order:
      1. delete user tags, HTML tags, URLs and non-ASCII characters
         (module-level patterns `tags`, `html`, `urls`, `non_ascii_pattern`);
      2. delete punctuation and lower-case the text;
      3. split on whitespace and drop stopwords and any word containing a digit;
      4. optionally stem each remaining word with `stemmer`;
      5. re-tokenize the result with `tokenizer`.

    Parameters
    ----------
    sentence : str
        The raw tweet text.
    stem : bool, default False
        If True, each kept word is replaced by `stemmer.stem(word)`.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Strip Twitter/web noise first so its fragments (e.g. "http", "tco")
    # cannot survive as words once punctuation has been removed.
    cleaned = sentence
    for pattern in (tags, html, urls, non_ascii_pattern):
        cleaned = re.sub(pattern, "", cleaned)

    # Words are lower-cased here, so no further case folding is needed below.
    words = remove_punctuation(cleaned).lower().split()

    # The stopword test runs on the unstemmed word, before optional stemming.
    kept_words = [
        stemmer.stem(word) if stem else word
        for word in words
        if word not in stopwords and not any(char.isdigit() for char in word)
    ]

    return tokenizer.tokenize(" ".join(kept_words))

In [10]:
# Spot-check the cleaning on one row, using label-based `.loc` lookups.
print(f"Orignal: {text_target_train_data.loc[7609, 'text']}")
print(f"Orignal: {text_target_train_data.loc[7609, 'target']}")    # a disaster tweet
print()
print(f"Preprocessed: {preprocess_text(text_target_train_data.loc[7609, 'text'])}")

Orignal: @aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.
Orignal: 1

Preprocessed: ['control', 'wild', 'fires', 'california', 'even', 'northern', 'part', 'state', 'troubling']


In [11]:
# Same spot-check on a second row, again via label-based `.loc` lookups.
print(f"Orignal: {text_target_train_data.loc[569, 'text']}")
print(f"Orignal: {text_target_train_data.loc[569, 'target']}")    # not a disaster tweet
print()
print(f"Preprocessed: {preprocess_text(text_target_train_data.loc[569, 'text'])}")

Orignal: STAR WARS POWER OF THE JEDI COLLECTION 1 BATTLE DROID HASBRO - Full read by eBay http://t.co/yI30ZgiZsW http://t.co/2jGVhw7YZs
Orignal: 0

Preprocessed: ['star', 'wars', 'power', 'jedi', 'collection', 'battle', 'droid', 'hasbro', 'full', 'read', 'ebay']


In [12]:
# Replace each tweet by its list of cleaned tokens.
# The input is always read from the untouched raw column in `train_df`
# (same row index as `text_target_train_data`), so re-running this cell gives
# the same result instead of feeding already-tokenized lists back into
# `preprocess_text`.
text_target_train_data['text'] = train_df['text'].map(preprocess_text)

In [13]:
# Preview the first rows; `text` now holds a list of tokens per tweet.
text_target_train_data.head()

Unnamed: 0,text,target
0,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[residents, asked, shelter, place, notified, o...",1
3,"[people, receive, wildfires, evacuation, order...",1
4,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


## Training Conventional Models

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [15]:
# Extract text data from the 'text' column
text_data_train = text_target_train_data['text'].apply(lambda x: ' '.join(x))

# Convert text data to a list of strings
text_only_train_data = text_data_train.tolist()

# Fit CountVectorizer on the training data
vectorizer = CountVectorizer(max_features=2150)    # size of vocabulary fine-tuned
vectorizer.fit(text_only_train_data)

In [16]:
# Get the vocabulary learned by CountVectorizer
vocabulary = vectorizer.get_feature_names_out()

# Print the vocabulary
print(vocabulary)

['aba' 'abandoned' 'abc' ... 'yyc' 'zombie' 'zone']


Since the testing set doesn't have target labels (which makes it difficult to evaluate the model's performance), let us split the training set into a training set and a validation set.

In [17]:
# Transform text data using the vectorizer
X_train_bow = vectorizer.transform(text_only_train_data)
y_train = text_target_train_data['target']

X_train, X_val, y_train, y_val = train_test_split(X_train_bow, y_train, test_size=0.2, random_state=42)

print(X_train.shape)
print(len(y_train))

(6090, 2150)
6090


## Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [19]:
# Baseline: logistic regression with scikit-learn's default settings,
# trained on the bag-of-words features.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predictions on both splits, so training and validation quality can be compared
y_train_pred, y_val_pred = logreg.predict(X_train), logreg.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(
    classification_report(y_train, y_train_pred),
    classification_report(y_val, y_val_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.87      0.94      0.90      3468
           1       0.91      0.81      0.86      2622

    accuracy                           0.89      6090
   macro avg       0.89      0.88      0.88      6090
weighted avg       0.89      0.89      0.88      6090

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       874
           1       0.79      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



### With L2 Regularization

We can see that the precision, recall, and f1-scores are alright, but not very high. Note that the size of the Bag of Words representation vector increases with the size of the vocabulary! In other words, every tweet is represented by a vector with one dimension per vocabulary word (14824-dimensional with the full vocabulary; 2150-dimensional here because of `max_features=2150`), and almost all of its entries are zero, which means that sparsity may be a problem. Sparse, high-dimensional data can lead to the model overfitting, because the model learns patterns specific to the training data and fails to generalize to our unseen (validation) data.

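We can try to tune the regularization strength to see if our model improves. In scikit-learn's `LogisticRegression` this is controlled by the hyperparameter `C`, the inverse of the regularization strength (a smaller `C` means stronger regularization). We take 6 values of `C` and examine which one results in the best validation accuracy.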

In [20]:
# Candidate values for C, the inverse regularization strength
# (smaller C = stronger L2 penalty).
regularization_strengths = [0.01, 0.1, 0.25, 0.5, 1, 5]

best_score = 0
best_C = None

for C in regularization_strengths:
    logreg_with_l2 = LogisticRegression(max_iter=1000, penalty='l2', C=C)
    logreg_with_l2.fit(X_train, y_train)

    # `score` already predicts on X_val and compares against y_val, so no
    # separate `predict` call is needed here (the earlier one was unused and
    # overwrote `y_val_pred` from the previous cell).
    score = logreg_with_l2.score(X_val, y_val)
    print(f"C={C}, Validation Accuracy: {score}")

    # Strict '>' keeps the first (smallest) C when several values tie.
    if score > best_score:
        best_score = score
        best_C = C

print(f"Best C: {best_C}, Best Validation Accuracy: {best_score}")

C=0.01, Validation Accuracy: 0.7235718975705844
C=0.1, Validation Accuracy: 0.8049901510177282
C=0.25, Validation Accuracy: 0.8049901510177282
C=0.5, Validation Accuracy: 0.8023637557452397
C=1, Validation Accuracy: 0.7984241628365069
C=5, Validation Accuracy: 0.768220617202889
Best C: 0.1, Best Validation Accuracy: 0.8049901510177282


In [21]:
# Refit with the C selected by the search above instead of a hard-coded copy
# of its result, so this cell stays correct if the candidates or data change.
# NOTE(review): this model uses solver='saga', whereas the search above ran
# with the default solver; confirm that the difference is intended.
logreg_with_l2 = LogisticRegression(max_iter=1000, penalty='l2', solver='saga', C=best_C)
logreg_with_l2.fit(X_train, y_train)

y_train_pred = logreg_with_l2.predict(X_train)
y_val_pred = logreg_with_l2.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87      3468
           1       0.91      0.70      0.79      2622

    accuracy                           0.84      6090
   macro avg       0.86      0.82      0.83      6090
weighted avg       0.85      0.84      0.84      6090

              precision    recall  f1-score   support

           0       0.78      0.91      0.84       874
           1       0.85      0.66      0.74       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



### With SVD (for Dimensionality Reduction)

In [22]:
# Convert text data to a list of strings
train_data_for_svd = text_data_train.tolist()

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

In [24]:
# TF-IDF features with default settings (no `max_features` cap, unlike the
# CountVectorizer used earlier).
# NOTE(review): fitted on all rows, before the train/validation split below.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data_for_svd)

In [25]:
# Reduce the TF-IDF matrix to 500 components.
# `random_state` is fixed so the decomposition, and therefore every metric
# computed from it, is reproducible across kernel restarts.
svd = TruncatedSVD(n_components=500, random_state=42)
# copy=False lets the normalizer rescale the array it is given in place.
normalizer = Normalizer(copy=False)

X_train_svd = svd.fit_transform(tfidf_matrix)
# Rescale each row of the reduced matrix to unit norm (Normalizer default).
X_train_svd = normalizer.fit_transform(X_train_svd)
y_train_svd = text_target_train_data['target']

In [26]:
# Sanity check: one 500-component row per tweet, and one label per row.
print(X_train_svd.shape)
print(len(y_train_svd))

(7613, 500)
7613


In [27]:
# Same 80/20 split settings and seed as for the bag-of-words features.
X_train2, X_val2, y_train2, y_val2 = train_test_split(
    X_train_svd,
    y_train_svd,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Train a fresh estimator on the SVD features instead of re-fitting `logreg`:
# re-fitting would silently replace the bag-of-words baseline model held by
# that name, leaving it out of sync with the results reported for it above.
logreg_svd = LogisticRegression()
logreg_svd.fit(X_train2, y_train2)

y_train_pred2 = logreg_svd.predict(X_train2)
y_val_pred2 = logreg_svd.predict(X_val2)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train2, y_train_pred2))
print(classification_report(y_val2, y_val_pred2))

              precision    recall  f1-score   support

           0       0.81      0.90      0.86      3468
           1       0.85      0.73      0.78      2622

    accuracy                           0.83      6090
   macro avg       0.83      0.81      0.82      6090
weighted avg       0.83      0.83      0.82      6090

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       874
           1       0.79      0.69      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



### With PCA (for Dimensionality Reduction)

In [29]:
# Plain list of the cleaned, space-joined tweets.
# NOTE(review): not referenced by the PCA cells visible below, which reuse
# `tfidf_matrix` instead; confirm whether this variable is still needed.
train_data_for_pca = text_data_train.tolist()

In [30]:
from sklearn.decomposition import PCA

In [31]:
# Initialize PCA with desired number of components
pca = PCA(n_components=500)

# Fit the model with data and apply the dimensionality reduction
X_train_pca = pca.fit_transform(tfidf_matrix.toarray())
y_train_pca = text_target_train_data['target']

In [32]:
# Sanity check: one 500-component row per tweet, and one label per row.
print(X_train_pca.shape)
print(len(y_train_pca))

(7613, 500)
7613


In [33]:
# Same 80/20 split settings and seed as for the other feature sets.
X_train3, X_val3, y_train3, y_val3 = train_test_split(
    X_train_pca,
    y_train_pca,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Train a fresh estimator on the PCA features instead of re-fitting `logreg`,
# so that the model held by that name is not silently overwritten.
logreg_pca = LogisticRegression()
logreg_pca.fit(X_train3, y_train3)

y_train_pred3 = logreg_pca.predict(X_train3)
y_val_pred3 = logreg_pca.predict(X_val3)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train3, y_train_pred3))
print(classification_report(y_val3, y_val_pred3))

              precision    recall  f1-score   support

           0       0.79      0.93      0.86      3468
           1       0.88      0.67      0.76      2622

    accuracy                           0.82      6090
   macro avg       0.84      0.80      0.81      6090
weighted avg       0.83      0.82      0.82      6090

              precision    recall  f1-score   support

           0       0.77      0.91      0.84       874
           1       0.85      0.64      0.73       649

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.81      0.80      0.79      1523



## Naive Bayes Classifier

In [35]:
from sklearn.naive_bayes import MultinomialNB

In [36]:
# Multinomial Naive Bayes with default settings on the bag-of-words counts
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Predictions on both splits
y_train_pred, y_val_pred = naive_bayes.predict(X_train), naive_bayes.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(
    classification_report(y_train, y_train_pred),
    classification_report(y_val, y_val_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      3468
           1       0.86      0.76      0.80      2622

    accuracy                           0.84      6090
   macro avg       0.84      0.83      0.84      6090
weighted avg       0.84      0.84      0.84      6090

              precision    recall  f1-score   support

           0       0.80      0.86      0.83       874
           1       0.79      0.70      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



## SVM

In [37]:
from sklearn.svm import SVC

We can first try an SVM model with a linear kernel, since it is computationally efficient and often a good starting point. However, from our previous analysis with the logistic regression and Naive Bayes classifiers, we may infer that the data is not linearly separable, and thus the performance may not be very good.

In [38]:
# Support vector classifier with a linear kernel on the bag-of-words counts
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train, y_train)

# Predictions on both splits
y_train_pred, y_val_pred = svm_linear.predict(X_train), svm_linear.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(
    classification_report(y_train, y_train_pred),
    classification_report(y_val, y_val_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.89      0.95      0.92      3468
           1       0.93      0.84      0.88      2622

    accuracy                           0.91      6090
   macro avg       0.91      0.90      0.90      6090
weighted avg       0.91      0.91      0.90      6090

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       874
           1       0.75      0.71      0.73       649

    accuracy                           0.78      1523
   macro avg       0.77      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



Instead, now let us try to use a Radial Basis Function (RBF) kernel. We typically use this kernel when the decision boundary is highly non-linear or not well-defined. Note that the gamma parameter controls the smoothness of the decision boundary. Smaller values of gamma lead to smoother decision boundaries.

In [39]:
# Support vector classifier with an RBF kernel; gamma='scale' lets
# scikit-learn derive the kernel width from the training features.
svm_rbf = SVC(kernel='rbf', gamma='scale')
svm_rbf.fit(X_train, y_train)

# Predictions on both splits
y_train_pred, y_val_pred = svm_rbf.predict(X_train), svm_rbf.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(
    classification_report(y_train, y_train_pred),
    classification_report(y_val, y_val_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      3468
           1       0.97      0.86      0.91      2622

    accuracy                           0.93      6090
   macro avg       0.93      0.92      0.92      6090
weighted avg       0.93      0.93      0.92      6090

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.68      0.75       649

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



## Decision Trees Classifier

In [40]:
from sklearn import tree

In [41]:
# Decision tree with default hyperparameters.
# `random_state` is fixed so the fitted tree, and the metrics below, are
# reproducible across kernel restarts.
dec_trees = tree.DecisionTreeClassifier(random_state=42)
dec_trees.fit(X_train, y_train)

y_train_pred = dec_trees.predict(X_train)
y_val_pred = dec_trees.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3468
           1       0.99      0.96      0.98      2622

    accuracy                           0.98      6090
   macro avg       0.98      0.98      0.98      6090
weighted avg       0.98      0.98      0.98      6090

              precision    recall  f1-score   support

           0       0.75      0.72      0.74       874
           1       0.64      0.67      0.66       649

    accuracy                           0.70      1523
   macro avg       0.70      0.70      0.70      1523
weighted avg       0.70      0.70      0.70      1523



## Random Forest Classifier

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
# Random forest with default hyperparameters.
# Bootstrapping and feature sub-sampling are random, so `random_state` is
# fixed to make the reported metrics reproducible.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

y_train_pred = random_forest.predict(X_train)
y_val_pred = random_forest.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3468
           1       0.98      0.97      0.98      2622

    accuracy                           0.98      6090
   macro avg       0.98      0.98      0.98      6090
weighted avg       0.98      0.98      0.98      6090

              precision    recall  f1-score   support

           0       0.77      0.76      0.77       874
           1       0.68      0.70      0.69       649

    accuracy                           0.73      1523
   macro avg       0.73      0.73      0.73      1523
weighted avg       0.74      0.73      0.74      1523



## Gradient Boosting Classifier

In [44]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
# Gradient boosting with default hyperparameters.
# `random_state` is fixed so that repeated runs produce the same model.
grad_boost = GradientBoostingClassifier(random_state=42)
grad_boost.fit(X_train, y_train)

y_train_pred = grad_boost.predict(X_train)
y_val_pred = grad_boost.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.72      0.97      0.83      3468
           1       0.92      0.50      0.65      2622

    accuracy                           0.77      6090
   macro avg       0.82      0.74      0.74      6090
weighted avg       0.81      0.77      0.75      6090

              precision    recall  f1-score   support

           0       0.71      0.93      0.80       874
           1       0.84      0.48      0.61       649

    accuracy                           0.74      1523
   macro avg       0.77      0.71      0.71      1523
weighted avg       0.76      0.74      0.72      1523



In [46]:
# Tuned these parameters via gridsearch (see the GridSearchCV cell further
# down in the notebook).
# `random_state` is fixed so that repeated runs produce the same model.
grad_boost_tuned = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.15,
    max_depth=10,
    random_state=42,
)
grad_boost_tuned.fit(X_train, y_train)

y_train_pred = grad_boost_tuned.predict(X_train)
y_val_pred = grad_boost_tuned.predict(X_val)

# Evaluate the performance: training report first, validation report second
print(classification_report(y_train, y_train_pred))
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      3468
           1       0.98      0.87      0.93      2622

    accuracy                           0.94      6090
   macro avg       0.95      0.93      0.94      6090
weighted avg       0.94      0.94      0.94      6090

              precision    recall  f1-score   support

           0       0.79      0.87      0.83       874
           1       0.80      0.68      0.74       649

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.78      1523
weighted avg       0.79      0.79      0.79      1523



In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search over (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [100, 200, 300],    # Number of boosting stages
    'learning_rate': [0.1, 0.15, 0.2],  # Learning rate
    'max_depth': [5, 8, 10]             # Max depth of individual trees
}

# Fixed seed so that every candidate is fitted reproducibly.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Perform grid search using 5-fold cross-validation on the training split.
# n_jobs=-1 runs the candidate fits in parallel on all available cores; it
# changes only the run time, not the selected parameters.
grid_search = GridSearchCV(
    estimator=gb_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

# Evaluate the model with the best parameters on the held-out validation set
# (the test set has no labels, so it cannot be scored).
best_model = grid_search.best_estimator_
val_accuracy = best_model.score(X_val, y_val)
print("Val accuracy with best parameters:", val_accuracy)