In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [3]:
nltk.download('stopwords')
df = pd.read_csv("disaster_tweets_data(DS).csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noo89\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df.dropna(inplace=True)

In [5]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text) # Remove punctuation/numbers
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    return ' '.join(words)

In [6]:
df['cleaned_tweet'] = df['tweets'].apply(preprocess)

In [7]:
cv = CountVectorizer()
X = cv.fit_transform(df['cleaned_tweet'])

In [8]:
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"------ {model_name} ------")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

train_and_evaluate(MultinomialNB(), "Multinomial Naive Bayes")
train_and_evaluate(LogisticRegression(max_iter=1000), "Logistic Regression")
train_and_evaluate(KNeighborsClassifier(), "KNN Classifier")

------ Multinomial Naive Bayes ------
Accuracy: 0.8030203545633617
Confusion Matrix:
 [[746 128]
 [172 477]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83       874
           1       0.79      0.73      0.76       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.80      1523
weighted avg       0.80      0.80      0.80      1523

------ Logistic Regression ------
Accuracy: 0.7820091923834537
Confusion Matrix:
 [[732 142]
 [190 459]]
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.84      0.82       874
           1       0.76      0.71      0.73       649

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523

------ KNN Classifier ------
Accuracy: 0.587655942219304
Confusion Matrix:
 [[389 48