# Disaster Text Classification Model
Train and evaluate text classifiers (a TF-IDF + random-forest baseline and a soft-voting ensemble) that predict the disaster type from a short text description.

In [51]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [52]:
# Load data: the file is parsed as CSV despite its .txt extension; the
# 'label' column holds the disaster type.
data_path = 'disasters/All the Texts.txt'
df = pd.read_csv(data_path)

# value_counts() ignores missing labels by default, so the per-class counts
# can add up to fewer rows than the shape reports.
print(f"Dataset shape: {df.shape}")
print(f"\nClass distribution:\n{df['label'].value_counts()}")

Dataset shape: (581, 2)

Class distribution:
label
Fire          65
Earthquake    65
Hurricane     65
Flood         63
Landslide     61
Accident      61
Drought       57
Tsunami       57
Name: count, dtype: int64


In [53]:
# Clean data: discard every row that has a missing value in any column
df = df.dropna()

# Prepare data: raw text is the feature, the disaster label is the target
X, y = df['text'], df['label']

# Split data: hold out 20% for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [54]:
# Vectorize text: TF-IDF features with English stop words removed and the
# vocabulary capped at 1000 terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

# The vocabulary and IDF weights are learned from the training split only and
# then reused unchanged on the test split, so nothing from the test data
# influences the fitted features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [55]:
# Train model: 100-tree random forest, seeded so the fit is repeatable
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train_vec, y_train)

# Predictions on the held-out split
y_pred = model.predict(X_test_vec)

# Evaluate: overall accuracy first, then per-class precision / recall / F1
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print(f"Accuracy: {baseline_accuracy:.3f}")
print(f"\nClassification Report:\n{baseline_report}")

Accuracy: 0.808

Classification Report:
              precision    recall  f1-score   support

    Accident       1.00      0.58      0.74        12
     Drought       0.92      1.00      0.96        12
  Earthquake       0.92      0.85      0.88        13
        Fire       0.58      0.85      0.69        13
       Flood       0.90      0.69      0.78        13
   Hurricane       0.62      0.62      0.62        13
   Landslide       1.00      1.00      1.00        12
     Tsunami       0.77      0.91      0.83        11

    accuracy                           0.81        99
   macro avg       0.84      0.81      0.81        99
weighted avg       0.84      0.81      0.81        99



In [56]:
# Save model and vectorizer: both are persisted because the model can only
# consume features produced by this particular fitted vectorizer.
for artifact, filename in (
    (model, 'disaster_model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    joblib.dump(artifact, filename)
print("Model saved successfully!")

Model saved successfully!


In [57]:
# Prediction function
def predict_disaster(text):
    """Classify one text description with the baseline model.

    Relies on the notebook-level ``vectorizer`` and ``model`` objects, which
    must already be fitted when this function is called.

    Parameters
    ----------
    text : str
        A single free-text description.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the predicted class label and the
        probability the model assigns to that label.
    """
    text_vec = vectorizer.transform([text])
    # Run inference once and derive both outputs from the same probability
    # row. The columns of predict_proba follow model.classes_, so the arg-max
    # column identifies the predicted label. This avoids a second pass
    # through the model via predict() and guarantees that the returned label
    # and confidence always refer to the same class.
    class_probabilities = model.predict_proba(text_vec)[0]
    best = class_probabilities.argmax()
    prediction = model.classes_[best]
    probability = class_probabilities[best]
    return prediction, probability

# Test predictions: a few hand-written descriptions as a quick sanity check
test_texts = [
    "The building collapsed after strong tremors",
    "Water levels are rising rapidly in the city",
    "Smoke is spreading across the forest",
]

# map() is lazy, so each text is classified right before its result is printed
for text, (pred, prob) in zip(test_texts, map(predict_disaster, test_texts)):
    print(f"Text: '{text}'")
    print(f"Prediction: {pred} (Confidence: {prob:.3f})\n")

Text: 'The building collapsed after strong tremors'
Prediction: Earthquake (Confidence: 0.520)

Text: 'Water levels are rising rapidly in the city'
Prediction: Drought (Confidence: 0.530)

Text: 'Smoke is spreading across the forest'
Prediction: Fire (Confidence: 0.970)



In [58]:
# Alternative model: soft-voting ensemble over unigram + bigram TF-IDF features.
# NOTE: this was built to reach 80%+ accuracy, but in the recorded run it
# scored 0.758, below the 0.808 of the random-forest baseline. Compare the two
# reports before preferring the artifacts saved at the end of this cell.
# (SVC, MultinomialNB, LogisticRegression and VotingClassifier are imported in
# the notebook's import cell.)

# Text features: unigrams and bigrams, English stop words removed, vocabulary
# capped at 500 terms; fitted on the training split only
vectorizer_improved = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1, 2))
X_train_improved = vectorizer_improved.fit_transform(X_train)
X_test_improved = vectorizer_improved.transform(X_test)

# Ensemble model: soft voting averages the members' predicted class
# probabilities, which is why the SVC is created with probability=True
ensemble = VotingClassifier([
    ('svm', SVC(kernel='linear', probability=True, random_state=42)),
    ('nb', MultinomialNB()),
    ('lr', LogisticRegression(random_state=42, max_iter=1000))
], voting='soft')

ensemble.fit(X_train_improved, y_train)
y_pred_improved = ensemble.predict(X_test_improved)

print(f"Improved Accuracy: {accuracy_score(y_test, y_pred_improved):.3f}")
print(f"\nImproved Report:\n{classification_report(y_test, y_pred_improved)}")

# Persist the ensemble together with the vectorizer it was trained on
joblib.dump(ensemble, 'disaster_model_improved.pkl')
joblib.dump(vectorizer_improved, 'vectorizer_improved.pkl')

Improved Accuracy: 0.758

Improved Report:
              precision    recall  f1-score   support

    Accident       0.86      0.50      0.63        12
     Drought       0.90      0.75      0.82        12
  Earthquake       0.92      0.85      0.88        13
        Fire       0.58      0.85      0.69        13
       Flood       0.62      0.62      0.62        13
   Hurricane       0.75      0.69      0.72        13
   Landslide       0.86      1.00      0.92        12
     Tsunami       0.75      0.82      0.78        11

    accuracy                           0.76        99
   macro avg       0.78      0.76      0.76        99
weighted avg       0.78      0.76      0.76        99



['vectorizer_improved.pkl']

In [59]:
# Test improved model
def predict_disaster_improved(text):
    """Classify one text description with the voting ensemble.

    Relies on the notebook-level ``vectorizer_improved`` and ``ensemble``
    objects, which must already be fitted when this function is called.

    Parameters
    ----------
    text : str
        A single free-text description.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the predicted class label and the
        probability the ensemble assigns to that label.
    """
    text_vec = vectorizer_improved.transform([text])
    # Run the ensemble once and derive both outputs from the same probability
    # row. The columns of predict_proba follow ensemble.classes_, so the
    # arg-max column identifies the predicted label. This avoids evaluating
    # every member model a second time via predict() and guarantees that the
    # returned label and confidence always refer to the same class.
    class_probabilities = ensemble.predict_proba(text_vec)[0]
    best = class_probabilities.argmax()
    prediction = ensemble.classes_[best]
    probability = class_probabilities[best]
    return prediction, probability

# Hand-written descriptions used as a quick sanity check of the ensemble
test_texts = [
    "The building collapsed after strong tremors",
    "Water levels are rising rapidly",
    "Smoke is spreading across the forest",
]

# map() is lazy, so each text is classified right before its result is printed
for text, (pred, prob) in zip(test_texts, map(predict_disaster_improved, test_texts)):
    print(f"'{text}' -> {pred} ({prob:.3f})")

'The building collapsed after strong tremors' -> Earthquake (0.586)
'Water levels are rising rapidly' -> Flood (0.533)
'Smoke is spreading across the forest' -> Fire (0.673)
