In [None]:
!pip install pandas numpy scikit-learn tqdm imbalanced-learn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import re
import warnings
# Suppress every Python warning for the rest of the session (no category or
# module filter is given, so library deprecation notices are hidden too).
warnings.filterwarnings('ignore')



In [None]:

# Load data
# Reads the tweet CSV into the notebook-wide `data` DataFrame.
# NOTE(review): the path is absolute and looks like a Colab upload location
# ("/content/..."); it must be adjusted when the notebook runs elsewhere.
print("Loading data...")
data = pd.read_csv(r'/content/cleaned_tweets (3).csv')
print(f"Data loaded: {len(data)} tweets")

Loading data...
Data loaded: 14640 tweets


In [None]:

# Enhanced text cleaning
def clean_text(text):
    """Normalize a raw tweet into lowercase, punctuation-free text.

    Steps, in order: remove URLs, @mentions, #hashtags (the tag text is
    dropped as well) and digits; strip every remaining character that is
    neither a word character nor whitespace; collapse whitespace runs into a
    single space; lowercase and trim.

    Args:
        text: Raw tweet. Any value is accepted and converted with ``str``.

    Returns:
        The cleaned string, or "" when ``text`` is missing (None/NaN).
    """
    if pd.isna(text):
        return ""
    text = str(text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Deleting tokens leaves gaps ("lost @united luggage" -> "lost  luggage");
    # collapse them so neighbouring words stay adjacent for phrase matching.
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()

# Add a `final_text` column holding the cleaned version of each row's `text`.
print("Cleaning text...")
data['final_text'] = data['text'].apply(clean_text)

Cleaning text...


In [None]:

# Enhanced Crisis Detection using both keywords and sentiment
def _compile_crisis_keywords(keywords):
    """Compile keywords into regexes anchored at the start of a word.

    The leading ``\\b`` stops a keyword from matching inside another word
    ('awful' in 'lawful', 'late' in 'plate') while still accepting inflected
    forms ('refunded', 'delays'). Words of a multi-word keyword may be
    separated by any run of whitespace.
    """
    return [
        re.compile(r'\b' + r'\s+'.join(re.escape(word) for word in keyword.split()))
        for keyword in keywords
    ]


# Strong distress signals: each keyword present in a tweet adds 3 points.
_HIGH_CRISIS_PATTERNS = _compile_crisis_keywords([
    'canceled', 'cancelled', 'stranded', 'emergency', 'disaster',
    'horrible', 'terrible', 'worst', 'chaos', 'nightmare', 'awful',
    'refund', 'compensation', 'lost luggage', 'missed connection',
    'hours delayed', 'never again', 'disgusting', 'furious'
])

# Milder complaints: each keyword present in a tweet adds 1 point.
_MEDIUM_CRISIS_PATTERNS = _compile_crisis_keywords([
    'delayed', 'late', 'poor service', 'bad experience',
    'unhappy', 'disappointed', 'frustrated', 'annoyed'
])


def enhanced_crisis_detection(tweet, sentiment):
    """
    Enhanced crisis detection combining keywords and original sentiment

    Scoring: +3 for every high-crisis keyword found, +1 for every
    medium-crisis keyword found (a keyword counts once however often it
    occurs), +2 when ``sentiment`` is 'negative', +0.5 when it is 'neutral'.

    Args:
        tweet: Tweet text; matching is case-insensitive. A non-string value
            (None, NaN, ...) is scored as empty text instead of raising.
        sentiment: Sentiment label; only the exact values 'negative' and
            'neutral' contribute to the score.

    Returns:
        'high_crisis' when the score is >= 4, 'medium_crisis' when it is
        >= 2, otherwise 'non_crisis'.
    """
    tweet_lower = tweet.lower() if isinstance(tweet, str) else ""

    # Count distinct keywords present, not total occurrences.
    high_crisis_count = sum(
        1 for pattern in _HIGH_CRISIS_PATTERNS if pattern.search(tweet_lower)
    )
    medium_crisis_count = sum(
        1 for pattern in _MEDIUM_CRISIS_PATTERNS if pattern.search(tweet_lower)
    )

    # Keyword scoring
    crisis_score = high_crisis_count * 3 + medium_crisis_count * 1

    # Sentiment scoring
    if sentiment == 'negative':
        crisis_score += 2
    elif sentiment == 'neutral':
        crisis_score += 0.5

    # Classification thresholds
    if crisis_score >= 4:
        return 'high_crisis'
    elif crisis_score >= 2:
        return 'medium_crisis'
    else:
        return 'non_crisis'


In [None]:
# Apply enhanced crisis detection
# Each row is labelled from its cleaned text plus its `airline_sentiment`
# value; the label is stored in a new `crisis_level` column.
print("Detecting crisis levels...")
data['crisis_level'] = data.apply(
    lambda row: enhanced_crisis_detection(row['final_text'], row['airline_sentiment']),
    axis=1
)

Detecting crisis levels...


In [None]:

# Convert to binary for initial model
def crisis_to_binary(crisis_level):
    """Collapse a crisis level label into a binary flag.

    Returns 1 for 'high_crisis' or 'medium_crisis' and 0 for any other value.
    """
    is_crisis = crisis_level == 'high_crisis' or crisis_level == 'medium_crisis'
    return 1 if is_crisis else 0

# Derive the 0/1 `crisis_binary` column from the three-level `crisis_level` label.
data['crisis_binary'] = data['crisis_level'].apply(crisis_to_binary)


In [None]:

# Report how the tweets spread over the crisis labels: first the three-level
# label, then the binary one.
for heading, column in (
    ("Crisis Level Distribution:", 'crisis_level'),
    ("\nBinary Crisis Distribution:", 'crisis_binary'),
):
    print(heading)
    print(data[column].value_counts())

# Fraction of rows whose binary label is 1, shown as a percentage.
crisis_share = data['crisis_binary'].sum() / len(data)
print(f"Crisis Percentage: {crisis_share * 100:.2f}%")


Crisis Level Distribution:
crisis_level
medium_crisis    7753
non_crisis       5264
high_crisis      1623
Name: count, dtype: int64

Binary Crisis Distribution:
crisis_binary
1    9376
0    5264
Name: count, dtype: int64
Crisis Percentage: 64.04%


In [None]:

# FAST Feature Engineering - Reduced parameters for speed
# Builds the TF-IDF feature matrix X from `final_text` and the binary target y.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/validation/test split, so held-out tweets influence the vocabulary
# and IDF weights.
print("Creating features...")
vectorizer = TfidfVectorizer(
    max_features=2000,      # Keep at most 2000 terms (reduced for speed)
    ngram_range=(1, 2),     # Unigrams and bigrams
    min_df=5,              # Ignore terms found in fewer than 5 tweets
    max_df=0.9,            # Ignore terms found in more than 90% of tweets
    stop_words='english'
)

# .toarray() converts the sparse TF-IDF result into a dense array.
X = vectorizer.fit_transform(data['final_text']).toarray()
y = data['crisis_binary'].values

print(f"Feature matrix shape: {X.shape}")


Creating features...
Feature matrix shape: (14640, 2000)


In [None]:

# Check for class imbalance
unique, counts = np.unique(y, return_counts=True)
print(f"Class Distribution: {dict(zip(unique, counts))}")

# Split data
# 70% goes to training; the remaining 30% is halved into validation and test
# (15% each). Both splits are stratified on the label and use a fixed seed.
print("Splitting data...")
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

Class Distribution: {np.int64(0): np.int64(5264), np.int64(1): np.int64(9376)}
Splitting data...


In [None]:

# Handle class imbalance with SMOTE
# Only the training split is resampled; the validation and test splits keep
# their original class proportions.
print("Balancing data with SMOTE...")
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Show the per-class counts after resampling.
print(f"After SMOTE - Training set balance:")
unique, counts = np.unique(y_train_balanced, return_counts=True)
print(dict(zip(unique, counts)))

Balancing data with SMOTE...
After SMOTE - Training set balance:
{np.int64(0): np.int64(6563), np.int64(1): np.int64(6563)}


In [None]:

# FASTEST VERSION - Skip GridSearch entirely!
# The forest uses hand-picked hyperparameters instead of a search.
# NOTE(review): class_weight='balanced' is combined with a SMOTE-balanced
# training set in which both classes already have equal counts.
print("Training Random Forest with optimized fixed parameters...")

best_rf_model = RandomForestClassifier(
    n_estimators=200,        # Good balance of speed/accuracy
    max_depth=20,           # Prevents overfitting
    min_samples_split=5,    # Good default
    min_samples_leaf=2,     # Prevents overfitting
    max_features='sqrt',    # Fastest option
    class_weight='balanced', # Handle imbalance
    random_state=42,
    n_jobs=-1,              # Use all CPU cores
    verbose=1               # Show progress
)

Training Random Forest with optimized fixed parameters...


In [None]:

# Fast direct training
# Fits on the SMOTE-resampled training data, not on the raw training split.
best_rf_model.fit(X_train_balanced, y_train_balanced)
print("Model training completed!")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   25.4s


Model training completed!


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   26.2s finished


In [None]:

# Enhanced evaluation function
def evaluate_model(model, X_val, y_val, X_test, y_test):
    """Print evaluation metrics for the validation set and then the test set.

    For each split this prints accuracy, ROC-AUC (computed from the second
    column of ``predict_proba``), per-class precision/recall/F1 for the
    'Non-Crisis' and 'Crisis' classes, and the full classification report.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_val, y_val: Validation features and labels.
        X_test, y_test: Test features and labels.

    Returns:
        None. All results are written to stdout.
    """
    splits = {"Validation": (X_val, y_val), "Test": (X_test, y_test)}
    class_labels = ['Non-Crisis', 'Crisis']

    for split_name, (features, targets) in splits.items():
        predicted = model.predict(features)
        positive_scores = model.predict_proba(features)[:, 1]

        print(f"\n{split_name} Set Results:")
        print("-" * 40)
        print(f"Accuracy: {accuracy_score(targets, predicted):.4f}")
        print(f"ROC-AUC: {roc_auc_score(targets, positive_scores):.4f}")

        # average=None yields one value per class for every metric.
        per_class = precision_recall_fscore_support(targets, predicted, average=None)
        print("\nPer-class metrics:")
        for position, label in enumerate(class_labels):
            prec, rec, f1_value = (metric[position] for metric in per_class[:3])
            print(f"{label} - Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1_value:.4f}")

        print("\nClassification Report:")
        print(classification_report(targets, predicted, target_names=class_labels))


In [None]:

# Evaluate the model
# Metrics are reported on the validation and test splits, which were not
# resampled with SMOTE.
print("Evaluating model...")
evaluate_model(best_rf_model, X_val, y_val, X_test, y_test)


Evaluating model...


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.



Validation Set Results:
----------------------------------------
Accuracy: 0.7368
ROC-AUC: 0.8515

Per-class metrics:
Non-Crisis - Precision: 0.5955, Recall: 0.8340, F1: 0.6948
Crisis - Precision: 0.8799, Recall: 0.6823, F1: 0.7686

Classification Report:
              precision    recall  f1-score   support

  Non-Crisis       0.60      0.83      0.69       789
      Crisis       0.88      0.68      0.77      1407

    accuracy                           0.74      2196
   macro avg       0.74      0.76      0.73      2196
weighted avg       0.78      0.74      0.74      2196



[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s



Test Set Results:
----------------------------------------
Accuracy: 0.7464
ROC-AUC: 0.8528

Per-class metrics:
Non-Crisis - Precision: 0.6041, Recall: 0.8557, F1: 0.7082
Crisis - Precision: 0.8942, Recall: 0.6849, F1: 0.7757

Classification Report:
              precision    recall  f1-score   support

  Non-Crisis       0.60      0.86      0.71       790
      Crisis       0.89      0.68      0.78      1406

    accuracy                           0.75      2196
   macro avg       0.75      0.77      0.74      2196
weighted avg       0.79      0.75      0.75      2196



[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished


In [None]:

# Rank the TF-IDF terms by the importance the fitted forest assigns to them
# and list the 20 highest-scoring ones.
print("Analyzing feature importance...")
feature_names = vectorizer.get_feature_names_out()
feature_importance = best_rf_model.feature_importances_
# argsort is ascending: reverse it, then keep the leading 20 positions.
top_indices = np.argsort(feature_importance)[::-1][:20]

print("\nTop 20 Most Important Features:")
print("-" * 40)
for rank, term_idx in enumerate(top_indices, start=1):
    print(f"{rank:2d}. {feature_names[term_idx]:20s} - {feature_importance[term_idx]:.4f}")


Analyzing feature importance...

Top 20 Most Important Features:
----------------------------------------
 1. cancelled            - 0.0663
 2. thank                - 0.0473
 3. flight               - 0.0473
 4. thanks               - 0.0441
 5. hours                - 0.0427
 6. hold                 - 0.0353
 7. cancelled flightled  - 0.0254
 8. delayed              - 0.0230
 9. hour                 - 0.0222
10. flightled            - 0.0218
11. great                - 0.0196
12. hrs                  - 0.0174
13. plane                - 0.0150
14. worst                - 0.0149
15. phone                - 0.0146
16. service              - 0.0133
17. waiting              - 0.0132
18. customer             - 0.0130
19. dont                 - 0.0124
20. delay                - 0.0120


In [None]:

# Enhanced prediction function
def predict_crisis_with_confidence(tweets, model, vectorizer):
    """
    Predict crisis with confidence scores

    Args:
        tweets: Raw tweet texts. Any iterable is accepted (it is materialized
            once, so generators work); a single string is treated as one
            tweet rather than being iterated character by character.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``;
            column 1 of ``predict_proba`` is read as the crisis probability.
        vectorizer: Fitted vectorizer whose ``transform`` result provides
            ``toarray()``.

    Returns:
        A list with one dict per tweet, holding the keys 'tweet',
        'prediction' ('Crisis' / 'Non-Crisis'), 'crisis_probability',
        'confidence' (the largest class probability) and 'severity'
        ('High' above 0.8, 'Medium' above 0.5, otherwise 'Low').
        Returns [] for empty input. Any exception is reported on stdout and
        also results in [], so callers should check for an empty result.
    """
    try:
        # Normalize the input to a list so it can be traversed twice
        # (cleaning and result assembly) and a lone string stays one tweet.
        tweets = [tweets] if isinstance(tweets, str) else list(tweets)
        if not tweets:
            return []

        cleaned_tweets = [clean_text(tweet) for tweet in tweets]
        features = vectorizer.transform(cleaned_tweets).toarray()

        predictions = model.predict(features)
        probabilities = model.predict_proba(features)

        results = []
        for tweet, pred, prob in zip(tweets, predictions, probabilities):
            crisis_prob = prob[1]
            confidence = max(prob)

            result = {
                'tweet': tweet,
                'prediction': 'Crisis' if pred == 1 else 'Non-Crisis',
                'crisis_probability': crisis_prob,
                'confidence': confidence,
                'severity': 'High' if crisis_prob > 0.8 else 'Medium' if crisis_prob > 0.5 else 'Low'
            }
            results.append(result)

        return results

    except Exception as e:
        # Deliberate best-effort contract: report the problem and hand back
        # an empty list instead of raising.
        print(f"Error in prediction: {e}")
        return []


In [None]:

# Test with example tweets
example_tweets = [
    "Flight canceled by Delta, I'm stranded at the airport for 8 hours!",
    "Slight delay with United, but crew was helpful",
    "Great experience with Southwest, smooth flight",
    "Emergency landing due to technical failure, terrible experience",
    "Lost my luggage and no one can help me, this is a nightmare!",
    "Food was okay, nothing special but acceptable service"
]

print("\n" + "="*60)
print("CRISIS PREDICTION RESULTS")
print("="*60)

# predict_crisis_with_confidence catches its own exceptions and returns [],
# so a failed prediction normally shows up through the `else` branch below;
# the outer `except` is a second safety net.
try:
    predictions = predict_crisis_with_confidence(example_tweets, best_rf_model, vectorizer)

    if predictions:  # Check if we got results
        for result in predictions:
            print(f"\nTweet: {result['tweet']}")
            print(f"Prediction: {result['prediction']}")
            print(f"Crisis Probability: {result['crisis_probability']:.3f}")
            print(f"Confidence: {result['confidence']:.3f}")
            print(f"Severity Level: {result['severity']}")
            print("-" * 50)
    else:
        print("No predictions generated. Check for errors above.")

except Exception as e:
    print(f"Error during prediction: {e}")
    print("Let's try a simpler prediction method...")



CRISIS PREDICTION RESULTS

Tweet: Flight canceled by Delta, I'm stranded at the airport for 8 hours!
Prediction: Crisis
Crisis Probability: 0.680
Confidence: 0.680
Severity Level: Medium
--------------------------------------------------

Tweet: Slight delay with United, but crew was helpful
Prediction: Crisis
Crisis Probability: 0.556
Confidence: 0.556
Severity Level: Medium
--------------------------------------------------

Tweet: Great experience with Southwest, smooth flight
Prediction: Non-Crisis
Crisis Probability: 0.385
Confidence: 0.615
Severity Level: Low
--------------------------------------------------

Tweet: Emergency landing due to technical failure, terrible experience
Prediction: Crisis
Crisis Probability: 0.512
Confidence: 0.512
Severity Level: Medium
--------------------------------------------------

Tweet: Lost my luggage and no one can help me, this is a nightmare!
Prediction: Crisis
Crisis Probability: 0.632
Confidence: 0.632
Severity Level: Medium
------------

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished


In [None]:

    # Simple fallback prediction
    # NOTE(review): the indentation suggests this was meant to sit inside the
    # `except` branch of the previous cell; as its own cell it appears to run
    # regardless of whether the batch prediction succeeded - confirm intent.
    # Scores the example tweets one at a time, without the batch helper.
    for tweet in example_tweets:
        cleaned = clean_text(tweet)
        features = vectorizer.transform([cleaned]).toarray()
        pred = best_rf_model.predict(features)[0]
        prob = best_rf_model.predict_proba(features)[0]

        print(f"\nTweet: {tweet}")
        print(f"Prediction: {'Crisis' if pred == 1 else 'Non-Crisis'}")
        print(f"Crisis Probability: {prob[1]:.3f}")
        print("-" * 50)


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s



Tweet: Flight canceled by Delta, I'm stranded at the airport for 8 hours!
Prediction: Crisis
Crisis Probability: 0.680
--------------------------------------------------

Tweet: Slight delay with United, but crew was helpful
Prediction: Crisis
Crisis Probability: 0.556
--------------------------------------------------


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend 


Tweet: Great experience with Southwest, smooth flight
Prediction: Non-Crisis
Crisis Probability: 0.385
--------------------------------------------------

Tweet: Emergency landing due to technical failure, terrible experience
Prediction: Crisis
Crisis Probability: 0.512
--------------------------------------------------


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s



Tweet: Lost my luggage and no one can help me, this is a nightmare!
Prediction: Crisis
Crisis Probability: 0.632
--------------------------------------------------

Tweet: Food was okay, nothing special but acceptable service
Prediction: Non-Crisis
Crisis Probability: 0.477
--------------------------------------------------


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 200 out of 200 | elapsed:    0.1s finished


In [None]:

# Save the model and vectorizer
# Both objects are persisted because prediction needs the fitted vectorizer
# as well as the model. The relative file names resolve against the current
# working directory.
import joblib
print("\nSaving model and vectorizer...")
joblib.dump(best_rf_model, 'crisis_detection_model.pkl')
joblib.dump(vectorizer, 'crisis_vectorizer.pkl')
print("Model and vectorizer saved successfully!")

# Closing summary listing the files written above.
print("\n" + "="*60)
print("TRAINING COMPLETED SUCCESSFULLY!")
print("="*60)
print("Your crisis detection model is ready to use!")
print("Files saved:")
print("- crisis_detection_model.pkl")
print("- crisis_vectorizer.pkl")


Saving model and vectorizer...
Model and vectorizer saved successfully!

TRAINING COMPLETED SUCCESSFULLY!
Your crisis detection model is ready to use!
Files saved:
- crisis_detection_model.pkl
- crisis_vectorizer.pkl
