# Disaster Tweet Classification with LSTM and TF-IDF

This notebook classifies tweets as disaster-related or not using Natural Language Processing (NLP) techniques. We explore the data, clean it, and then compare two models: an LSTM network and a Random Forest classifier trained on TF-IDF features.


### Configuration
Define key parameters for the models and data processing steps.


In [None]:
# Configuration
# Shared hyperparameters for both models in this notebook.

# Vocabulary cap: passed as the tokenizer's num_words, the Embedding layer's
# input_dim, and TfidfVectorizer's max_features.
MAX_WORDS = 5000
# Fixed sequence length used for pad_sequences (maxlen) and the Embedding
# layer's input_length.
MAX_SEQUENCE_LENGTH = 50
# Output dimension of the Embedding layer.
EMBEDDING_DIM = 64
# Fraction of rows held out for validation by train_test_split.
TEST_SIZE = 0.2
# Seed shared by train_test_split and the Random Forest.
RANDOM_STATE = 42
# Batch size and number of epochs passed to the LSTM's fit() call.
BATCH_SIZE = 32
EPOCHS = 5

### Step 1: Load Data
Load the training and test data.


In [None]:
# Load Data
def load_data(train_path='train.csv', test_path='test.csv'):
    """Read the training and test CSV files into DataFrames.

    Args:
        train_path: Path to the training CSV file.
        test_path: Path to the test CSV file.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames, in that order.
    """
    # Read the files in (train, test) order so the tuple can be unpacked
    # directly by the caller.
    return tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Read both CSVs from the default paths ('train.csv' and 'test.csv').
train_data, test_data = load_data()
# Preview the first rows of the training data.
train_data.head()


### Step 2: Exploratory Data Analysis (EDA)
- View distribution of disaster vs. non-disaster tweets.
- Examine tweet lengths.


In [None]:
# Plot Target Distribution and Tweet Length Distribution
def plot_distributions(train_data):
    """Plot the class balance and the tweet-length histogram.

    Draws two figures: a count plot of the ``target`` column and a
    30-bin histogram (with KDE overlay) of the length of each ``text`` entry.

    Side effect: a ``text_length`` column holding ``len(text)`` for every
    row is added to (or overwritten in) ``train_data``.

    Args:
        train_data: DataFrame with ``target`` and ``text`` columns.
    """
    # Figure 1: how many tweets fall into each class.
    plt.figure(figsize=(6, 4))
    sns.countplot(data=train_data, x='target')
    plt.title("Disaster (1) vs. Non-Disaster (0) Tweets Distribution")
    plt.show()

    # Character count per tweet, stored on the frame passed in by the caller.
    train_data['text_length'] = train_data['text'].apply(len)

    # Figure 2: distribution of those character counts.
    plt.figure(figsize=(10, 6))
    sns.histplot(train_data['text_length'], kde=True, bins=30)
    plt.title("Tweet Length Distribution")
    plt.xlabel("Tweet Length")
    plt.ylabel("Frequency")
    plt.show()

# Show both EDA plots; this also adds a 'text_length' column to train_data.
plot_distributions(train_data)


### Step 3: Word Cloud Visualization
Generate a word cloud for disaster-related tweets to visualize the most common words.


In [None]:
# Generate Word Cloud for Disaster-Related Tweets
def plot_wordcloud(text_data):
    """Display a word cloud built from every string in ``text_data``.

    Args:
        text_data: Iterable of strings; they are joined with single spaces
            into one corpus before the cloud is generated.
    """
    corpus = " ".join(text_data)
    cloud_builder = WordCloud(width=800, height=400, background_color='black')
    cloud_image = cloud_builder.generate(corpus)

    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    # Hide the axes: ticks carry no meaning for an image of words.
    plt.axis('off')
    plt.title("Common Words in Disaster-Related Tweets")
    plt.show()

# Word cloud over the raw text of rows labelled as disasters (target == 1).
plot_wordcloud(train_data[train_data['target'] == 1]['text'])


### Step 4: Data Cleaning
Remove URLs and punctuation, convert the text to lowercase, and drop stopwords to prepare the tweets for modeling.


In [None]:
# Data Cleaning
def clean_text(text, stopwords):
    """Normalize a single tweet for modeling.

    Steps, in order: strip URLs, strip punctuation, lowercase, and drop
    stopwords. Remaining words are joined with single spaces.

    Args:
        text: The raw tweet. Any non-string value (e.g. ``None`` or a NaN
            from a missing CSV cell) is treated as an empty tweet.
        stopwords: Container of lowercase words to remove; a ``set`` gives
            constant-time membership checks.

    Returns:
        The cleaned, lowercased text, or ``""`` when nothing is left.
    """
    # Missing values reach this function as None/NaN when applied over a
    # DataFrame column; return an empty string instead of failing in re.sub.
    if not isinstance(text, str):
        return ""

    # URLs are removed before lowercasing, so match the scheme
    # case-insensitively; otherwise "HTTP://..." links would survive and be
    # mangled into junk tokens by the punctuation pass below.
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    # Keep only word characters (letters, digits, underscore) and whitespace.
    text = re.sub(r"[^\w\s]", "", text)
    text = text.lower()
    return " ".join(word for word in text.split() if word not in stopwords)

def preprocess_data(train_data):
    """Add a ``cleaned_text`` column derived from the ``text`` column.

    Every entry of ``train_data['text']`` is passed through ``clean_text``
    together with a fixed set of English stopwords. The DataFrame is
    modified in place and also returned.

    Args:
        train_data: DataFrame containing a ``text`` column.

    Returns:
        The same DataFrame object, now carrying ``cleaned_text``.
    """
    stop_words = {
        # Pronouns and determiners
        "i", "me", "my", "we", "our", "you", "your", "he", "she", "it",
        "they", "them", "what", "which", "who", "this", "that",
        # Auxiliary verbs
        "am", "is", "are", "was", "were", "be", "been", "have", "has",
        "do", "does", "did",
        # Articles and conjunctions
        "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
        # Prepositions
        "of", "at", "by", "for", "with", "about", "between", "into",
        "through", "during", "before", "after", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under",
        # Adverbs, quantifiers and other fillers
        "again", "further", "then", "once", "here", "there", "when",
        "where", "why", "how", "all", "any", "both", "each", "few", "more",
        "most", "other", "some", "no", "not", "only", "same", "so", "too",
        "very", "can", "will", "just",
    }

    def _clean_one(tweet):
        # Bind the stopword set so the helper fits Series.apply's signature.
        return clean_text(tweet, stop_words)

    train_data['cleaned_text'] = train_data['text'].apply(_clean_one)
    return train_data

# Add the 'cleaned_text' column that the tokenizer and TF-IDF steps read.
train_data = preprocess_data(train_data)


### Step 5: Tokenization and Padding
Convert text into sequences and pad them to a fixed length for LSTM input.


In [None]:
def tokenize_and_pad_sequences(text_data, max_words, max_length):
    """Fit a tokenizer on ``text_data`` and return fixed-length sequences.

    Args:
        text_data: Iterable of texts used both to fit the tokenizer and to
            produce the sequences.
        max_words: Passed to the tokenizer as ``num_words``.
        max_length: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        A ``(padded_sequences, tokenizer)`` tuple: the output of
        ``pad_sequences`` (with its default padding/truncating settings)
        and the fitted tokenizer.
    """
    # Words not in the fitted vocabulary are mapped to the '<OOV>' token.
    fitted_tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    fitted_tokenizer.fit_on_texts(text_data)

    padded = pad_sequences(
        fitted_tokenizer.texts_to_sequences(text_data),
        maxlen=max_length,
    )
    return padded, fitted_tokenizer

# Turn every cleaned tweet into a fixed-length integer sequence.
# NOTE(review): the tokenizer is fit on all rows before the split below, so
# its vocabulary also reflects the validation tweets — confirm this is
# acceptable for the reported validation metrics.
X, tokenizer = tokenize_and_pad_sequences(train_data['cleaned_text'], MAX_WORDS, MAX_SEQUENCE_LENGTH)
# Hold out TEST_SIZE of the rows for validation; RANDOM_STATE fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, train_data['target'], test_size=TEST_SIZE, random_state=RANDOM_STATE)


### Step 6: Build and Train LSTM Model
Use an LSTM model to learn patterns in the sequence data for tweet classification.


In [None]:
def build_lstm_model(input_dim, embedding_dim, input_length):
    """Build and compile the binary LSTM classifier.

    Layer stack, in order: Embedding -> SpatialDropout1D(0.2) ->
    LSTM(64, dropout=0.2, recurrent_dropout=0.2) -> Dense(1, sigmoid).

    Args:
        input_dim: ``input_dim`` of the Embedding layer.
        embedding_dim: ``output_dim`` of the Embedding layer.
        input_length: ``input_length`` of the Embedding layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(
        Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=input_length)
    )
    network.add(SpatialDropout1D(0.2))
    network.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
    # Single sigmoid unit: one probability-like score per tweet.
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Build the LSTM with the notebook-wide hyperparameters.
lstm_model = build_lstm_model(input_dim=MAX_WORDS, embedding_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)
# Train on the training split while tracking the held-out validation split;
# the returned history object is kept for later inspection.
history = lstm_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_val, y_val), verbose=2)


### Step 7: Evaluate LSTM Model
Assess the performance of the LSTM model on the validation set.


In [None]:
def evaluate_model(model, X_val, y_val):
    """Print the validation loss and accuracy of ``model``.

    Args:
        model: Object whose ``evaluate(X_val, y_val, verbose=0)`` returns a
            ``(loss, accuracy)`` pair.
        X_val: Validation inputs forwarded to ``model.evaluate``.
        y_val: Validation labels forwarded to ``model.evaluate``.
    """
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    report_lines = (
        f"Validation Loss: {val_loss}",
        f"Validation Accuracy: {val_accuracy}",
    )
    for line in report_lines:
        print(line)

# Report the trained LSTM's loss and accuracy on the held-out split.
evaluate_model(lstm_model, X_val, y_val)


### Step 8: TF-IDF with Random Forest Classifier
Use TF-IDF vectorization with a Random Forest classifier as a comparison to the LSTM model.


In [None]:
def train_tfidf_rf_model(train_data):
    """Train and evaluate a TF-IDF + Random Forest baseline.

    The rows are split into training and validation parts first; the TF-IDF
    vectorizer is then fit on the training part only, so neither the
    vocabulary nor the IDF weights are influenced by validation tweets.
    Accuracy, the confusion matrix and the classification report for the
    validation part are printed.

    Args:
        train_data: DataFrame with ``cleaned_text`` and ``target`` columns.
    """
    # Split the raw text before vectorizing. TEST_SIZE and RANDOM_STATE are
    # the same values used for the LSTM split.
    text_train, text_val, y_train_tfidf, y_val_tfidf = train_test_split(
        train_data['cleaned_text'],
        train_data['target'],
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
    )

    # Fit on training text only; the validation text is merely transformed.
    # The matrices stay sparse: the forest accepts sparse input, and
    # densifying an (n_rows x MAX_WORDS) matrix only wastes memory.
    tfidf = TfidfVectorizer(max_features=MAX_WORDS)
    X_train_tfidf = tfidf.fit_transform(text_train)
    X_val_tfidf = tfidf.transform(text_val)

    rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    rf_model.fit(X_train_tfidf, y_train_tfidf)
    y_pred_tfidf = rf_model.predict(X_val_tfidf)

    print("Random Forest Classifier Results:")
    print("Accuracy:", accuracy_score(y_val_tfidf, y_pred_tfidf))
    print("Confusion Matrix:\n", confusion_matrix(y_val_tfidf, y_pred_tfidf))
    print("Classification Report:\n", classification_report(y_val_tfidf, y_pred_tfidf))

# Train the TF-IDF + Random Forest baseline and print its validation metrics.
train_tfidf_rf_model(train_data)
