<a href="https://colab.research.google.com/github/bunny346/Natural-language-Processing/blob/main/NLP_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import re
import nltk
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df = pd.read_csv("/content/archive (5).zip")

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = [w for w in text.split() if w not in stop_words]
    return " ".join(tokens)

df['clean'] = df['text'].astype(str).apply(clean_text)

X = df['clean']
y = df['target'] # Corrected column name from 'label' to 'target'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def run_ann(ngram_range):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=10000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train_tfidf.toarray(), y_train, epochs=5, batch_size=64, verbose=0,
              validation_data=(X_test_tfidf.toarray(), y_test))
    train_acc = model.evaluate(X_train_tfidf.toarray(), y_train, verbose=0)[1]
    test_acc = model.evaluate(X_test_tfidf.toarray(), y_test, verbose=0)[1]
    return train_acc, test_acc

print("\n=== ANN with TF-IDF ===")
uni_acc = run_ann((1,1))
print("Unigram -> Train: %.4f, Test: %.4f" % uni_acc)
bi_acc  = run_ann((1,2))
print("Unigram+Bigram -> Train: %.4f, Test: %.4f" % bi_acc)
tri_acc = run_ann((1,3))
print("Unigram+Bigram+Trigram -> Train: %.4f, Test: %.4f" % tri_acc)

MAX_NB_WORDS = 20000
MAX_SEQ_LEN = 100
EMB_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQ_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQ_LEN)

def run_lstm():
    model = Sequential([
        Embedding(MAX_NB_WORDS, EMB_DIM, input_length=MAX_SEQ_LEN),
        SpatialDropout1D(0.2),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train_seq, y_train, epochs=5, batch_size=64,
              validation_data=(X_test_seq, y_test), verbose=0)
    train_acc = model.evaluate(X_train_seq, y_train, verbose=0)[1]
    test_acc = model.evaluate(X_test_seq, y_test, verbose=0)[1]
    return train_acc, test_acc

print("\n=== LSTM with Embeddings ===")
lstm_acc = run_lstm()
print("LSTM -> Train: %.4f, Test: %.4f" % lstm_acc)

print("\n=== Analysis ===")
print("Bigrams usually outperform unigrams because phrases like 'fire alarm' or 'flood warning' carry more meaning than single words.")
print("Trigrams often add sparsity and risk of overfitting.")
print("LSTM already learns sequential dependencies, so explicit bigrams/trigrams help less.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



=== ANN with TF-IDF ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Unigram -> Train: 0.9966, Test: 0.8857
Unigram+Bigram -> Train: 0.9958, Test: 0.8813
Unigram+Bigram+Trigram -> Train: 0.9959, Test: 0.8813

=== LSTM with Embeddings ===




LSTM -> Train: 0.9953, Test: 0.9006

=== Analysis ===
Trigrams often add sparsity and risk of overfitting.
LSTM already learns sequential dependencies, so explicit bigrams/trigrams help less.
