In [None]:
# Required libraries:

import os
import re
import string
import warnings

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from matplotlib.pyplot import figure
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
warnings.filterwarnings('ignore')

In [None]:
# Load the cleaned dataset.
# The location can be overridden with the TWITTER_DISASTER_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset, the original local path is used.
DATA_PATH = os.environ.get(
    'TWITTER_DISASTER_CSV',
    r'C:\Users\HP\Desktop\Digi-crome\Project-7_NLP\Data\cleaned_twitter_disaster.csv',
)
df = pd.read_csv(DATA_PATH)

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
print(df.keys())

In [None]:
# Dataset Structure
df.info()

In [None]:
df.describe()

# Task: Feature Engineering

In [None]:
# Extracting the relevant features:
# CountVectorizer and TfidfVectorizer are already imported in the first cell,
# so they are not imported a second time here.

# Word-frequency features: learn the vocabulary of cleaned_text and build the
# document-term count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned_text'])

# TF-IDF features: learn vocabulary and IDF weights of cleaned_text and build
# the TF-IDF weighted document-term matrix
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_text'])


In [None]:
# Word-count matrix returned by CountVectorizer.fit_transform in the cell above
X

In [None]:
# TF-IDF matrix returned by TfidfVectorizer.fit_transform in the cell above
X_tfidf

## Sentiment Analysis

In [None]:
# Extract the sentiment analysis features
from textblob import TextBlob
cleaned_text=df['cleaned_text']

# Function to perform sentiment analysis
def get_sentiment(cleaned_text):
    blob = TextBlob(cleaned_text)
    
    # Sentiment polarity (-1 to 1)
    polarity = blob.sentiment.polarity
    
    # Subjectivity (0 to 1)
    subjectivity = blob.sentiment.subjectivity
    
    return polarity, subjectivity

# Apply sentiment analysis to each text
sentiments = [get_sentiment(cleaned_text) for cleaned_text in cleaned_text]

# Convert results to a DataFrame
sentiment_df = pd.DataFrame(sentiments, columns=['Polarity', 'Subjectivity'])

In [None]:
sentiment_df

## Word Embeddings using GloVe

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus and the resulting representations showcase interesting linear substructures of the word vector space.

In [None]:
# Side-by-side view of the raw tweet text and its cleaned version
df1 = df.loc[:, ['text', 'cleaned_text']]
df1

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

In [None]:
max(len(data) for data in df['cleaned_text'])

In [None]:
# Padding cleaned_text data
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded_seq = pad_sequences(sequences, maxlen=131, padding='post', truncating='post')

In [None]:
padded_seq[0]

In [None]:
# create embedding index
embedding_index = {}

# create embedding matrix
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix

In [None]:
embedding_matrix.shape

## Additional Features: Hashtags, Mentions etc.

In [None]:
# Function to calculate additional features
def extract_additional_features(text):
    """Count hashtags, mentions and exclamation marks in one tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. NaN for a missing tweet) are
        treated as empty text.

    Returns
    -------
    pandas.Series
        Integer counts indexed by 'num_hashtags', 'num_mentions' and
        'num_exclamations'.
    """
    if not isinstance(text, str):
        text = ""
    num_hashtags = len(re.findall(r"#\w+", text))
    num_mentions = len(re.findall(r"@\w+", text))
    # Count every '!' character. The previous pattern r"!\w+" only matched a '!'
    # directly followed by a word character, so "Fire!!!" or "Help! " counted as 0.
    num_exclamations = text.count("!")
    return pd.Series(
        [num_hashtags, num_mentions, num_exclamations],
        index=['num_hashtags', 'num_mentions', 'num_exclamations'],
    )

In [None]:
# Apply the feature extractor to the raw 'text' column of every row in df
# (the whole dataset - no train/test split exists at this point).
# Series.apply expands the returned Series into one column per feature.
additional_future = df['text'].apply(extract_additional_features)
additional_future

## Split the dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
X = df['cleaned_text']  
y = df['target'] 

In [None]:
# Convert text data into numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Use the 5000 most frequent words
X_tfidf = vectorizer.fit_transform(X)

print("TF-IDF shape:", X_tfidf.shape)

In [None]:
# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Print shapes of the resulting datasets
print(f"Training data size: {X_train.shape}")
print(f"Testing data size: {X_test.shape}")

# Task: Model Selection and Training

## Logistic Regression Model

In [None]:
# Logistic Regression classifier (liblinear solver) trained on the training data.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line keeps the estimator displayed as the cell output.
logreg = LogisticRegression(solver='liblinear').fit(X_train, y_train)
logreg

In [None]:
# Test-set feature matrix produced by train_test_split
X_test

In [None]:
# Test-set labels produced by train_test_split
y_test

In [None]:
# Training-set feature matrix produced by train_test_split
X_train

In [None]:
# Training-set labels produced by train_test_split
y_train

In [None]:
# Make predictions on the test set
y_pred = logreg.predict(X_test)

In [None]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Classification report (Precision, Recall, F1-Score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

## Random Forest Classifier Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
random_model = RandomForestClassifier()
random_model.fit(X_train, y_train)
y_pred_ran = random_model.predict(X_test)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
print(classification_report(y_test, y_pred_ran))

In [None]:
rf

## Neural Network Model

In [None]:
# Importing some important features
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# Create NN_Model: two hidden ReLU layers, each followed by dropout, and a
# single sigmoid output unit
model = Sequential([
    Dense(128, input_dim=X_tfidf.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # Binary classification (0 or 1)
])

model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Train the network on the training split; nn_model holds the value returned
# by fit(), not a scikit-learn style classifier
nn_model  = model.fit(X_train, y_train, epochs=10, batch_size=32)

In [None]:
# Evaluate the trained network on the test split.
# evaluate() is unpacked into the loss and the 'accuracy' metric the model was compiled with.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.2f}")

In [None]:
# Value returned by model.fit(...) in the training cell
nn_model

# Hyperparameter Tuning Using Grid Search

In [None]:
# Logistic Regression Hyperparameter Tuning
# (this title used to be markdown inside the code cell, which is a SyntaxError)

from sklearn.model_selection import GridSearchCV
logistic_model = LogisticRegression()

# Set up the parameter grid for Grid Search.
# Solvers do not support every penalty, so the grid is a list of sub-grids that
# only contain valid combinations:
#   - liblinear: 'l1' and 'l2'
#   - saga:      'l1', 'l2' and 'elasticnet' (elasticnet also needs l1_ratio)
# The unpenalised option is left out: its spelling differs between scikit-learn
# versions ('none' vs None) and it ignores C; the largest C value approximates it.
c_values = [0.01, 0.1, 1, 10, 100]
max_iter_values = [100, 200, 500]
param_grid = [
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'max_iter': max_iter_values,
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'C': c_values,
        'max_iter': max_iter_values,
    },
]
# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(logistic_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

In [None]:
# Fit the grid search to the training data (5-fold CV, as configured on grid_search)
grid_search.fit(X_train, y_train)

In [None]:
# Get the best model after tuning
best_logistic_model = grid_search.best_estimator_

In [None]:
# Make predictions on the test set with the best model
y_pred_log = best_logistic_model.predict(X_test)

In [None]:
# Print the selected hyperparameters and the classification report on the test set
print("Best Parameters:", grid_search.best_params_)
print(classification_report(y_test, y_pred_log))

In [None]:
# Random Forest Hyperparameter Tuning

# Base estimator whose hyperparameters are searched; seeded with random_state=42
random_model = RandomForestClassifier(random_state=42)

# Search space for the grid search
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# 5-fold cross-validated grid search scored by accuracy
grid_search_ran = GridSearchCV(
    estimator=random_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Fit the grid search to the training data (5-fold CV, as configured on grid_search_ran)
grid_search_ran.fit(X_train, y_train)

In [None]:
# Get the best model after tuning
best_random_model = grid_search_ran.best_estimator_

In [None]:
# Make predictions on the test set with the best model
# (this reassigns y_pred_ran, replacing the baseline forest's predictions)
y_pred_ran = best_random_model.predict(X_test)

In [None]:
# Print the selected hyperparameters and the classification report on the test set
print("Best Parameters:", grid_search_ran.best_params_)
print(classification_report(y_test, y_pred_ran))

In [None]:
**Neural Network model Hyperparameter Tuning**

def create_model():
    model_neural_tue = Sequential()
    model_neural_tue.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
    model_neural_tue.add(Dense(1))  # Single output for regression
    model_neural_tue.compile(optimizer='adam', loss='mse')
    return model_neural_tue

In [None]:
from scikeras.wrappers import KerasRegressor
model_neural_tue = KerasRegressor(model=create_model, epochs=10, batch_size=10, verbose=0)
param_grid = {
    'batch_size': [10, 20, 40],
    'epochs': [10, 20, 50]
}

In [None]:
# Initialize RandomizedSearchCV over the batch_size / epochs values in param_grid.
# error_score='raise' makes a failing fit raise instead of being recorded as a score.
# NOTE(review): param_grid has 3 x 3 = 9 combinations, fewer than n_iter=10 - confirm n_iter is intended.
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=model_neural_tue, param_distributions=param_grid, n_iter=10, error_score='raise')

In [None]:
# Perform the random search on the training split
# NOTE(review): X_train is the TF-IDF matrix from the split cell - confirm the scikeras wrapper accepts sparse input.
random_result = random_search.fit(X_train, y_train)

In [None]:
# Accuracy of the tuned network on the test set.
# score() of a regressor wrapper returns R^2, which is not an accuracy, so the
# predictions are thresholded at 0.5 and compared with the true labels.
# Thresholding leaves 0/1 class labels unchanged, so this also holds when the
# wrapped estimator already predicts labels.
test_predictions = np.ravel(random_result.predict(X_test))
predicted_labels = (test_predictions >= 0.5).astype(int)
accuracy = accuracy_score(y_test, predicted_labels)
print(f'Accuracy: {accuracy:.4f}')