<center>
  <h1 style="background-color:red; color:white; font-size:40px; font-weight:bold; font-family:Times New Roman; border:3px solid black;">Natural Language Processing with Disaster Tweets</h1>
</center>

<div style="font-family: Cambria; font-weight: bold; letter-spacing: 0px; color: #ffffff; font-size: 120%; text-align: left; padding: 3.0px; background: #003380; border: 10px solid #80ffff;">
    <h1>TABLE OF CONTENTS</h1>
</div>

* [Import Libraries](#0)
* [Read Data](#1)
* [Text Cleaning](#2)
* [Data Visualization](#3)
* [Vectorization & Model](#4)
    * [CountVectorizer](#4.1)
    * [TfidfVectorizer](#4.2)
    * [Word2Vec](#4.3)
    * [GloVe](#4.4)
* [Model Check](#5)
* [Submission](#6)
    

<a id="0"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" > Import Libraries<br></div>


In [None]:
import pandas as pd
import numpy as np
import re
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,SpatialDropout1D,Embedding
from keras.callbacks import ModelCheckpoint

<a id="1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Read Data<br></div>

In [None]:
# Load the competition CSVs, keeping only a positional subset of columns from each file.
train = pd.read_csv(
    '/kaggle/input/nlp-getting-started/train.csv',
    usecols=[0, 3, 4],
)
test = pd.read_csv(
    '/kaggle/input/nlp-getting-started/test.csv',
    usecols=[0, 3],
)
train.head()

In [None]:
# Summarise the training frame: column dtypes, non-null counts and memory usage.
train.info()

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

In [None]:
# A whole-row comparison also compares the `id` column (presumably unique per
# row), so it cannot surface repeated tweets. Compare on the tweet text instead
# and show every member of each duplicate group next to its siblings.
train[train.duplicated(subset=['text'], keep=False)].sort_values('text').head(10)

In [None]:
# Separate the tweet text (features) from the target column (labels).
Text, label = train['text'], train['target']
Text.head()

<a id="2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Text Cleaning<br></div>

In [None]:
def clean_text(text):
    """Normalise a raw tweet into lowercase, space-separated alphabetic words.

    Steps, in order: drop URLs, drop ``@handle`` mentions, drop every character
    that is not an ASCII letter or whitespace (digits and punctuation included),
    collapse whitespace runs to a single space, trim the ends, and lowercase.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned tweet; an empty string if nothing alphabetic remains.
    """
    # Remove URLs first: otherwise stripping punctuation would glue them into
    # junk tokens such as "httptcoabc".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove Twitter handles starting with '@'
    text = re.sub(r'@\w+', '', text)
    # Keep only ASCII letters and whitespace (digits are removed as well)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse whitespace runs and trim the gaps left by the removals above
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert the text to lowercase
    return text.lower()

In [None]:
# Run the same cleaning function over the training tweets and the test tweets.
Text = Text.map(clean_text)
test['text'] = test['text'].map(clean_text)
Text.head()

<a id="3"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Data Visualization<br></div>

In [None]:
# Number of rows per target value (class balance check).
label.value_counts()

In [None]:
# Bar chart with one bar per target value, annotated through the returned Axes.
ax = sns.countplot(x=label, palette='Blues')
ax.set_title('Distribution Of Target', fontsize=20)
ax.set_xlabel('Target', fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.grid(True)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

In [None]:
# Build one word cloud from all tweets in `Text` joined into a single string.
corpus = " ".join(Text)
cloud = WordCloud(background_color='black').generate(corpus)

plt.figure(figsize=(30, 20))
plt.imshow(cloud)
plt.axis("off")
plt.title("WordCloud For Text", fontsize=20)
plt.show()

<a id="4"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Vectorization & Model<br></div>

In [None]:
# Hold out 10% of the labelled tweets, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    Text,
    label,
    test_size=0.1,
    random_state=44,
    shuffle=True,
    stratify=label,
)
# Report the size of every split.
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name} shape is ', split.shape)

In [None]:
# Bag-of-words baseline: unigram counts fed into a logistic regression.
count_pip = Pipeline(
    [
        ('count', CountVectorizer(ngram_range=(1, 1))),
        # The `sag` solver shuffles the samples, so the seed is pinned to make
        # the fitted model and its scores reproducible between runs.
        ('model', LogisticRegression(C=.8, solver='sag', max_iter=1000, random_state=44)),
    ]
)
count_pip.fit(X_train, y_train)

In [None]:
# Mean accuracy of the CountVectorizer pipeline on the training and held-out splits.
# Both lines now carry the same model label so the output is easy to compare.
print("CountVectorizer Model Train Score is :", count_pip.score(X_train, y_train))
print("CountVectorizer Model Test Score is :", count_pip.score(X_test, y_test))

In [None]:
# TF-IDF baseline: unigram TF-IDF weights fed into a logistic regression.
idf_pip = Pipeline(
    [
        ('tf_idf', TfidfVectorizer(ngram_range=(1, 1))),
        # The `sag` solver shuffles the samples, so the seed is pinned to make
        # the fitted model and its scores reproducible between runs.
        ('model', LogisticRegression(C=.8, solver='sag', max_iter=1000, random_state=44)),
    ]
)
idf_pip.fit(X_train, y_train)

In [None]:
# Mean accuracy of the TF-IDF pipeline on the training split, then the held-out split.
for split_name, features, targets in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"TfidfVectorizer Model {split_name} Score is :", idf_pip.score(features, targets))

In [None]:
class Word2VecVectorizer:
    """Scikit-learn style transformer that averages Word2Vec vectors per document.

    A document may be a raw string (split on whitespace) or an iterable of
    tokens. Documents without any in-vocabulary token map to the zero vector,
    so the downstream estimator never receives NaN.
    """

    def __init__(self, word2vec_model):
        # Trained embedding model; only its `.wv` keyed vectors are read.
        self.word2vec_model = word2vec_model

    def fit(self, X, y=None):
        """No-op: the embedding model is trained outside the pipeline."""
        return self

    def transform(self, X):
        """Return an array of shape (len(X), vector_size) holding mean word vectors."""
        keyed_vectors = self.word2vec_model.wv
        doc_vectors = np.zeros((len(X), keyed_vectors.vector_size))
        for row, document in enumerate(X):
            # Iterating a str yields single characters, so raw text must be
            # split into words before looking anything up.
            tokens = document.split() if isinstance(document, str) else document
            known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
            if known:  # otherwise keep the zero row instead of mean([]) == NaN
                doc_vectors[row] = np.mean(known, axis=0)
        return doc_vectors


# Word2Vec expects an iterable of token lists; passing raw strings would train
# it on characters. The seed is fixed for repeatability (NOTE(review): fully
# deterministic training may additionally require a single worker - confirm
# against the gensim documentation).
word2vec_model = Word2Vec(
    sentences=[tweet.split() for tweet in X_train],
    vector_size=100,
    seed=44,
)
word2vec_pip = Pipeline([
    ('word2vec', Word2VecVectorizer(word2vec_model)),
    # `sag` shuffles the samples; pin the seed so scores are reproducible.
    ('model', LogisticRegression(C=0.8, solver='sag', max_iter=1000, random_state=44))
])
word2vec_pip.fit(X_train, y_train)

In [None]:
# Mean accuracy of the Word2Vec pipeline on the training split, then the held-out split.
for split_name, features, targets in (
    ("Train", X_train, y_train),
    ("Test", X_test, y_test),
):
    print(f"Word2VecVectorizer Model {split_name} Score is :", word2vec_pip.score(features, targets))

In [None]:
# Parse the GloVe text file: each line is a token followed by its vector components.
glove_embeddings = {}
with open('/kaggle/input/glove6b/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        parts = line.split()
        if not parts:
            continue  # tolerate blank lines
        # Let NumPy convert the numeric fields in one call.
        glove_embeddings[parts[0]] = np.asarray(parts[1:], dtype=float)


class GloVeVectorizer:
    """Scikit-learn style transformer that averages GloVe vectors per document.

    A document may be a raw string (split on whitespace) or an iterable of
    tokens. Out-of-vocabulary tokens contribute a zero vector to the mean, and
    documents with no tokens map to the zero vector instead of NaN.
    """

    def __init__(self, glove_embeddings):
        # Mapping of token -> 1-D embedding array.
        self.glove_embeddings = glove_embeddings
        # Infer the dimensionality from the data; fall back to the previously
        # hard-coded 100 when the mapping is empty.
        sample = next(iter(glove_embeddings.values()), None)
        self.embedding_dim = len(sample) if sample is not None else 100

    def fit(self, X, y=None):
        """No-op: the embeddings are pre-trained."""
        return self

    def transform(self, X):
        """Return an array of shape (len(X), embedding_dim) holding mean word vectors."""
        unknown = np.zeros(self.embedding_dim)  # shared vector for unknown tokens
        glove_vectors = np.zeros((len(X), self.embedding_dim))
        for row, document in enumerate(X):
            # Iterating a str yields single characters, so raw text must be
            # split into words before looking anything up.
            tokens = document.split() if isinstance(document, str) else document
            if len(tokens) > 0:  # otherwise keep the zero row instead of mean([]) == NaN
                glove_vectors[row] = np.mean(
                    [self.glove_embeddings.get(token, unknown) for token in tokens],
                    axis=0,
                )
        return glove_vectors


# NOTE(review): RandomForestClassifier is not used in the cells shown here;
# consider removing this import or moving it to the import cell.
from sklearn.ensemble import RandomForestClassifier
glove_pip = Pipeline([
    ('glove', GloVeVectorizer(glove_embeddings)),
    # `sag` shuffles the samples; pin the seed so scores are reproducible.
    ('model', LogisticRegression(C=0.8, solver='sag', max_iter=1000, random_state=44))
])
glove_pip.fit(X_train, y_train)

In [None]:
# Mean accuracy of the GloVe pipeline on the training and held-out splits.
# The second label previously read "GloveVecVectorizer"; both lines now match.
print("GloveVectorizer Model Train Score is :", glove_pip.score(X_train, y_train))
print("GloveVectorizer Model Test Score is :", glove_pip.score(X_test, y_test))

<a id="5"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Model Check<br></div>


In [None]:
# Confusion matrix of the TF-IDF pipeline on the held-out split.
y_pred = idf_pip.predict(X_test)
CM = confusion_matrix(y_test, y_pred)
# `center` expects the numeric value at which to centre the colormap; passing
# True was not meaningful, so it is dropped and the default scaling is used.
ax = sns.heatmap(CM, annot=True, fmt='g', cmap='Blues')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('TfidfVectorizer Model Confusion Matrix')
CM

In [None]:
# Per-class precision / recall / F1 for the predictions in `y_pred`.
ClassificationReport = classification_report(y_test, y_pred)
# Print the report on its own lines: a prefix on the same line shifts the
# header row and misaligns the table columns.
print('Classification Report is :', ClassificationReport, sep='\n')

<a id="6"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:#ffffff; font-size:120%; text-align:left;padding:3.0px; background: #0059b3; border-bottom: 8px solid #e6e6e6" >Submission<br></div>


In [None]:
# Predict the test tweets with the TF-IDF pipeline and write them out keyed by id.
y_idf = idf_pip.predict(test['text'])
df = pd.DataFrame({'target': y_idf}, index=test['id']).rename_axis('id')
df.to_csv('/kaggle/working/idf_pred.csv')
df

In [None]:
# Predict the test tweets with the CountVectorizer pipeline and write them out keyed by id.
y_count = count_pip.predict(test['text'])
df = pd.DataFrame({'target': y_count}, index=test['id']).rename_axis('id')
df.to_csv('/kaggle/working/count_pred.csv')
df

<center>
  <h1 style="background-color:red; color:white; font-size:40px; font-weight:bold; font-family:Times New Roman; border:3px solid black;">Upvotes</h1>
</center>