# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import string
import numpy as np 
import random
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.offline import iplot

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk

import keras
from keras.models import Sequential
from keras.initializers import Constant
from keras.layers import (LSTM, 
                          Embedding, 
                          BatchNormalization,
                          Dense, 
                          TimeDistributed, 
                          Dropout, 
                          Bidirectional,
                          Flatten, 
                          GlobalMaxPool1D)
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    classification_report,
    accuracy_score
)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from keras.utils import pad_sequences
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Model
from keras.optimizers import RMSprop

from keras.callbacks import EarlyStopping

# Load Data

In [None]:
# Read the raw SMS spam CSV (latin-1 encoded, comma separated) and preview it.
spam_csv_path = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(spam_csv_path, delimiter=',', encoding='latin-1')
df.head()

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Drop the three unnamed columns and give the remaining ones descriptive names.
# Written as a chained, non-inplace expression with errors='ignore' so that
# re-running this cell does not raise KeyError once the columns are gone
# (rename already ignores labels that are no longer present).
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .rename(columns={"v1": "target", "v2": "text"})
)

In [None]:
# Preview the frame after dropping the unnamed columns and renaming v1/v2.
df.head()

# Exploratory Data Analysis (EDA)

In [None]:
# Count messages per class. Labels (x) and counts (y) are taken from the same
# sorted Series so they can never get out of step with each other, instead of
# pairing a hard-coded label list with the groupby output by position.
target_counts = df["target"].value_counts().sort_index()
x = target_counts.index.tolist()
y = target_counts.values

In [None]:
# Bar chart of the per-class message counts held in x (labels) and y (counts).
title_spec = dict(text='Proportional Distribution of the Target Variable',
                  y=0.9,
                  x=0.5,
                  xanchor='center',
                  yanchor='top')
layout = go.Layout(title=title_spec, template='plotly_dark')

count_bars = go.Bar(x=x, y=y,
                    text=y, textposition='auto',
                    marker_color="slateblue")

fig = go.Figure(data=[count_bars], layout=layout)
fig.show()

In [None]:
# Pie chart of the class shares; the second slice is pulled out of the pie.
colors = ["slateblue", "darkred"]

# Compute the class frequencies once and reuse them for labels and values.
target_share = df['target'].value_counts()
share_pie = go.Pie(labels=target_share.index,
                   values=target_share.values,
                   pull=[0, 0.25])
fig = go.Figure(data=[share_pie])

slice_border = dict(color='lightgray', width=1.5)
fig.update_traces(hoverinfo='label',
                  textinfo='percent',
                  textfont_size=20,
                  textposition='auto',
                  marker=dict(colors=colors, line=slice_border))

fig.update_layout(title=dict(text="Percentages of the Target Values",
                             y=0.9,
                             x=0.5,
                             xanchor='center',
                             yanchor='top'),
                  template='plotly_dark')

iplot(fig)

In [None]:
# Number of words per message. str.split() with no argument splits on any run
# of whitespace, so repeated, leading or trailing spaces no longer add empty
# tokens to the count (split(' ') counted each of them as a word).
df['text_len'] = df['text'].apply(lambda message: len(message.split()))

In [None]:
# Preview the frame with the newly added text_len column.
df.head()

In [None]:
# Frequency of each message length (text_len), computed separately per class
# and ordered by length.
ham = df.loc[df["target"] == "ham", "text_len"].value_counts().sort_index()
spam = df.loc[df["target"] == "spam", "text_len"].value_counts().sort_index()

# One filled area trace per class, ham first and spam second.
fig = go.Figure()
for label, length_counts in (("ham", ham), ("spam", spam)):
    fig.add_trace(go.Scatter(x=length_counts.index,
                             y=length_counts.values,
                             name=label,
                             fill="tozeroy"))

fig.update_layout(title=dict(text="Distributions of Target Values",
                             y=0.9,
                             x=0.5,
                             xanchor='center',
                             yanchor='top'),
                  template='plotly_dark')

# Fix the visible window of both axes.
fig.update_xaxes(range=[0, 50])
fig.update_yaxes(range=[0, 450])
fig.show()

# Data Preprocessing

In [None]:
def clean_text(text):
    """Normalize a raw message for bag-of-words style modelling.

    The input is converted to ``str`` and lower-cased, then the following are
    removed in order: text in square brackets, URLs (``http(s)://...`` and
    ``www....``), angle-bracket tags, ASCII punctuation, newline characters,
    and any token that contains a digit.

    Parameters
    ----------
    text : object
        Value to clean; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text. Whitespace around removed pieces is left as-is, so
        the result may contain consecutive or trailing spaces.
    """
    # Patterns are raw strings so that regex escapes such as \S, \w and \d are
    # not parsed as (invalid) Python string escapes.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # Newlines are deleted, not replaced by a space, so words on either side
    # of a line break are joined together.
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
# Run every message through clean_text and store the result back in 'text'.
df['text'] = df['text'].map(clean_text)

In [None]:
# Preview the frame after clean_text has been applied to the text column.
df.head()

In [None]:
# df["text"]=df.apply(lambda x: nltk.word_tokenize(x["text"]), axis=1)

In [None]:
# English stop words from NLTK. `stop_words` stays a list; the set copy gives
# constant-time membership checks instead of scanning the list for every token.
stop_words = stopwords.words("english")
stop_word_set = set(stop_words)


def _drop_stop_words(sentence):
    """Return `sentence` without the whitespace-separated tokens that are stop words."""
    return " ".join(token for token in sentence.split() if token not in stop_word_set)


df["text"] = df["text"].apply(_drop_stop_words)

In [None]:
# Preview the text column after stop-word removal.
df.head()

In [None]:
# Reduce every word of every message to its stem with the English Snowball stemmer.
stemmer = nltk.SnowballStemmer("english")


def _stem_sentence(sentence):
    """Stem each whitespace-separated token and re-join them with single spaces."""
    stemmed_tokens = [stemmer.stem(token) for token in sentence.split()]
    return " ".join(stemmed_tokens)


df["text"] = df["text"].apply(_stem_sentence)

In [None]:
# Preview the text column after stemming.
df.head()

In [None]:
# Build one long string from all processed messages and render it as a word cloud.
text = " ".join(df["text"])

wc = WordCloud(background_color = "black", width = 1200, height = 600,
               contour_width = 0, contour_color = "#410F01", max_words = 1000,
               scale = 1, collocations = False, repeat = True, min_font_size = 1)

wc.generate(text)

plt.figure(figsize = [15, 7])
plt.title("Top Words in the Text")
plt.imshow(wc)
plt.axis("off")
# plt.show must be called; the bare attribute reference `plt.show` only
# evaluated to the function object and never invoked it.
plt.show()

In [None]:
# Replace the string class labels in 'target' with integer codes.
# The fitted encoder is kept in `lb`.
lb = LabelEncoder().fit(df["target"])
df["target"] = lb.transform(df["target"])

In [None]:
# Preview the frame after the target column has been label-encoded.
df.head()

In [None]:
# Features are the processed messages, labels the encoded target.
X, y = df['text'], df['target']

# Hold-out split with a fixed seed (default test fraction).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Learn the vocabulary on the training messages only, then build the
# document-term matrices for both splits with that vocabulary.
vec = CountVectorizer()
X_train_dtm = vec.fit_transform(X_train)
X_test_dtm = vec.transform(X_test)

In [None]:
# Settings for the sequence model input: tokenizer vocabulary cap and the
# fixed length every sequence is padded/truncated to.
max_words, max_len = 1000, 150

# Fit the Keras tokenizer on the training messages only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Integer-encode the training messages and pad them to a common length.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)

# Models

In [None]:
# Multinomial Naive Bayes on the bag-of-words counts (fit returns the estimator).
nb = MultinomialNB().fit(X_train_dtm, y_train)

# Hard class predictions and the probability of class 1 for the test split.
y_pred_class = nb.predict(X_test_dtm)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

nb_test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(nb_test_accuracy)

In [None]:
# Raw text -> token counts -> TF-IDF weights -> gradient-boosted trees.
xgb_params = dict(
    learning_rate=0.1,
    max_depth=7,
    n_estimators=80,
    use_label_encoder=False,
    eval_metric='auc',
)

pipe = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb.XGBClassifier(**xgb_params)),
])

In [None]:
# Fit the text pipeline on the raw training messages.
pipe.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_class = pipe.predict(X_test)
y_pred_train = pipe.predict(X_train)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)

print('Train: {}'.format(train_accuracy))
print('Test: {}'.format(test_accuracy))

In [None]:
def LSTM_model():
    """Build the (uncompiled) LSTM binary classifier.

    Architecture, in order: an input of length ``max_len``, an embedding over
    ``max_words`` tokens with 50 dimensions, an LSTM with 64 units, a
    256-unit dense layer with ReLU, dropout at rate 0.5, and a single-unit
    dense output with a sigmoid activation.

    Reads the module-level ``max_len`` and ``max_words`` settings.

    Returns
    -------
    keras.models.Model
        The functional model mapping the input to the sigmoid output.
    """
    sequence_input = Input(shape=[max_len], name='inputs')

    # Token ids -> dense vectors -> recurrent summary of the sequence.
    embedded = Embedding(max_words, 50, input_length=max_len)(sequence_input)
    encoded = LSTM(64)(embedded)

    # Fully connected head with dropout.
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single sigmoid output unit.
    logit = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(logit)

    return Model(inputs=sequence_input, outputs=probability)

In [None]:
# Instantiate the network, print its layer summary, then compile it for
# binary classification with RMSprop and accuracy tracking.
model = LSTM_model()
model.summary()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the padded training sequences, holding out 20% for validation.
# Training stops early based on the monitored validation loss.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001)
model.fit(
    sequences_matrix,
    y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
# Encode the test messages with the tokenizer fitted on the training data and
# pad them to the same fixed length as the training sequences.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [None]:
# Evaluate the trained network on the held-out test sequences; the returned
# metric values are kept in `accr`.
accr = model.evaluate(x=test_sequences_matrix, y=y_test)