<a href="https://colab.research.google.com/github/c-w-m/anlp-tf2/blob/master/chapter1-nlp-essentials/SMS_Spam_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Note that this notebook should be uploaded to Google Colab and run there

In [None]:
#%tensorflow_version 2.x
import tensorflow as tf
#from tf.keras.models import Sequential
#from tf.keras.layers import Dense
import os
import io

tf.__version__

'2.4.1'

# Download Data

In [None]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                  origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)

# Unzip the file into a folder
!unzip $path_to_zip -d data

Archive:  /root/.keras/datasets/smsspamcollection.zip
replace data/SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# optional step - helps if colab gets disconnected
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Test data reading
# Prefer the copy on Google Drive when it exists (only the case if Drive was
# mounted and the file was copied there); otherwise fall back to the folder
# the download cell unzipped into.
drive_path = '/content/drive/My Drive/colab-data/SMSSpamCollection'
local_path = 'data/SMSSpamCollection'
data_path = drive_path if os.path.exists(drive_path) else local_path

# The context manager closes the file handle; the explicit encoding keeps the
# result independent of the platform's default locale.
with io.open(data_path, encoding='utf-8') as data_file:
  lines = data_file.read().strip().split('\n')
lines[0]

# Pre-Process Data

In [None]:
# Turn every "<label>\t<message>" line into a (label, text) tuple where the
# label is 1 for spam and 0 for anything else, counting spam along the way.
spam_dataset = []
count = 0
for line in lines:
  # Split on the first tab only, so a tab inside the message body cannot
  # break the two-value unpacking.
  label, text = line.split('\t', 1)
  is_spam = int(label.lower().strip() == 'spam')
  spam_dataset.append((is_spam, text.strip()))
  count += is_spam

print(spam_dataset[0])
print("Spam: ", count)

# Data Normalization

In [None]:
import pandas as pd 

In [None]:
df = pd.DataFrame(spam_dataset, columns=['Spam', 'Message'])

In [None]:
import re

# Normalization functions

def message_length(x):
  """Return the total number of characters in the message ``x``."""
  n_chars = len(x)
  return n_chars

def num_capitals(x):
  """Return how many ASCII upper-case letters (A-Z) occur in ``x``.

  Only the unaccented English alphabet is matched, so capitals from other
  scripts or with diacritics are not counted.
  """
  return len(re.findall(r'[A-Z]', x))

def num_punctuation(x):
  """Return the number of punctuation/symbol characters in ``x``.

  A character is counted when it is neither a word character (letter, digit
  or underscore) nor whitespace. Whitespace is excluded explicitly: a bare
  ``\\W`` pattern would count every space between words as punctuation.
  """
  _, count = re.subn(r'[^\w\s]', '', x)
  return count



In [None]:
# Derive one numeric feature column per normalization function, each computed
# element-wise from the raw message text.
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)

In [None]:
df.describe()

In [None]:
# 80/20 split: sample 80% of the rows for training, then use every row whose
# index was not sampled as the test set.
train=df.sample(frac=0.8,random_state=42) #random state is a seed value
test=df.drop(train.index)

In [None]:
train.describe()

In [None]:
test.describe()

# Model Building

In [None]:
# Basic 1-layer neural network model for evaluation
def make_model(input_dims=3, num_units=12):
  """Build and compile a small feed-forward binary classifier.

  Args:
    input_dims: number of input features fed to the hidden layer.
    num_units: number of units in the single ReLU hidden layer.

  Returns:
    A compiled ``tf.keras.Sequential`` model: one dense ReLU layer followed
    by a single sigmoid output unit, trained with binary cross-entropy and
    the Adam optimizer, reporting accuracy.
  """
  # Densely-connected hidden layer; it also fixes the input width.
  hidden_layer = tf.keras.layers.Dense(num_units,
                                       input_dim=input_dims,
                                       activation='relu')
  # Single sigmoid unit producing the binary output.
  output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

  model = tf.keras.Sequential([hidden_layer, output_layer])
  model.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  return model

In [None]:
# Model inputs are the three hand-crafted numeric features; the target is the
# 'Spam' column, selected with a list so it stays a one-column DataFrame.
x_train = train[['Length', 'Punctuation', 'Capitals']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals']]
y_test = test[['Spam']]

In [None]:
x_train

In [None]:
model = make_model()

In [None]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

In [None]:
model.evaluate(x_test, y_test)

In [None]:
# Sequential.predict_classes is no longer available in current tf.keras.
# For a single sigmoid output, thresholding the predicted probability at 0.5
# produces the same 0/1 class labels.
y_train_pred = (model.predict(x_train) > 0.5).astype("int32")

In [None]:
# confusion matrix
tf.math.confusion_matrix(tf.constant(y_train.Spam), 
                         y_train_pred)

In [None]:
sum(y_train_pred)

In [None]:
# Sequential.predict_classes is no longer available in current tf.keras.
# Threshold the sigmoid probability at 0.5 to get 0/1 class labels, then
# compare them against the true test labels.
y_test_pred = (model.predict(x_test) > 0.5).astype("int32")
tf.math.confusion_matrix(tf.constant(y_test.Spam), y_test_pred)

# Tokenization and Stop Word Removal

In [None]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()

In [None]:
!pip install stanza  # StanfordNLP has become https://github.com/stanfordnlp/stanza/

In [None]:
import stanza

In [None]:
en = stanza.download('en') 

In [None]:
en = stanza.Pipeline(lang='en')

In [None]:
sentence

In [None]:
tokenized = en(sentence)

In [None]:
len(tokenized.sentences)

In [None]:
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

## Dependency Parsing Example

In [None]:
en2 = stanza.Pipeline(lang='en')
pr2 = en2("Hari went to school")
for snt in pr2.sentences:
  for word in snt.tokens:
    print(word)
  print("<End of Sentence>")

## Japanese Tokenization Example

In [None]:
jp = stanza.download('ja') 

In [None]:
jp = stanza.Pipeline(lang='ja')

In [None]:
jp_line = jp("選挙管理委員会")

In [None]:
for snt in jp_line.sentences:
  for word in snt.tokens:
    print(word.text)

# Adding Word Count Feature 

In [None]:
def word_counts(x, pipeline=en):
  """Return the number of tokens ``pipeline`` finds in the text ``x``.

  The text is run through the pipeline and the token counts of all resulting
  sentences are added up. ``pipeline`` defaults to the ``en`` object that
  exists when this function is defined.
  """
  parsed = pipeline(x)
  return sum(len(sent.tokens) for sent in parsed.sentences)


In [None]:
#en = snlp.Pipeline(lang='en', processors='tokenize')
df['Words'] = df['Message'].apply(word_counts)

In [None]:
df.describe()

In [None]:
#train=df.sample(frac=0.8,random_state=42) #random state is a seed value
#test=df.drop(train.index)

# train and test were split off before df gained its 'Words' column. The
# counts were already computed for every message in df, so look them up by
# index instead of running the slow tokenization pipeline over all messages a
# second time.
train['Words'] = df.loc[train.index, 'Words']
test['Words'] = df.loc[test.index, 'Words']


In [None]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)


In [None]:
model.fit(x_train, y_train, epochs=10, batch_size=10)

In [None]:
model.evaluate(x_test, y_test)

## Stop Word Removal

In [None]:
!pip install stopwordsiso

In [None]:
import stopwordsiso as stopwords

stopwords.langs()

In [None]:
sorted(stopwords.stopwords('en'))

In [None]:
en_sw = stopwords.stopwords('en')

def word_counts(x, pipeline=en):
  """Return the number of tokens in ``x`` that are not English stop words.

  ``x`` is tokenized with ``pipeline``; a token is counted when its
  lower-cased text is absent from the ``en_sw`` stop-word set.
  """
  parsed = pipeline(x)
  return sum(
      1
      for sent in parsed.sentences
      for tok in sent.tokens
      if tok.text.lower() not in en_sw
  )

In [None]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)

In [None]:
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)
#model = make_model(input_dims=3)

model.fit(x_train, y_train, epochs=10, batch_size=10)

## POS Based Features

In [None]:
en = stanza.Pipeline(lang='en')

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

In [None]:
def print_pos(doc):
    """Format a parsed document as ``word/UPOS`` pairs, one sentence per line.

    For every token the text and universal POS tag of its first word are
    joined with "/" and followed by a space; each sentence is terminated by a
    newline. The formatted string is returned (nothing is printed here).
    """
    rendered_sentences = []
    for sent in doc.sentences:
        tagged = [f"{tok.words[0].text}/{tok.words[0].upos} "
                  for tok in sent.tokens]
        rendered_sentences.append("".join(tagged) + "\n")
    return "".join(rendered_sentences)

In [None]:
print(print_pos(pos))

In [None]:
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  """Count tokens in ``x`` that are neither stop words nor punctuation/symbols.

  A token is counted when its lower-cased text is not in ``en_sw`` and the
  universal POS tag of its first word is not PUNCT or SYM.
  """
  parsed = pipeline(x)
  return sum(
      1
      for sent in parsed.sentences
      for tok in sent.tokens
      if tok.text.lower() not in en_sw
      and tok.words[0].upos not in ['PUNCT', 'SYM']
  )

In [None]:
print(word_counts(txt), word_counts_v3(txt))

In [None]:
train['Test'] = 0
train.describe()

In [None]:
def word_counts_v3(x, pipeline=en):
  """Compute word-count and punctuation-ratio features for the text ``x``.

  Returns:
    A ``pd.Series`` with two float entries:
      * ``Words_NoPunct`` - number of tokens that are not in ``en_sw`` and
        whose first word is not tagged PUNCT or SYM.
      * ``Punct`` - number of tokens that are not in ``en_sw`` and are tagged
        PUNCT or SYM, divided by the total number of tokens; 0.0 when the
        pipeline produces no tokens at all.
  """
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    totals += len(sentence.tokens)  # every token counts toward the denominator
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          if token.words[0].upos not in ['PUNCT', 'SYM']:
            count += 1.
          else:
            non_word += 1.
  # Guard the ratio: a text yielding no tokens would otherwise raise
  # ZeroDivisionError and abort the whole DataFrame.apply call.
  non_word = non_word / totals if totals else 0.
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])

In [None]:
x = train[:10]
x.describe()

In [None]:
# Compute the word_counts_v3 features for every training message and append
# them to train as new columns.
# NOTE: train is reassigned to the concatenated frame, so running this cell
# again appends the same columns a second time.
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)
train.describe()

In [None]:
test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)
test.describe()

In [None]:
# NOTE(review): x holds only the first 10 rows of train, whereas train_tmp was
# computed for every row of train. Column-wise concat aligns on the index, so
# rows outside x presumably carry NaN in x's columns - confirm this is intended.
z = pd.concat([x, train_tmp], axis=1)
z.describe()

In [None]:
z.loc[z['Spam']==0].describe()

In [None]:
z.loc[z['Spam']==1].describe()

In [None]:
aa = [word_counts_v3(y) for y in x['Message']]

In [None]:
ab = pd.DataFrame(aa)
ab.describe()

# Lemmatization

In [None]:

# Sample paragraph for the lemmatization demo. The words "understanding" and
# "understand" are written without a stray line-break hyphen, so the
# lemmatizer receives real words.
text = "Stemming is aimed at reducing vocabulary and aid understanding of" +\
       " morphological processes. This helps people understand the" +\
       " morphology of words and reduce size of corpus."

lemma = en(text)

In [None]:
# Render every token as "lemma/UPOS " (taken from the token's first word),
# one sentence per line, then print the result.
lemmas = "".join(
    "".join(tok.words[0].lemma + "/" + tok.words[0].upos + " "
            for tok in sent.tokens) + "\n"
    for sent in lemma.sentences
)

print(lemmas)

# TF-IDF Based Model


In [None]:
# if not installed already
!pip install sklearn

In [None]:
corpus = [
          "I like fruits. Fruits like bananas",
          "I love bananas but eat an apple",
          "An apple a day keeps the doctor away"
]


## Count Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary of the toy corpus and build its document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# get_feature_names() has been removed from current scikit-learn releases;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
vectorizer.get_feature_names_out()

In [None]:
X.toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X.toarray())

In [None]:
query = vectorizer.transform(["apple and bananas"])

cosine_similarity(X, query)

## TF-IDF Vectorization

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts in X with TF-IDF (IDF smoothing switched off).
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

# Label the columns with the vocabulary terms. get_feature_names() has been
# removed from current scikit-learn releases; get_feature_names_out() is its
# replacement (requires scikit-learn >= 1.0).
pd.DataFrame(tfidf.toarray(), 
             columns=vectorizer.get_feature_names_out())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf = TfidfVectorizer(binary=True)
X = tfidf.fit_transform(train['Message']).astype('float32')
X_test = tfidf.transform(test['Message']).astype('float32')

In [None]:
X.shape

In [None]:
_, cols = X.shape
model2 = make_model(cols)  # to match tf-idf dimensions

# Label-encoded and one-hot versions of the target.
# to_categorical comes from tf.keras.utils (the same Keras used to build the
# model) rather than the standalone keras.utils.np_utils module, which is not
# present in current Keras releases. LabelEncoder expects a 1-D array, so the
# one-column y_train frame is flattened first.
# NOTE: neither y nor dummy_y_train is used by the fit call below, which
# trains directly on the 0/1 labels in y_train.
lb = LabelEncoder()
y = lb.fit_transform(y_train.values.ravel())
dummy_y_train = tf.keras.utils.to_categorical(y)
model2.fit(X.toarray(), y_train, epochs=10, batch_size=10)

In [None]:
model2.evaluate(X_test.toarray(), y_test)

In [None]:
train.loc[train.Spam == 1].describe() 

# Word Vectors

In [None]:
# memory limit may be exceeded. Try deleting some objects before running this next section
# or copy this section to a different notebook.
!pip install gensim

In [None]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api


In [None]:
api.info()

__deprecated__
```python
model_w2v = api.load("word2vec-google-news-300")
```
__runtime error__
```shell
[=========-----------------------------------------] 18.0% 299.5/1662.8MB downloaded
---------------------------------------------------------------------------
ConnectionResetError                      Traceback (most recent call last)
<ipython-input-135-1bce0bfee7a7> in <module>()
----> 1 model_w2v = api.load("word2vec-google-news-300")

7 frames
/usr/lib/python3.7/ssl.py in read(self, len, buffer)
    927         try:
    928             if buffer is not None:
--> 929                 return self._sslobj.read(len, buffer)
    930             else:
    931                 return self._sslobj.read(len)

ConnectionResetError: [Errno 104] Connection reset by peer
```

* [bhaettasch/gensim_word2vec_demo.py](https://gist.github.com/bhaettasch/d7f4e22e79df3c8b6c20)

__growupboron__ comment on Aug 24, 2020
Since this example is deprecated, I created a [Google Colab demo](https://colab.research.google.com/drive/1aLNhDu1qtQnNIvN5Hhl0UhNBBe84N9S4?usp=sharing) for the same.

In [None]:
# download and extract the Google News Dataset
!wget --load-cookies /tmp/cookies.txt "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM" -O GoogleNews-vectors-negative300.bin.gz && rm -rf /tmp/cookies.txt
!gunzip GoogleNews-vectors-negative300.bin.gz

In [None]:
import gensim

# Load pretrained model (since intermediate data is not included, the model cannot be refined with additional data)
#model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, norm_only=True) -> Deprecated
model_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) #without *norm_only* param


In [None]:
model_w2v.most_similar("cookies",topn=10)

In [None]:
model_w2v.doesnt_match(["USA","Canada","India","Tokyo"])

In [None]:
king = model_w2v['king']
man = model_w2v['man']
woman = model_w2v['woman']

queen = king - man + woman  
model_w2v.similar_by_vector(queen)