# Spam analysis and classification
Explore text message data and create models to predict if a message is spam or not. 

---

You are currently looking at **version 1.0** of this notebook.

---

In [None]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.sparse import csr_matrix, hstack

### Import data

In [None]:
# Read the text messages from spam.csv (relative path) and preview the first rows.
spam_data = pd.read_csv(filepath_or_buffer='spam.csv')
spam_data.head(n=5)

In [None]:
# Encode the label column as integers: 1 where the label is 'spam', 0 otherwise.
# NOTE: 'target' is overwritten in place, so re-running this cell without reloading
# the data compares integers against 'spam' and sets every label to 0.
is_spam = spam_data['target'].eq('spam')
spam_data['target'] = np.where(is_spam, 1, 0)
spam_data.head(n=10)

In [None]:
# Split messages and labels into train/test parts (default proportions);
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    spam_data['text'],
    spam_data['target'],
    random_state=0,
)
(X_train.shape, X_test.shape)

### Percentage of spam in the documents

In [None]:
# The mean of the 0/1 labels is the spam fraction; scale it to a percentage.
100 * spam_data['target'].mean()

### CountVectorizer
Fit the training data `X_train` using a Count Vectorizer with default parameters.

In [None]:
# Learn the vocabulary from the training messages, then list the tokens
# from longest to shortest and show the five longest plus the vocabulary size.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (it returns an ndarray, hence the .tolist()).
vect = CountVectorizer().fit(X_train)
names = sorted(vect.get_feature_names_out().tolist(), key=len, reverse=True)
names[:5], len(names)

### Multinomial Naive Bayes classifier
Fit and transform the training data `X_train` using a Count Vectorizer with default parameters.  
Next, fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`.   
Find the area under the curve (AUC) score using the transformed test data.

In [None]:
# Fit the vocabulary on the training messages only, then turn both splits
# into document-term count matrices with that same vocabulary.
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Show the sparse encoding of the first training row on its own line, so the
# cell's output tuple does not carry the None that print() returns.
print(X_train_vectorized[0])

# X_train / y_train keep the original row labels after the shuffle, so the first
# row must be read positionally with .iloc to match X_train_vectorized[0];
# X_train[0] would look up *label* 0, which may not even be in the training split.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
len(vect.get_feature_names_out()), X_test_vectorized.shape, X_train.iloc[0], y_train.iloc[0]

In [None]:
# Train multinomial Naive Bayes (smoothing alpha=0.1) on the count matrix.
nb = MultinomialNB(alpha=0.1)
nb.fit(X_train_vectorized, y_train)

# NOTE(review): the AUC below is computed from hard class predictions
# (nb.predict), not from predicted probabilities.
y_predict = nb.predict(X_test_vectorized)
roc_auc_score(y_true=y_test, y_score=y_predict)

### Tf-idf Vectorizer
Equivalent to CountVectorizer followed by TfidfTransformer:
 - Transform a count matrix to a normalized tf or tf-idf representation

Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency.  
This is a common term weighting scheme in information retrieval, that has also found good use in document classification.

The goal of using tf-idf instead of the raw frequencies of occurrence of a token in a given document is to **scale down the impact of tokens that occur very frequently in a given corpus** and that are hence empirically less informative than features that occur in a small fraction of the training corpus.

 - Fit and transform the training data `X_train` using a Tfidf Vectorizer with default parameters.
 - Get the 10 features with the smallest tf-idf and the 10 features with the largest tf-idf.
 - Put these features in two series, where each series is sorted by tf-idf value and then alphabetically by feature name.
 - The index of each series should be the feature name, and the data should be the tf-idf.
 - The series of 10 features with the smallest tf-idfs should be sorted smallest tf-idf first,
 - the series of 10 features with the largest tf-idfs should be sorted largest first.

In [None]:
# Fit tf-idf on the training messages and keep, for every token, its largest
# tf-idf weight over all training documents (column-wise max of the matrix).
vect = TfidfVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
tfidf_values = X_train_vectorized.max(0).toarray()[0]
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and already returns an ndarray.
feature_names = np.asarray(vect.get_feature_names_out())
print(X_train_vectorized.max(axis=0))

In [None]:
# Number of features to keep at each end of the tf-idf ranking.
N = 10

# Build the frame column-by-column so 'tf-idf' stays a numeric column; stacking
# the two arrays as rows and transposing would make both columns object dtype.
df = pd.DataFrame({'feature': feature_names, 'tf-idf': tfidf_values})

# Sort by tf-idf, breaking ties alphabetically by feature name (always ascending).
smallest_tfidfs = df.sort_values(by=['tf-idf', 'feature'], ascending=[True, True]).set_index('feature').head(N)
largest_tfidfs = df.sort_values(by=['tf-idf', 'feature'], ascending=[False, True]).set_index('feature').head(N)
smallest_tfidfs, '\n', largest_tfidfs

### MultinomialNB 
with Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **N**
 - fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`
 - compute the area under the curve (AUC) score using the transformed test data

In [None]:
# Minimum document frequency: terms seen in fewer than N training documents are dropped.
N = 3
vect = TfidfVectorizer(min_df=N)
vect.fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Multinomial Naive Bayes (alpha=0.1) on the tf-idf matrix, scored on the test split.
nb = MultinomialNB(alpha=0.1)
nb.fit(X_train_vectorized, y_train)
y_predict = nb.predict(X_test_vectorized)
roc_auc_score(y_true=y_test, y_score=y_predict)

### Average length of documents (# of chars) for ham and spam documents


In [None]:
# Boolean mask of the spam rows (target == 1).
spam_index = spam_data['target'].eq(1)
# Character count of every message.
spam_data['length'] = spam_data['text'].map(len)
# Summary statistics of message length for spam messages.
spam_lengths = spam_data.loc[spam_index, 'length']
spam_lengths.describe()

In [None]:
# Summary statistics of message length for ham (non-spam) messages.
ham_lengths = spam_data.loc[~spam_index, 'length']
ham_lengths.describe()

In [None]:
# Element-wise ratio of the ham summary statistics to the spam summary statistics.
ham_length_stats = spam_data.loc[~spam_index, 'length'].describe()
spam_length_stats = spam_data.loc[spam_index, 'length'].describe()
ham_length_stats / spam_length_stats

#### Add feature function

In [None]:
def add_feature(X, feature_to_add):
    """
    Append one or more feature columns to a sparse feature matrix.

    X is the existing feature matrix; feature_to_add is a single feature
    (one value per row of X) or a list of such features. Each feature
    becomes one new column on the right-hand side of X.

    Returns the combined matrix in CSR (scipy Compressed Sparse Row) format.
    """
    # csr_matrix lays each feature out as a row; transpose so it becomes a column.
    extra_columns = csr_matrix(feature_to_add).transpose()
    return hstack([X, extra_columns], format='csr')

### SVC 
with Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **N**.
 - add an additional feature, **the length of document (number of characters)** to the document-term matrix 
 - fit a Support Vector Classification model with regularization `C=10000`
 - compute the area under the curve (AUC) score using the transformed test data.

In [None]:
# Minimum document frequency: terms seen in fewer than N training documents are dropped.
N = 5
vect = TfidfVectorizer(min_df=N)
vect.fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Extra feature: number of characters in each message.
doc_len_train = list(map(len, X_train))
doc_len_test = list(map(len, X_test))

X_train_vectorized = add_feature(X_train_vectorized, doc_len_train)
X_test_vectorized = add_feature(X_test_vectorized, doc_len_test)

# Support vector classifier with C=10000, scored on the test split.
fit = SVC(C=10000)
fit.fit(X_train_vectorized, y_train)
y_pred = fit.predict(X_test_vectorized)
roc_auc_score(y_true=y_test, y_score=y_pred)

### Average number of Digits per document for ham and spam documents

In [None]:
# Boolean mask of the spam rows (target == 1).
spam_index = spam_data['target'] == 1
# Number of digit characters in each message. The pattern is a raw string:
# '\d' in a normal string literal is an invalid escape sequence and triggers a
# warning on current Python versions. Counting single-digit matches gives the
# same total as joining all digit runs and taking the length.
spam_data['digits'] = spam_data['text'].str.count(r'\d')

In [None]:
# Summary statistics of the digit count for ham (non-spam) messages.
ham_digits = spam_data.loc[~spam_index, 'digits']
ham_digits.describe()

In [None]:
# Summary statistics of the digit count for spam messages.
spam_digits = spam_data.loc[spam_index, 'digits']
spam_digits.describe()

### Logistic Regression model with regularization
- fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **5**
- using **word n-grams from n=1 to n=3** (unigrams, bigrams, and trigrams).

Using this document-term matrix and the following additional features:
* the length of document (number of characters)
* **number of digits per document**


1. fit a Logistic Regression model with regularization `C=100`
2. compute the area under the curve (AUC) score using the transformed test data.

In [None]:
# Minimum document frequency: terms seen in fewer than N training documents are dropped.
N = 5
# Tf-idf over word unigrams, bigrams and trigrams.
vect = TfidfVectorizer(min_df=N, ngram_range=(1, 3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# add feature: # of chars
doc_len_train = [len(x) for x in X_train]
doc_len_test = [len(x) for x in X_test]

X_train_vectorized = add_feature(X_train_vectorized, doc_len_train)
X_test_vectorized = add_feature(X_test_vectorized, doc_len_test)

# add feature: # of digits
# Raw-string pattern: '\d' in a normal string literal is an invalid escape
# sequence. Counting single-digit matches equals the total length of all digit runs.
dig_len_train = [len(re.findall(r'\d', x)) for x in X_train]
dig_len_test = [len(re.findall(r'\d', x)) for x in X_test]

X_train_vectorized = add_feature(X_train_vectorized, dig_len_train)
X_test_vectorized = add_feature(X_test_vectorized, dig_len_test)

# fit & predict model
lr = LogisticRegression(C=100).fit(X_train_vectorized, y_train)
y_pred = lr.predict(X_test_vectorized)
roc_auc_score(y_test, y_pred)

### Average # of non-word characters 
anything other than a letter, digit or underscore

In [None]:
# Boolean mask of the spam rows (target == 1).
spam_index = spam_data['target'] == 1
# Number of non-word characters (anything but a letter, digit or underscore) in
# each message. The pattern is a raw string: '\W' in a normal string literal is
# an invalid escape sequence. Counting single-character matches gives the same
# total as joining all non-word runs and taking the length.
spam_data['len_'] = spam_data['text'].str.count(r'\W')

In [None]:
# Summary statistics of the non-word character count for ham (non-spam) messages.
ham_non_word = spam_data.loc[~spam_index, 'len_']
ham_non_word.describe()

In [None]:
# Summary statistics of the non-word character count for spam messages.
spam_non_word = spam_data.loc[spam_index, 'len_']
spam_non_word.describe()

### Logistic Regression model with regularization

Fit and transform the training data X_train using a Count Vectorizer 
 - ignoring terms that have a document frequency strictly lower than **5** 
 - using **character n-grams from n=2 to n=5.**
 - to use character n-grams, pass in `analyzer='char_wb'`; character n-grams are more robust to spelling mistakes than word n-grams

Using this document-term matrix and the following additional features:
* the length of document (number of characters)
* number of digits per document
* number of non-word characters
['length_of_doc', 'digit_count', 'non_word_char_count']


1. fit a Logistic Regression model with regularization C=100. 
2. compute the area under the curve (AUC) score using the transformed test data.
3. find the 10 smallest and 10 largest coefficients from the model

In [None]:
features_to_add = ['length_of_doc', 'digit_count', 'non_word_char_count']


def _extra_feature_rows(docs):
    """Return one list per name in features_to_add, each with one value per document.

    Order matches features_to_add: character length, number of digit
    characters, number of non-word characters.
    """
    # Raw-string patterns: '\d' / '\W' in normal string literals are invalid
    # escape sequences. Counting single-character matches equals the total
    # length of all matched runs.
    return [
        [len(doc) for doc in docs],
        [len(re.findall(r'\d', doc)) for doc in docs],
        [len(re.findall(r'\W', doc)) for doc in docs],
    ]


# Character n-grams (2 to 5 characters, built inside word boundaries), ignoring
# terms with a document frequency strictly lower than 5, i.e. min_df=5
# (this was min_df=6, which also dropped terms occurring in exactly 5 documents).
vect = CountVectorizer(min_df=5, ngram_range=(2, 5), analyzer='char_wb').fit(X_train)
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

# Append the three hand-made features as extra columns, identically for both splits.
X_train_vectorized = add_feature(X_train_vectorized, _extra_feature_rows(X_train))
X_test_vectorized = add_feature(X_test_vectorized, _extra_feature_rows(X_test))

In [None]:
# Train logistic regression (C=100) on the char n-gram matrix plus extra features.
lr = LogisticRegression(C=100)
lr.fit(X_train_vectorized, y_train)

# Score the hard class predictions on the test split.
y_predict = lr.predict(X_test_vectorized)
roc_auc_score(y_true=y_test, y_score=y_predict)

In [None]:
# Feature names in matrix column order: vocabulary terms first, then the three
# appended features. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns an ndarray, so convert it to a list before
# concatenating.
features = list(vect.get_feature_names_out()) + ['length_of_doc', 'digit_count', 'non_word_char_count']

# Column positions ordered from the smallest coefficient to the largest; the
# following cells index with this name.
sorted_coef_index = lr.coef_[0].argsort()

# One row per feature, in column order, so positional indexing with
# sorted_coef_index selects the matching feature together with its coefficient.
df = pd.DataFrame({'feature': features, 'coefficient': lr.coef_[0]})

In [None]:
# Positions of the coefficients ordered smallest to largest. Defined here so the
# cell does not fail with a NameError on a fresh kernel.
sorted_coef_index = lr.coef_[0].argsort()
# Rows for the 10 smallest coefficients.
df.iloc[sorted_coef_index[:10]]

In [None]:
# Positions of the coefficients ordered smallest to largest. Defined here so the
# cell does not fail with a NameError on a fresh kernel.
sorted_coef_index = lr.coef_[0].argsort()
# Rows for the 10 largest coefficients, largest first.
df.iloc[sorted_coef_index[:-11:-1]]