---

_You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

# Predicting Spam Messages


In [None]:
import pandas as pd
import numpy as np

# Load the spam dataset from the CSV file in the working directory.
spam_data = pd.read_csv('spam.csv')

# Recode the label column as integers: 1 where the label is 'spam', 0 otherwise.
spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
# Preview the first 10 rows of the recoded frame.
spam_data.head(10)

In [None]:
from sklearn.model_selection import train_test_split


# Split the message texts and their labels into train and test partitions.
# random_state=0 seeds the split so repeated runs produce the same partitions.
X_train, X_test, y_train, y_test = train_test_split(spam_data['text'], 
                                                    spam_data['target'], 
                                                    random_state=0)


What percentage of the documents in `spam_data` are spam?


In [None]:
def answer_one():
    """Return the percentage (0-100) of rows in ``spam_data`` labelled spam.

    Reads the notebook-level ``spam_data`` frame; a row counts as spam when
    its ``target`` value equals 1.
    """
    is_spam = spam_data.target == 1
    n_spam = int(is_spam.sum())
    n_docs = len(spam_data)
    return n_spam / n_docs * 100

In [None]:
# Compute the spam percentage of the full dataset.
answer_one()


Fit and transform the training data `X_train` using a Count Vectorizer with default parameters.

Next, fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1`, then find the area under the curve (AUC) score using the transformed test data.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

def answer_three():
    """Return the test-set AUC of a count-based multinomial Naive Bayes model.

    A ``CountVectorizer`` with default parameters is fitted on ``X_train``,
    a ``MultinomialNB(alpha=0.1)`` classifier is trained on the resulting
    count matrix, and its predictions for ``X_test`` are scored against
    ``y_test`` with ``roc_auc_score``.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    # CountVectorizer is not imported by any earlier cell, so import it here;
    # without this the function fails with NameError.
    from sklearn.feature_extraction.text import CountVectorizer

    vect = CountVectorizer()
    X_train_vectorized = vect.fit_transform(X_train)
    model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

    # NOTE(review): the score is computed from hard 0/1 predictions rather
    # than from predict_proba scores. Kept as-is so the returned value does
    # not change -- confirm this is what the exercise expects.
    y_pred = model.predict(vect.transform(X_test))
    return roc_auc_score(y_test, y_pred)

In [None]:
# Train the count-vector Naive Bayes model and report its test score.
answer_three()


Fit and transform the training data `X_train` using a Tfidf Vectorizer with default parameters.

What 20 features have the smallest tf-idf and what 20 have the largest tf-idf?


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def answer_four():
    """Return the features with the 20 smallest and 20 largest tf-idf values.

    A default ``TfidfVectorizer`` is fitted on ``X_train``. For every feature
    the maximum tf-idf value over all training documents is taken, and the
    features are ranked by that maximum.

    Returns:
        A tuple ``(smallest_tf_idfs, largest_tf_idfs)`` of two pandas Series
        indexed by feature name. The first holds the 20 lowest values in
        ascending order, the second the 20 highest in descending order.
    """
    vect = TfidfVectorizer()
    X_train_tf = vect.fit_transform(X_train)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when present and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        features = np.asarray(vect.get_feature_names_out())
    else:
        features = np.array(vect.get_feature_names())

    # Column-wise maximum of the sparse matrix, flattened to a 1-D array.
    max_tf_idfs = X_train_tf.max(0).toarray()[0]
    sorted_tf_idxs = max_tf_idfs.argsort()

    smallest_idxs = sorted_tf_idxs[:20]
    largest_idxs = sorted_tf_idxs[-20:][::-1]  # reversed: highest value first

    smallest_tf_idfs = pd.Series(max_tf_idfs[smallest_idxs],
                                 index=features[smallest_idxs])
    largest_tf_idfs = pd.Series(max_tf_idfs[largest_idxs],
                                index=features[largest_idxs])

    return (smallest_tf_idfs, largest_tf_idfs)

In [None]:
# Show the lowest- and highest-ranked tf-idf features.
answer_four()



Fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **3**.

Then fit a multinomial Naive Bayes classifier model with smoothing `alpha=0.1` and compute the area under the curve (AUC) score using the transformed test data.



In [None]:
def answer_five():
    """Return the test-set AUC of a tf-idf multinomial Naive Bayes model.

    The ``TfidfVectorizer`` is built with ``min_df=3`` and fitted on
    ``X_train``; a ``MultinomialNB(alpha=0.1)`` classifier trained on those
    features predicts ``X_test``, and the predictions are scored against
    ``y_test`` with ``roc_auc_score``.
    """
    tfidf = TfidfVectorizer(min_df=3)
    train_matrix = tfidf.fit_transform(X_train)

    classifier = MultinomialNB(alpha=0.1)
    classifier.fit(train_matrix, y_train)

    predictions = classifier.predict(tfidf.transform(X_test))
    return roc_auc_score(y_test, predictions)

In [None]:
# Train the tf-idf (min_df=3) Naive Bayes model and report its test score.
answer_five()



What is the average length of documents (number of characters) for not spam and spam documents?


In [None]:
def answer_six():
    """Return the mean document length in characters per class.

    Reads the notebook-level ``spam_data`` frame and measures every entry of
    its ``text`` column with ``len``.

    Returns:
        A tuple ``(not_spam_avg, spam_avg)``: the average length of documents
        whose ``target`` is 0, followed by the average for ``target`` 1.
    """
    labels = spam_data['target']
    spam_sizes = list(map(len, spam_data[labels == 1]['text']))
    ham_sizes = list(map(len, spam_data[labels == 0]['text']))

    spam_mean = sum(spam_sizes) / len(spam_sizes)
    ham_mean = sum(ham_sizes) / len(ham_sizes)

    return (ham_mean, spam_mean)

In [None]:
# Compute the average document length for not-spam and spam messages.
answer_six()

In [None]:
def add_feature(X, feature_to_add):
    """
    Append extra feature column(s) to a sparse feature matrix.

    ``feature_to_add`` is converted with ``csr_matrix`` and transposed, so a
    flat sequence with one value per row becomes a single new column, and a
    list of such sequences becomes one new column each. The combined matrix
    is returned in CSR format.
    """
    from scipy.sparse import csr_matrix, hstack

    extra_columns = csr_matrix(feature_to_add).T
    return hstack((X, extra_columns), format='csr')

In [None]:
from sklearn.svm import SVC

def answer_seven():
    """Return the test-set AUC of an SVM on tf-idf plus document length.

    Documents are vectorized with ``TfidfVectorizer(min_df=5)`` fitted on
    ``X_train``; the character length of each document is appended as one
    extra column via ``add_feature``. An ``SVC(C=10000)`` is trained on the
    result, and its predictions for ``X_test`` are scored against ``y_test``
    with ``roc_auc_score``.
    """
    # Character counts, one per document, for the extra column.
    train_lengths = list(map(len, X_train))
    test_lengths = list(map(len, X_test))

    vectorizer = TfidfVectorizer(min_df=5)
    train_features = add_feature(vectorizer.fit_transform(X_train), train_lengths)
    test_features = add_feature(vectorizer.transform(X_test), test_lengths)

    svm = SVC(C=10000)
    svm.fit(train_features, y_train)

    return roc_auc_score(y_test, svm.predict(test_features))


In [None]:
# Train the SVM on tf-idf plus document length and report its test score.
answer_seven()



Fit and transform the training data `X_train` using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than **5** and using **word n-grams from n=1 to n=3** (unigrams, bigrams, and trigrams).


Fit a Logistic Regression model with regularization `C=100`. Then compute the area under the curve (AUC) score using the transformed test data.



In [None]:
from sklearn.linear_model import LogisticRegression

def answer_nine():
    """Return the test-set AUC of a logistic regression on n-gram tf-idf.

    Documents are vectorized with ``TfidfVectorizer(min_df=5,
    ngram_range=(1,3))`` fitted on ``X_train``. Two extra columns are appended
    with ``add_feature``, in this order: the document length in characters,
    then the number of characters for which ``str.isnumeric`` is true. A
    ``LogisticRegression(C=100)`` is trained on the result, and its
    predictions for ``X_test`` are scored against ``y_test`` with
    ``roc_auc_score``.
    """
    train_lengths = list(map(len, X_train))
    test_lengths = list(map(len, X_test))
    train_digits = [sum(map(str.isnumeric, doc)) for doc in X_train]
    test_digits = [sum(map(str.isnumeric, doc)) for doc in X_test]

    vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1,3))

    # Train matrix: tf-idf columns, then length, then digit count.
    train_features = vectorizer.fit_transform(X_train)
    for extra in (train_lengths, train_digits):
        train_features = add_feature(train_features, extra)

    # Test matrix: same column layout as the train matrix.
    test_features = vectorizer.transform(X_test)
    for extra in (test_lengths, test_digits):
        test_features = add_feature(test_features, extra)

    classifier = LogisticRegression(C=100)
    classifier.fit(train_features, y_train)

    return roc_auc_score(y_test, classifier.predict(test_features))

In [None]:
# Train the n-gram logistic regression model and report its test score.
answer_nine()



Fit and transform the training data `X_train` using a Count Vectorizer ignoring terms that have a document frequency strictly lower than **5** and using **character n-grams from n=2 to n=5**.

To tell Count Vectorizer to use character n-grams pass in `analyzer='char_wb'` which creates character n-grams only from text inside word boundaries. This should make the model more robust to spelling mistakes.

Fit a Logistic Regression model with regularization `C=100`. Then compute the area under the curve (AUC) score using the transformed test data.


In [None]:
def answer_eleven():
    """Score a character n-gram logistic regression and list its extreme coefficients.

    Documents are vectorized with ``CountVectorizer(min_df=5,
    ngram_range=(2,5), analyzer='char_wb')`` fitted on ``X_train``. Three
    extra columns are appended with ``add_feature``, in this order: document
    length in characters, number of characters for which ``str.isnumeric`` is
    true, and number of matches of the regex ``\\W``. A
    ``LogisticRegression(C=100)`` is trained on the result.

    Returns:
        A tuple ``(auc, small_coeffs, large_coeffs)``:
        ``auc`` is ``roc_auc_score`` of the model's predictions for
        ``X_test`` against ``y_test``; ``small_coeffs`` lists the names of
        the 10 features with the smallest coefficients, smallest first;
        ``large_coeffs`` lists the names of the 10 features with the largest
        coefficients, largest first. The three added columns are named
        ``'length_of_doc'``, ``'digit_count'`` and ``'non_word_char_count'``.
    """
    # CountVectorizer is not imported by any earlier cell, so import it here;
    # without this the function fails with NameError.
    from sklearn.feature_extraction.text import CountVectorizer

    len_train = [len(x) for x in X_train]
    len_test = [len(x) for x in X_test]
    dig_train = [sum(char.isnumeric() for char in x) for x in X_train]
    dig_test = [sum(char.isnumeric() for char in x) for x in X_test]
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    nan_train = X_train.str.count(r'\W')
    nan_test = X_test.str.count(r'\W')

    cv = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb')
    X_train_counts = cv.fit_transform(X_train)
    X_test_counts = cv.transform(X_test)

    # Append the hand-made columns in the same order for train and test;
    # the feature-name list below relies on this order.
    for extra in (len_train, dig_train, nan_train):
        X_train_counts = add_feature(X_train_counts, extra)
    for extra in (len_test, dig_test, nan_test):
        X_test_counts = add_feature(X_test_counts, extra)

    model = LogisticRegression(C=100).fit(X_train_counts, y_train)
    y_pred = model.predict(X_test_counts)
    auc = roc_auc_score(y_test, y_pred)

    # get_feature_names() was removed in scikit-learn 1.2. Its replacement
    # returns an ndarray, so convert to a list before concatenating.
    if hasattr(cv, 'get_feature_names_out'):
        vocabulary = list(cv.get_feature_names_out())
    else:
        vocabulary = list(cv.get_feature_names())
    feature_names = np.array(
        vocabulary + ['length_of_doc', 'digit_count', 'non_word_char_count'])

    sorted_coef_index = model.coef_[0].argsort()
    small_coeffs = list(feature_names[sorted_coef_index[:10]])
    # Slice from the end with step -1: the 10 largest, largest first.
    large_coeffs = list(feature_names[sorted_coef_index[:-11:-1]])

    return (auc, small_coeffs, large_coeffs)

In [None]:
# Train the character n-gram model; report its score and extreme coefficients.
answer_eleven()