In [1]:
import numpy as np
import pandas as pd

spam_data = pd.read_csv("D:\Python\data\spam.csv")

spam_data['target'] = np.where(spam_data['target'] == 'spam', 1, 0)
spam_data.head()

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test =train_test_split(spam_data['text'], spam_data['target'], random_state=0)

print('Percentage of the documents in spam_data that are spam: ', spam_data['target'].mean()*100)

Percentage of the documents in spam_data that are spam:  13.406317300789663


In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer().fit(X_train)

print('The longest token in the vocabulary:', sorted(count_vect.get_feature_names_out(), key=len, reverse=True)[0])

The longest token in the vocabulary: com1win150ppmx3age16subscription


In [4]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = count_vect.transform(X_train)

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)
predictions = model.predict(count_vect.transform(X_test))
#y_proba1 = model.predict_proba(count_vect.transform(X_test))[:,1]
from sklearn.metrics import roc_auc_score
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.9720812182741116


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer().fit(X_train)

X_train_vectorized_tfidf = tfidf_vect.transform(X_train)
X_train_vectorized_tfidf

<4179x7354 sparse matrix of type '<class 'numpy.float64'>'
	with 55130 stored elements in Compressed Sparse Row format>

In [6]:
# find the max tf-idf value for every feature
feature_max_tfidf = X_train_vectorized_tfidf.max(0).toarray()[0]
feature_max_tfidf

array([0.23731416, 0.36138463, 0.22862438, ..., 0.27814246, 0.36296157,
       0.3487995 ])

In [7]:
feature_names = tfidf_vect.get_feature_names_out()

max_tfdif = pd.Series(feature_max_tfidf, index=feature_names)

print('20 features with the smallest max tf-idf:\n{}\n'.format(max_tfdif.sort_index().sort_values(kind='stable')[:20]))

print('20 features with the largest max tf-idf:\n{}\n'.format(max_tfdif.sort_index(ascending=False).sort_values(kind='stable')[:-21:-1]))

20 features with the smallest max tf-idf:
aaniye          0.074475
athletic        0.074475
chef            0.074475
companion       0.074475
courageous      0.074475
dependable      0.074475
determined      0.074475
exterminator    0.074475
healer          0.074475
listener        0.074475
organizer       0.074475
pest            0.074475
psychiatrist    0.074475
psychologist    0.074475
pudunga         0.074475
stylist         0.074475
sympathetic     0.074475
venaam          0.074475
afternoons      0.091250
approaching     0.091250
dtype: float64

20 features with the largest max tf-idf:
146tf150p    1.000000
645          1.000000
anything     1.000000
anytime      1.000000
beerage      1.000000
done         1.000000
er           1.000000
havent       1.000000
home         1.000000
lei          1.000000
nite         1.000000
ok           1.000000
okie         1.000000
thank        1.000000
thanx        1.000000
too          1.000000
where        1.000000
yup          1.000000
tick 

In [8]:
tfidf_vect2 = TfidfVectorizer(min_df=3).fit(X_train)

X_train_vectorized_tfidf2 = tfidf_vect2.transform(X_train)

model = MultinomialNB(alpha=0.1).fit(X_train_vectorized_tfidf2, y_train)
y_proba = model.predict_proba(tfidf_vect2.transform(X_test))[:,1]

print('AUC: ', roc_auc_score(y_test, y_proba))

AUC:  0.9954968337775665


In [9]:
avg_len_not_spam = spam_data['text'][spam_data['target']==0].aggregate(len).mean()

avg_len_spam = spam_data['text'][spam_data['target']==1].aggregate(len).mean()

print('Average length of documents (number of characters) for not spam documents is {:.2f} and for spam documents is {:.2f}.'
      .format(avg_len_not_spam, avg_len_spam))

Average length of documents (number of characters) for not spam documents is 71.02 and for spam documents is 138.87.


In [10]:
# function to combine new features into the training data
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add can also be a list of features.
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [11]:
tfidf_vect3 = TfidfVectorizer(min_df=5).fit(X_train)
X_train_vectorized_tfidf3 = tfidf_vect3.transform(X_train)

X_train_vectorized_tfidf3 = add_feature(X_train_vectorized_tfidf3, X_train.aggregate(len))

from sklearn.svm import SVC
model = SVC(C=10000).fit(X_train_vectorized_tfidf3, y_train)
X_test_vectorized = tfidf_vect3.transform(X_test)
X_test_vectorized = add_feature(X_test_vectorized, X_test.aggregate(len))
y_scores = model.decision_function(X_test_vectorized)

print('AUC: ', roc_auc_score(y_test, y_scores))

AUC:  0.9963202213809143


In [12]:
import re
mean_digits_not_spam = spam_data['text'][spam_data['target']==0].aggregate(lambda x: len(re.findall('\d',x))).mean()
mean_digits_spam = spam_data['text'][spam_data['target']==1].aggregate(lambda x: len(re.findall('\d',x))).mean()

print('The average number of digits per document for not spam documents is: {:.2f} and for spam documents is: {:.2f}'
      .format(mean_digits_not_spam, mean_digits_spam))

The average number of digits per document for not spam documents is: 0.30 and for spam documents is: 15.76


In [13]:
tfidf_vect4 = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
X_train_vectorized_tfidf4 = tfidf_vect4.transform(X_train)

X_train_number_of_digits = X_train.aggregate(lambda x: len(re.findall('\d', x)))
X_train_number_of_digits.name = 'digit_count'
X_train_doc_len = X_train.aggregate(len)
X_train_doc_len.name = 'length_of_doc'

X_train_vectorized_tfidf4 = add_feature(X_train_vectorized_tfidf4, [X_train_number_of_digits, X_train_doc_len])

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=100, max_iter=1000).fit(X_train_vectorized_tfidf4, y_train)

X_test_vectorized_tfidf4 = tfidf_vect4.transform(X_test)
X_test_number_of_digits = X_test.aggregate(lambda x: len(re.findall('\d', x)))
X_test_doc_len = X_test.aggregate(len)
X_test_vectorized_tfidf4 = add_feature(X_test_vectorized_tfidf4, [X_test_number_of_digits, X_test_doc_len])
y_proba4 = model.predict_proba(X_test_vectorized_tfidf4)[:,1]

print('AUC: ', roc_auc_score(y_test, y_proba4))

AUC:  0.9972964025601413


In [14]:
avg_non_words_not_spam = spam_data['text'][spam_data['target']==0].agg(lambda x: len(re.findall('\W', x))).mean()
avg_non_words_spam = spam_data['text'][spam_data['target']==1].agg(lambda x: len(re.findall('\W', x))).mean()

print('The average number of non-word characters (anything other than a letter, digit or underscore) per document for not spam documents is: {:.2f} and for spam documents is: {:.2f} '
      .format(avg_non_words_not_spam, avg_non_words_spam))

The average number of non-word characters (anything other than a letter, digit or underscore) per document for not spam documents is: 17.29 and for spam documents is: 29.04 


In [15]:
count_vect2 = CountVectorizer(min_df=5, ngram_range=(2,5), analyzer='char_wb').fit(X_train.iloc[:2000])
X_train_count_vectorized2 = count_vect2.transform(X_train.iloc[:2000])

X_train_non_words = X_train.agg(lambda x: len(re.findall('\W',x)))
X_train_non_words.name = 'non_word_char_count'

X_train_count_vectorized2 = add_feature(X_train_count_vectorized2, [X_train_doc_len.iloc[:2000],
                                                                    X_train_number_of_digits.iloc[:2000],
                                                                    X_train_non_words.iloc[:2000]])

model = LogisticRegression(C=100,max_iter=1000).fit(X_train_count_vectorized2, y_train.iloc[:2000])

X_test_count_vectorized2 = count_vect2.transform(X_test)
X_test_non_words = X_test.agg(lambda x: len(re.findall('\W', x)))
X_test_count_vectorized2 = add_feature(X_test_count_vectorized2, [X_test_doc_len,
                                                                  X_test_number_of_digits,
                                                                  X_test_non_words])
y_proba_count2 = model.predict_proba(X_test_count_vectorized2)[:,1]
print('AUC: ', roc_auc_score(y_test, y_proba_count2))

AUC:  0.9975637913179294


In [16]:
feature_names_count2 = count_vect2.get_feature_names_out()
feature_names_count2 = np.append(feature_names_count2, ['length_of_doc', 'digit_count', 'non_word_char_count'])
feature_names_count2

coef_count_2 = pd.Series(model.coef_[0], index=feature_names_count2)

print('10 smallest coefficients from the model:\n{}\n'
      .format(coef_count_2.sort_index().sort_values(kind='stable').index[:10].tolist()))

print('10 largest coefficients from the model:\n{}'
      .format(coef_count_2.sort_index(ascending=False).sort_values(kind='stable').index[:-11:-1].tolist()))

10 smallest coefficients from the model:
['n ', ' i', 'at', 'he', ' m', '..', 'us', 'go', ' lo', ' bu']

10 largest coefficients from the model:
['digit_count', 'ne', ' st', 'co', 's ', 'xt', 'lt', 'xt ', ' ne', 'der']
