In [1]:
# understand matrix structure of vectorizer
from sklearn.feature_extraction.text import CountVectorizer

words = ['one two three','one four five','two three six']
vectorizer = CountVectorizer()

vectorizer.fit(words)

print(vectorizer.vocabulary_)

{'one': 2, 'two': 5, 'three': 4, 'four': 1, 'five': 0, 'six': 3}


In [2]:
sample = vectorizer.transform([words[0]])

print(sample.toarray())

[[0 0 1 0 1 1]]


In [3]:
from os import listdir, getcwd, chdir
from os.path import isfile, join, dirname, realpath
import pandas as pd

def get_cwd():
    try:
        chdir(dirname(realpath(__file__)))
    except:
        chdir('D:\Projects\MSDS-7333-QTW')

    active_dir = getcwd()
       
    return active_dir

def main():
    
    get_cwd()
    
    directories = [
            'easy_ham',
            'easy_ham_2',
            'hard_ham',
            'spam',
            'spam_2'
        ]
    
    res_frame = pd.DataFrame()

    # *dc - Added to keep a collection of email text
    emails = []
        
    for d in directories:
        mypath = getcwd() + '/SpamAssassinMessages/' + d + '/'
        onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    
        try:
            onlyfiles.remove('.DS_Store')
        except:
            pass
        
        for file in onlyfiles:
            with open(mypath + file, encoding='latin1') as f:
                lines = f.readlines()
                f.close()
                
            in_reply_count = 0
            sub_line_all_caps = 0
            attachments = 0
            subject_line = []
            n_lines = 0
            blank_lines = []
            
            for line in lines:

                n_lines += 1
                if "Subject: Re: " in line:
                   in_reply_count += 1
                if "Subject: " in line:
                   s_line = line.strip().replace('Subject: ','')
                   s_line = ''.join(e for e in s_line if e.isalnum())
                   num_upper = sum(1 for c in s_line if c.isupper())
                   ttl_chars = len(s_line)
                   if num_upper == ttl_chars:
                       sub_line_all_caps += 1
                   subject_line.append(s_line)
                if "content-type: multipart" in line.lower():
                   attachments += 1
                if line == "\n":
                   blank_lines.append(n_lines)
        
            temp_frame = pd.DataFrame({
                        'directory':d,    
                        'filename':file,
                        'is_spam':['Y' if 'spam' in d else 'N'],
                        'in_reply': ['Y' if in_reply_count > 0 else 'N'], 
                        'subj_caps': ['Y' if sub_line_all_caps > 0 else 'N'], 
                        'attachments': ['Y' if attachments > 0 else 'N'],
                        ## *dc+3 
                        #'body_lines': n_lines - min(blank_lines)
                        'body_lines': [0 if len(blank_lines) == 0 else min(blank_lines)]
                        }, index=[0])
           
            res_frame = res_frame.append(temp_frame, ignore_index=True)

            ## *dc+2
            # append body of email to collection
            text = ' '.join(lines)
            emails.append(text)
            
    #res_frame.to_csv('output_file.csv', index=False)
    
    ## *dc - add emails
    return res_frame, emails

## *dc - Working from a notebook instead of py file.
df, emails = main()
# ########################################
# ##### Main Function
# ########################################    
# if __name__ == "__main__":
#     res_frame, emails = main()
#     pass   

In [4]:
print(len(df),len(emails))

print(emails[0])

9353 9353
From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002
 Return-Path: <exmh-workers-admin@spamassassin.taint.org>
 Delivered-To: zzzz@localhost.netnoteinc.com
 Received: from localhost (localhost [127.0.0.1])
 	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36
 	for <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)
 Received: from phobos [127.0.0.1]
 	by localhost with IMAP (fetchmail-5.9.0)
 	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)
 Received: from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by
     dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for
     <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100
 Received: from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by
     listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002
     07:35:02 -0400 (EDT)
 Delivered-To: exmh-workers@listman.spamassassin.taint.org
 R

In [5]:
import nltk
import re 
import numpy as np

stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lowercase and remove special characters to form a normalized document
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()

    # tokenize document
    tokens = nltk.word_tokenize(doc)
    
    # filter out stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Remove numbers
    filtered_tokens = [token for token in filtered_tokens if not token.isdigit()]

    # Remove short tokens
    filtered_tokens = [token for token in filtered_tokens if len(token) > 2]

    # stem tokens - Skipping for now
    #filtered_tokens = [stemming.stem(token) for token in filtered_tokens]

    # re-create a normalized document
    doc = ' '.join(filtered_tokens)
    return doc

normalize_text = np.vectorize(normalize_document)
norm_text = normalize_text(emails)

print(type(norm_text),len(norm_text))

<class 'numpy.ndarray'> 9353


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_df=.8, stop_words=stop_words, norm='l2')
tf_matrix = tf.fit_transform(norm_text)

print(tf_matrix.shape)

(9353, 166742)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0, max_df=1., stop_words=stop_words)
cv_matrix = cv.fit_transform(norm_text)

print(cv_matrix.shape)

In [7]:
print(df.columns)

Index(['directory', 'filename', 'is_spam', 'in_reply', 'subj_caps',
       'attachments', 'body_lines'],
      dtype='object')


In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

NUM_CLUSTERS = 5
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(tf_matrix)
km

df['kmeans_cluster'] = km.labels_

email_clusters = (df[['directory', 'kmeans_cluster']]
                  .sort_values(by=['kmeans_cluster'], 
                               ascending=False)
                  .groupby('kmeans_cluster').head(20))  # top 20 movies for each cluster
email_clusters = email_clusters.copy(deep=True)

feature_names = tf.get_feature_names()
topn_features = 50
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]

sample_silhouette_values = silhouette_samples(tf_matrix, km.labels_)

# get key features for each cluster
for cluster_num in range(NUM_CLUSTERS):

    cluster_silhouette_values = sample_silhouette_values[km.labels_ == cluster_num]

    key_features = [feature_names[index] 
                        for index in ordered_centroids[cluster_num, :topn_features]]
    print('CLUSTER #'+str(cluster_num+1), ":", cluster_silhouette_values.mean())
    print('Cluster Size', cluster_silhouette_values.shape[0])
    print('Key Features:', key_features)
    print('-'*80)

CLUSTER #1 : 0.2410520400548465
Cluster Size 771
Key Features: ['sourceforge net', 'sourceforge', 'net', 'razor', 'razor users', 'example sourceforge net', 'example sourceforge', 'spamassassin talk', 'talk', 'lists', 'spamassassin', 'lists sourceforge net', 'lists sourceforge', 'usw', 'users', 'example', 'spamassassin devel', 'devel', 'sourceforge net subject', 'aug', 'list', 'sf', 'list1 sourceforge', 'list1 sourceforge net', 'list1', 'admin example sourceforge', 'usw list1 sourceforge', 'usw list1', 'sourceforge net lists', 'net lists', 'lists listinfo', 'net lists listinfo', 'https', 'admin example', 'mailto spamassassin', 'net subject', 'net usw', 'talk admin', 'mailto spamassassin talk', 'talk example', 'talk example sourceforge', 'spamassassin talk admin', 'received usw', 'sourceforge net usw', 'thu', 'spamassassin talk example', 'list2 sourceforge', 'list2', 'list2 sourceforge net', 'usw list2 sourceforge']
------------------------------------------------------------------------

In [13]:
## Output a quick pivot table to see distribution of clusters vs spam/ham

df.pivot_table(index='kmeans_cluster', columns='is_spam', values='directory', aggfunc='count')

is_spam,N,Y
kmeans_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,748.0,23.0
1,3158.0,2214.0
2,692.0,110.0
3,1698.0,52.0
4,658.0,
