In [81]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np
import pandas as pd

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

In [82]:
import spacy
import en_core_web_sm
import re
from nltk.tokenize.regexp import RegexpTokenizer

In [83]:
# parse commandline arguments
op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=100000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")

print(__doc__)
op.print_help()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
  --no-minibatch        Use ordinary k-means algorithm (in batch mode).
  --no-idf              Disable Inverse Document Frequency feature weighting.
  --use-hashing         Use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions) to extract
                        from text.
  --verbose             Print progress reports inside k-means algorithm.


In [84]:
def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')


# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

In [88]:
df = pd.read_csv('../SmartEmailTracker/emaildataset.csv')
print(df.shape)
df.head()

(951, 7)


Unnamed: 0,From,To,Subject,Body,TransactionID,Payment Date,Class
0,Spike@DEUTSCHEBANK.com,Shaniece@CitiBankPune.com,Transaction no. 072558 is unresolved.,Sorry to inform that there has been only a par...,72558,04-02-2020,Pending
1,Stacy@HDFC.com,Rohan@CitiBankNewYork.com,Order for new Cheque book,"Good morning, I want to place an order for an ...",212096,29-05-2020,General
2,Zachary@HDFC.com,Rishabh@CitiBankPune.com,Required money acquired. Transaction 847047 is...,Hello! This is to inform you that I have recei...,847047,26-01-2020,Processing
3,Stacy@SBI.com,Shai@CitiBankHongKong.com,Asking for the details for transaction 746078,I request you to kindly send the status of my ...,746078,17-06-2019,Request
4,Angela@HDFC.com,Dipesh@CitiBankSingapore.com,Partial payment for transaction 535918,Hello!! Greetings for the day. Status of trans...,535918,18-02-2020,Pending


In [89]:
true_k = df.Class.unique().shape[0]

In [90]:
true_k

6

In [91]:
nlp_ = en_core_web_sm.load()

In [92]:
nlp = spacy.load('en')

In [93]:
my_stop = {"'d", "'ll", "'m", "'re", "'s", "'ve",'a','cc','subject','http', 'gbp', 'usd', 'eur', 'inr', 'cad',
           'thanks', "acc", "id", 'account', 'regards', 'hi', 'hello', 'thank you', 'greetings', 'about','above',
 'across','after','afterwards','against','alone','along','already','also','although','always','am','among',
 'amongst','amount','an','and','another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'no',
 'nobody',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 '‘d',
 '‘ll',
 '‘m',
 '‘re',
 '‘s',
 '‘ve',
 '’d',
 '’ll',
 '’m',
 '’re',
 '’s',
 '’ve'}

In [94]:
def clean(text):
    
    text=text.rstrip()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = " ".join([i for i in text.lower().split()])
    print(text)
    
    customize_stop_words = ["cc","subject","http", "gbp", "usd", "eur", "inr", "cad","thanks", "acc", "id", "account", "regards", "hi", "hello", "thank you", "greetings"]
    rem_stop = ["not", "wasn't", "hadn't", "won't", "can't", "didn't"]
    
    for w in customize_stop_words:
        nlp.vocab[w].is_stop = True
    
    for w in rem_stop:
        nlp.vocab[w].is_stop = False
    
    doc = nlp(text)
    
    normalized = " ".join(token.lemma_ for token in doc if not token.is_stop)
    
    doc = " ".join(token.orth_ for token in nlp(normalized) if not token.is_punct | token.is_space)
    return doc

In [95]:
print(clean("payment in id 1234 for amount 14859 GBP has been freezed"))

payment in id for amount gbp has been freezed
payment would freeze


In [96]:
for i in range(df.shape[0]):
    # merge subject and body strings
    df['Text_Data'] = (df['Subject'] + " " + df['Body'])

In [97]:
def converter(x):
    try:
        return ' '.join([x.lower() for x in str(x).split()])
    except AttributeError:
        return None  # or some other value

df['Text_Data'] = df['Text_Data'].apply(converter)

In [98]:
text_clean=[]

for i in range(df.shape[0]):
    text_clean.append(clean(df.loc[i]['Text_Data']))

transaction no is unresolved sorry to inform that there has been only a partial payment of amount gbp you are definitely going to receive the rest at a later point of time warm regards
order for new cheque book good morning i want to place an order for an multicity cheque book for account no having atleat leaves kindly send the same to the address mentioned in my account records
required money acquired transaction is in process hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
asking for the details for transaction i request you to kindly send the status of my transaction with id thanks and regards
partial payment for transaction hello greetings for the day status of transaction usd for account is pending i would be grateful if you could tell me the cause thanks a lot
payment done and transaction settled greetings i wanted to let you know that i have acknowledged the payment for transaction in response to your e

want to block account hey i think i lost my atm card today so can you please block my account or tell me a way to change my pin no i urgently need your reply thanks in advance
imploring update on transaction can you please tell me the amount transferred through transaction id thanks
urgent transaction ceased this is to notify you that my transaction has failed please reply to me with the cause as soon as possible
transaction stalled and payment not received this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
how to change pin no of atm card please reply this is very urgent i have to change the pin no for atm card associated with account no since it was stolen from me last night or suggest some other way to keep my money secure
seeking update on the status of transaction i need details of urgen

dealing with the transaction thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently in process thanks and regards
impetrating details for id i need details of urgently please provide the same on priority basis
abrupt closure of transaction with id this is to notify you that my transaction has failed please reply to me with the cause as soon as possible
handling the transaction after payment hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
seeking update on the status of transaction kindly provide a status update on at the earliest
add another contact no to my bank account kindly add the contact no to my bank account and set it as my default no to receive all alerts
soliciting information for id hey i would be really grateful if you could tell me the details of account no thanks
request to send details of transaction it would

handling the transaction after payment hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
upgrade to an account with more benefits send instructions to setup a premium account i currently have a normal account thanks in advance
the pending amount for transaction will reach you soon since my transaction is still pending i wanted to know if there is a problem with the paperwork from my side please let me know at the earliest thanks and regards
accepted payment transaction currently processing acknowledging the received payment for transaction it is finally in process kind regards
transaction no is unresolved since my transaction is still pending i wanted to know if there is a problem with the paperwork from my side please let me know at the earliest thanks and regards
partially paid the required amount for transaction hey sincere apologies for transferring a fraction of the amount we agreed on please be assured tha

asking for the details for transaction i need details of urgently please provide the same before the coming friday
transaction stalled and payment not received i have been your regular client and have followed the procedure for creating a transaction correctly still i received an email saying that the transaction has failed i would like to know the reasoning my transaction id is regards
received amount now being processed sincere greetings i am glad to tell you that i have finally been granted the pre approved amount through transaction it is now being processed i am grateful for your support warm regards
processing transaction having id hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
incomplete transaction since my transaction is still pending i wanted to know if there is a problem with the paperwork from my side please let me know at the earliest thanks and regards
handling the transaction after payment so h

processing transaction having id this is in response to your email notifying about the transferred amount i am acknowledging it and informing you that it is now in process
imploring update on transaction i need a status update on asap
transaction has failed greetings for the day i am confused why i have received an email stating the failure of my transaction please help me understand why this happened need funds urgently patiently waiting for you reply thanks a lot
urgently required update on transaction i need details of urgently please provide the same on priority basis
transaction no is unresolved there has been only a partial payment of amount inr assuring you that the rest will be paid later warm regards
urgent transaction ceased this is to notify you that my transaction has failed please reply to me with the cause as soon as possible
want to block account hey i am interested in buying a new citi card please let me know how to go about it thanks in advance
transaction stalled and 

asking for the details for transaction hey i would be really grateful if you could tell me the details of account no thanks
why has my transaction stopped hey i see my transaction with id has failed i think i did everything right please look into this and reply with the reason urgently thank you and regards
transaction stalled and payment not received i made a payment of rs and the money got dedcuted from my account still i received an email saying that the transaction has failed i would like to know the reasoning my transaction id is regards
received full payment for transaction no hey i am writing in reference to the transaction i was granted the aforementioned amount within the deadline sincere gratitude for such a quick response kind regards
abrupt closure of transaction with id greetings for the day i checked my inbox and found your email stating the failure of my transaction please help me understand why has failed need funds urgently patiently waiting for you reply thanks a lot


why cant i withdraw money with my atm card hey can you please send me the detailed steps that i ought to follow to create a new bank account
payment is pending for transaction hello greetings for the day status of transaction inr for account is pending i would be grateful if you could tell me the cause thanks a lot
imploring update on transaction can you please tell me the amount transferred through transaction id thanks
transaction is now being processed sincere greetings i am glad to tell you that i have finally been granted the pre approved amount through transaction it is now being processed i am grateful for your support warm regards
failure of transaction hey i see my transaction with id has failed i think i did everything right please look into this and reply with the reason urgently thank you and regards
processing transaction having id thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently 

transaction has failed this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
request to send details of transaction i request you to kindly send the status of my transaction with id thanks and regards
required money acquired transaction is in process hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
handling the transaction after payment sincere greetings i am glad to tell you that i have finally been granted the pre approved amount through transaction it is now being processed i am grateful for your support warm regards
completed transaction no i deeply appreciate your quick service as i have received the pre approved loan amount of inr
imploring update on transaction i request you to kindly send the status of my transac

transaction no is unresolved the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
change address for account no hey i think i lost my atm card today so can you please block my account or tell me a way to change my pin no i urgently need your reply thanks in advance
transaction having id has stopped help this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
completed transaction no i deeply appreciate your quick service as i have received the pre approved loan amount of usd
pending payment for transaction having id hello greetings for the day status of transaction inr for account is pending i would be grateful if you could tell me the cause thanks a lot
handling the transaction after paymen

urgently required update on transaction i need details of urgently please provide the same on priority basis
accepted payment transaction currently processing thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently in process thanks and regards
seeking update on the status of transaction kindly reply to me at the earliest with the last transactions made with the account no thanks in advance
partially paid the required amount for transaction hello greetings for the day status of transaction inr for account is pending i would be grateful if you could tell me the cause thanks a lot
why has my transaction stopped this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
required money acquired transaction is in proces

request to send details of transaction i need details of urgently please provide the same on priority basis
the pending amount for transaction will reach you soon the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
urgently required update on transaction i request you to kindly send the status of my transaction with id thanks and regards
urgent transaction ceased hey i see my transaction with id has failed i think i did everything right please look into this and reply with the reason urgently thank you and regards
request to send details of transaction hey i would be really grateful if you could tell me the details of account no thanks
transaction has failed i have been your regular client and have followed the procedure for creating a transaction correctly still i received an email saying that the transaction has failed i would like to know the reasoning my transaction id is re

pending payment for transaction having id the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
how to change pin no of atm card kindly add the contact no to my bank account
imploring update on transaction i need details of urgently please provide the same on priority basis
transaction is complete i deeply appreciate your quick service as i have received the pre approved loan amount of inr
transaction is now being processed thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently in process thanks and regards
dealing with the transaction hello this is to inform you that i have received the amount you transferred to my account and now it is currently in process
processing transaction having id sincere greetings i am glad to tell you that i have finally been granted the pre approved amount through 

transaction stalled and payment not received hey i see my transaction with id has failed i think i did everything right please look into this and reply with the reason urgently thank you and regards
payment outstanding for transaction hey sincere apologies for transferring a fraction of the amount we agreed on please be assured that the rest is being transferred as of now thanks a lot for understanding
payment is pending for transaction hello greetings for the day status of transaction eur for account is pending i would be grateful if you could tell me the cause thanks a lot
concluded the transaction to whom it may concern i have successfully received payment for the transaction i am grateful for your cooperation thank you so much and regards
transaction stalled and payment not received i have been your regular client and have followed the procedure for creating a transaction correctly still i received an email saying that the transaction has failed i would like to know the reasoning m

required money acquired transaction is in process this is in response to your email notifying about the transferred amount i am acknowledging it and informing you that it is now in process
dealing with the transaction this is in response to your email notifying about the transferred amount i am acknowledging it and informing you that it is now in process
transaction stalled and payment not received this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
why cant i withdraw money with my atm card i need a new cheque book of leaves kindly send the same to my current address which is present in the details of my account
asking for the details for transaction i request you to kindly send the status of my transaction with id thanks and regards
transaction is complete hello sincere greetings for the day

payment received for transaction and now processing thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently in process thanks and regards
urgently required update on transaction urgently require details of acc reply asap
asking for the details for transaction urgently require details of acc reply asap
payment is pending for transaction the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
transaction is complete to whom it may concern i have successfully received payment for the transaction i am grateful for your cooperation thank you so much and regards
fulfilled transaction having id to whom it may concern i have successfully received payment for the transaction i am grateful for your cooperation thank you so much and regards
soliciting information for id kindly reply to me at the earliest wit

incomplete transaction there has been only a partial payment of amount inr assuring you that the rest will be paid later warm regards
soliciting information for id hey i would be really grateful if you could tell me the details of account no thanks
seeking update on the status of transaction can you please tell me the amount transferred through transaction id thanks
received full payment for transaction no hello sincere greetings for the day i would like to inform you that my transaction has completed thank you so much for your support looking forward to working more with you in the future regards
concluded the transaction i deeply appreciate your quick service as i have received the pre approved loan amount of gbp
soliciting information for id can you please tell me the amount transferred through transaction id thanks
want to block account i would like to know the instructions to be followed to install and setup mobile banking please reply to me with the same thanks in advance
request

partially paid the required amount for transaction the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
incomplete transaction the transaction is taking too long to complete i would request you to kindly guide me through the further steps to be taken in order to complete the transaction
pending payment for transaction having id hey sincere apologies for transferring a fraction of the amount we agreed on please be assured that the rest is being transferred as of now thanks a lot for understanding
why has my transaction stopped this is to notify you that my transaction has failed please reply to me with the cause as soon as possible
transaction stalled and payment not received i have been your regular client and have followed the procedure for creating a transaction correctly still i received an email saying that the transaction has failed i would like to know the reasoning my tran

asking for the details for transaction can you please tell me the amount transferred through transaction id thanks
handling the transaction after payment thank you for transferring the payment to my account yesterday i appreciate the quick response i would like to inform you that is is currently in process thanks and regards
abrupt closure of transaction with id this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
request to send details of transaction i request you to kindly send the status of my transaction with id thanks and regards
rectify mistake i am holding a savings account in our bank for the past four years i got my pass book updated recently to file my income tax return but the entries from the period of th jan to th feb were missing kindly rectify this error and issue me a fresh pri

soliciting information for id can you please tell me the amount transferred through transaction id thanks
soliciting information for id kindly reply to me at the earliest with the last transactions made with the account no thanks in advance
failure of transaction i am clueless as to why i received an email saying that my transaction has failed i would like to know the reasoning my transaction id is regards
incomplete transaction there has been only a partial payment of amount usd it is my gurantee that the rest will be paid later warm regards
failure of transaction this is in response to your email stating that my transaction having id has failed but no reason was mentioned can you please tell me what did i do wrong so that i can create a new one without any errors waiting for your reply thank you in advance
fulfilled transaction having id hey i am writing in reference to the transaction i was granted the aforementioned amount within the deadline sincere gratitude for such a quick resp

In [99]:
df['Text_Data'] = text_clean

In [100]:
df.head()

Unnamed: 0,From,To,Subject,Body,TransactionID,Payment Date,Class,Text_Data
0,Spike@DEUTSCHEBANK.com,Shaniece@CitiBankPune.com,Transaction no. 072558 is unresolved.,Sorry to inform that there has been only a par...,72558,04-02-2020,Pending,transaction unresolved sorry inform partial pa...
1,Stacy@HDFC.com,Rohan@CitiBankNewYork.com,Order for new Cheque book,"Good morning, I want to place an order for an ...",212096,29-05-2020,General,order new cheque book good morning want place ...
2,Zachary@HDFC.com,Rishabh@CitiBankPune.com,Required money acquired. Transaction 847047 is...,Hello! This is to inform you that I have recei...,847047,26-01-2020,Processing,require money acquire transaction process info...
3,Stacy@SBI.com,Shai@CitiBankHongKong.com,Asking for the details for transaction 746078,I request you to kindly send the status of my ...,746078,17-06-2019,Request,ask detail transaction request kindly send sta...
4,Angela@HDFC.com,Dipesh@CitiBankSingapore.com,Partial payment for transaction 535918,Hello!! Greetings for the day. Status of trans...,535918,18-02-2020,Pending,partial payment transaction day status transac...


In [101]:
df['Text_Data'][0]

'transaction unresolved sorry inform partial payment definitely go receive rest later point time warm'

In [102]:
# from sklearn.preprocessing import OneHotEncoder 
# le = OneHotEncoder()
# classes = df['Class'].values

# classes = classes.reshape(-1, 1)
# labels = le.fit_transform(classes)
# labels = labels.toarray()

In [103]:
labels = list(df['Class'].values)

In [104]:
labels[0]

'Pending'

In [106]:
print("Extracting features from the training dataset using a sparse vectorizer")

t0 = time()

if opts.use_hashing:
    if opts.use_idf:
        # Perform an IDF normalization on the output of HashingVectorizer
        hasher = HashingVectorizer(n_features=opts.n_features, stop_words=my_stop, alternate_sign=False, norm=None)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    
    else:
        vectorizer=HashingVectorizer(n_features=opts.n_features, stop_words=my_stop, 
                                     alternate_sign=False, norm='l2')

else:       
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
                                 min_df=2, stop_words=my_stop, use_idf=opts.use_idf)
    

X = vectorizer.fit_transform(text_clean)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are 
    # not normalized, we have to redo the normalization.
    
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(
        int(explained_variance * 100)))

    print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.041258s
n_samples: 951, n_features: 212



  'stop_words.' % sorted(inconsistent))


In [108]:
# Do the actual clustering

if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=5, batch_size=5, verbose=opts.verbose)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()


if not opts.use_hashing:
    print("Top terms per cluster:")

    if opts.n_components:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

Clustering sparse data with MiniBatchKMeans(batch_size=5, compute_labels=True, init='k-means++',
        init_size=5, max_iter=100, max_no_improvement=10, n_clusters=6,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.139s

Homogeneity: 0.341
Completeness: 0.569
V-measure: 0.426
Adjusted Rand-Index: 0.125
Silhouette Coefficient: 0.092

Top terms per cluster:
Cluster 0: process currently transfer inform payment receive acknowledge response finally kind
Cluster 1: pay partial rest understanding assure remain later payment regret soon
Cluster 2: fail kindly detail reply complete request tell know status update
Cluster 3: finalized confirm want let acknowledge status know response email payment
Cluster 4: wrong mention error reason state new create advance wait fail
Cluster 5: priority basis detail provide urgently need update request send require


  init_size=init_size)
