In [14]:
%%writefile find_signature.py
# %load find_signature.py
#!/usr/bin/python

import pickle
import numpy
numpy.random.seed(42)


### the words (features) and authors (labels), already largely processed
### these files should have been created from the previous (Lesson 10) mini-project.
words_file = "../MP10 Text Learning/data/your_word_data.pkl"
authors_file = "../MP10 Text Learning/data/your_email_authors.pkl"

### NOTE: pickle.load can execute arbitrary code -- only load pickles you created yourself.
### The files are opened with the same mode ("r") as before; the "with" blocks
### make sure both handles are closed as soon as the data has been read.
with open(words_file, "r") as words_handle:
    word_data = pickle.load(words_handle)
with open(authors_file, "r") as authors_handle:
    authors = pickle.load(authors_handle)



### test_size is the fraction of events assigned to the test set (remainder go into training)
### feature matrices changed to dense representations for compatibility with classifier
### functions in versions 0.15.2 and earlier
### train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and the
### old cross_validation module was removed in 0.20, so try the new location first
### and fall back to the old one on older installs
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
### fit the vectorizer on the training emails only, then reuse it for the test emails
features_train = vectorizer.fit_transform(features_train)
features_test  = vectorizer.transform(features_test).toarray()


### overfitting is easy to provoke with few data points and
### many features; keeping only the first 150 training events
### puts us in that regime
n_train_events = 150
labels_train   = labels_train[:n_train_events]
features_train = features_train[:n_train_events].toarray()




### your code goes here
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)

### print(...) with a single argument behaves the same on Python 2 and Python 3;
### the prompts use plain ASCII apostrophes because a Python 2 source file
### without an encoding declaration may not contain non-ASCII characters
print("What's the accuracy of the decision tree you just made?")
score = clf.score(features_test, labels_test)
print(score)

### rank the features from most to least important
importances = clf.feature_importances_
indices = numpy.argsort(importances)[::-1]
print(" What's the importance of the most important feature? What is the number of this feature?")
### show the top five (or fewer, if there are not that many features)
for i in range(min(5, len(indices))):
    print("{} feature {} ({})".format(i+1, indices[i], importances[indices[i]]))

Overwriting find_signature.py


In [15]:
%%writefile parse_out_email_text.py
# %load parse_out_email_text.py
#!/usr/bin/python

import string

from nltk.stem.snowball import SnowballStemmer

def parseOutText(f):
    """ given an opened email file f, parse out all text below the
        metadata block at the top, strip punctuation, stem every word
        and return a string that contains all the stemmed words
        in the email (each word followed by a single space)

        the body is everything after the first "X-FileName:" marker;
        if the marker is absent an empty string is returned

        example use case:
        f = open("email_file_name.txt", "r")
        text = parseOutText(f)

        """

    f.seek(0)  ### go back to beginning of file (annoying)
    all_text = f.read()

    ### split off metadata; maxsplit=1 keeps the whole body even when the
    ### marker shows up again further down (e.g. inside a quoted message)
    content = all_text.split("X-FileName:", 1)
    if len(content) < 2:
        return ""

    ### remove punctuation
    if hasattr(str, "maketrans"):
        ### Python 3: the characters to delete are part of the translation table
        text_string = content[1].translate(str.maketrans("", "", string.punctuation))
    else:
        ### Python 2: identity table plus a separate deletechars argument
        text_string = content[1].translate(string.maketrans("", ""), string.punctuation)

    ### split the text string into individual words, stem each word,
    ### and emit every stemmed word followed by a single space
    stemmer = SnowballStemmer("english")
    return "".join(stemmer.stem(word) + " " for word in text_string.split())

    

def main():
    """ run parseOutText on the sample email and print the extracted text """
    ### "with" closes the file once the text has been extracted
    with open("data/test_email.txt", "r") as ff:
        text = parseOutText(ff)
    ### single-argument print(...) gives identical output on Python 2 and 3
    print(text)



if __name__ == '__main__':
    main()

Overwriting parse_out_email_text.py
