This code was originally written by Bryce Steslicki as part of an exploration into using AI to classify spend.


To Do:
- Use NLTK to refine the text
- Incorporate the text cleaning into the pipeline<br><b>I wrote code to lemmatize the data before running it into the pipeline. It took too long to run it as part of the pipeline, probably because I didn't know what I was doing.</b>

- Incorporate other features into the model, like price or customer name/department
- More data and more classes
- visualization
- add the tfidf Transform
<b>Did this and it seems to improve the results slightly.</b>
- add code to identify mis-classified training data
- add a title and documentation paragraph to remind us what the hell this is and why we did it.
- find metrics regarding variance vs. bias.

In [None]:
## Import all necessary modules and define our test and training sets.
import platform
import re
from pprint import pprint
from time import time

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

#Report the interpreter and library versions this notebook was run with,
#so results can be tied back to a specific environment.
for label, version in (
    ('Python version:', platform.python_version()),
    ('pandas version:', pd.__version__),
    ('Numpy version:', np.version.version),
    ('Scikit-learn version:', sklearn.__version__),
):
    print(label, version)

#Set the graphing setting
%matplotlib inline

In [None]:
#Import the data and format it -- engine='c' makes it faster.
description = pd.read_table('DataSets/Descriptions.txt', header=None, names=['Description','Class'], engine='c')

#Runs of non-word characters, whitespace and ASCII digits; each run is
#replaced by a single space. Compiled once, as a raw string, so that the
#backslash escapes reach the regex engine untouched.
NON_ALPHA_RE = re.compile(r'[\W\s0-9]+')

wordnet_lemmatizer = WordNetLemmatizer()


def clean_description(text):
    """Return *text* stripped of non-alpha characters, lower-cased and lemmatized.

    The regex is applied with the standard-library ``re`` module rather than
    ``Series.str.replace`` because the default regex/literal interpretation
    of ``str.replace`` differs between pandas releases; going through ``re``
    gives the same cleaning on every version.

    Values that are not strings (for example a missing description read in
    as NaN) are returned as an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''
    lowered = NON_ALPHA_RE.sub(' ', text).lower()
    #split()/join also collapses any leading, trailing or repeated spaces
    return ' '.join(wordnet_lemmatizer.lemmatize(word) for word in lowered.split())


#remove nonAlpha characters, set to lower case and lemmatize in one pass
description['Clean'] = description['Description'].apply(clean_description)

#Print out an example of the data
description.head(2)


In [None]:
#Hold out 40% of the rows for testing; the fixed random_state makes the
#split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    description['Clean'],
    description['Class'],
    test_size=0.4,
    random_state=0,
)


#With clean text available, chain the feature extraction steps and the
#classifier into a single estimator.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
    #Other classifiers that can be swapped in for the 'clf' step:
    #('clf', MultinomialNB())
    #('clf', AdaBoostClassifier())
    #('clf', SVC())
])

#Candidate settings for the grid search. Each key is written as
#"<pipeline step name>__<parameter name>" and maps to the values to try.
parameters = {
    "vect__max_df": [0.25, 0.5],
    "vect__min_df": [0.00025, 0.0005],
    "vect__analyzer": ["word"],
    "vect__stop_words": ["english"],
    "vect__max_features": [500, 1000, 2000],
    "clf__n_estimators": [25, 100, 250],  #RandomForest
    #Settings for the alternative classifiers (enable with the matching 'clf' step):
#    'clf__alpha': [.50,.75,1], #MultinomialNB
#    'clf__fit_prior': [True,False] #MultinomialNB
#    'clf__C': [.25, .5, .75, 1],
#    'clf__kernel': ['linear'],
#    'clf__probability': [True]
}

#Fit the grid search on the training split and report how the best model did
if __name__ == "__main__":
    # Try every combination in `parameters` to find the best settings for
    # both the feature extraction steps and the classifier.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        n_jobs=3,
        verbose=1,
    )

    step_names = [step[0] for step in pipeline.steps]
    print("Performing grid search...")
    print("pipeline:", step_names)
    print("parameters:")
    pprint(parameters)

    # Time only the fit itself.
    started_at = time()
    grid_search.fit(X_train, y_train)
    elapsed = time() - started_at
    print("done in %0.3fs" % elapsed)
    print()

    print("Best training score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    # Only report the parameters that were actually searched over.
    for param_name in sorted(parameters):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print()

    # Score the refit best model on the held-out test split.
    test_accuracy = grid_search.score(X_test, y_test)
    print('Testing Set Accuracy Score: %0.3f' % test_accuracy)

In [None]:
# Collect the held-out descriptions with their actual and predicted classes.
# NOTE(review): `output` was not defined anywhere in the visible notebook, so
# this cell raised NameError. The columns below are an assumption about what
# was meant to be exported -- confirm before relying on the file layout.
output = pd.DataFrame(
    {
        'Clean': X_test,
        'Class': y_test,
        'Predicted': grid_search.predict(X_test),
    },
    columns=['Clean', 'Class', 'Predicted'],  # fix the column order explicitly
)

# Use pandas to write the comma-separated output file.
# quoting=3 is csv.QUOTE_NONE: fields are never quoted, and any delimiter
# inside a field is escaped with a backslash instead.
output.to_csv( "Bag_of_Words_model.csv", index=False, quoting=3, escapechar='\\' )