In [2]:
import re
import numpy as np
import tflearn
from tflearn.data_utils import to_categorical
import nltk
from nltk.corpus import stopwords
from contractions import CONTRACTION_MAP
import string
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from enchant.checker import SpellChecker


# Create the data corpus.
# cleanNotes.txt is read as one note per line (trailing newlines are kept),
# and labels.txt as one integer label per line. The files are presumably
# parallel (line i of each describes the same sample) -- TODO confirm.
# Context managers make sure both file handles are closed after reading.
with open("cleanNotes.txt") as notes_file:
    notes = list(notes_file)

with open("labels.txt") as labels_file:
    # int() ignores surrounding whitespace, so the trailing newline needs no
    # explicit stripping.
    labels = [int(label) for label in labels_file]

# Convert the integer labels into 2-class categorical form for the network.
labels = to_categorical(labels, 2)



def stemWord(note):
    """Return ``note`` with every token replaced by its Porter stem.

    The note is split with ``tokenize_text``; each token is passed through
    ``PorterStemmer.stem`` and the results are re-joined with single spaces.
    """
    words = tokenize_text(note)
    stem = PorterStemmer().stem
    return " ".join(map(stem, words))
    
def lemmatize(words):
    """Return the text in ``words`` with every token lemmatized as a verb.

    Args:
        words: The note text to process. Despite the name, this is a single
            string: it is handed to ``tokenize_text`` for splitting.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    # Tokenize the argument itself. The previous body referenced ``note``,
    # a name that is not a parameter of this function.
    tokens = tokenize_text(words)
    lemmatizer = WordNetLemmatizer()
    # pos='v' asks WordNet to treat every token as a verb.
    return " ".join(lemmatizer.lemmatize(token, pos='v') for token in tokens)

# normalize corpus
def expand_contractions(text, contraction_mapping):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Args:
        text: The string to process.
        contraction_mapping: Dict mapping a contraction (e.g. ``"can't"``) to
            its expansion (e.g. ``"cannot"``). Matching is case-insensitive.

    Returns:
        ``text`` with every known contraction replaced by its expansion and
        all leftover ``'`` characters removed. The first character of each
        matched contraction is kept, so its capitalisation is preserved
        ("Can't" -> "Cannot").
    """
    # Ignore empty keys: an empty alternative would match at every position.
    keys = [key for key in (contraction_mapping or ()) if key]
    if keys:
        # Longest keys first so "can't've" is not consumed as "can't" + "'ve";
        # escape them so they are matched literally, not as regex syntax.
        keys.sort(key=len, reverse=True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in keys)),
            flags=re.IGNORECASE | re.DOTALL,
        )
        # Case-insensitive fallback table for keys that are not all lowercase
        # (e.g. "I'd" matched against "i'd").
        lowered_mapping = {
            key.lower(): contraction_mapping[key] for key in keys
        }

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = (
                contraction_mapping.get(match)
                or lowered_mapping.get(match.lower())
            )
            if not expanded_contraction:
                # No usable expansion: leave the matched text untouched.
                return match
            # Keep the original first character to preserve capitalisation.
            return match[0] + expanded_contraction[1:]

        text = contractions_pattern.sub(expand_match, text)

    # Drop whatever apostrophes are left (possessives, unknown contractions).
    return text.replace("'", "")
    
def tokenize_text(text):
    """Split ``text`` into a list of tokens.

    Tokenization is delegated to ``nltk.word_tokenize``; each resulting token
    then has leading and trailing whitespace stripped.
    """
    return [piece.strip() for piece in nltk.word_tokenize(text)]
    
    
def remove_special_chars(note):
    """Return ``note`` with all ``string.punctuation`` characters removed.

    The note is tokenized with ``tokenize_text``, punctuation characters are
    dropped from each token, and the tokens are re-joined with single spaces.
    A token made up only of punctuation becomes an empty string but still
    takes part in the join.
    """
    punctuation = string.punctuation
    cleaned_tokens = (
        ''.join(ch for ch in token if ch not in punctuation)
        for token in tokenize_text(note)
    )
    return " ".join(cleaned_tokens)
        
def remove_stopwords(note):
    """Return ``note`` with English stopwords removed.

    Args:
        note: The note text to filter.

    Returns:
        The tokens of ``note`` (as produced by ``tokenize_text``) that are
        not in NLTK's English stopword list, joined with single spaces.
        Membership is an exact, case-sensitive comparison.
    """
    tokens = tokenize_text(note)
    # Load the stopword list once per call and use a set for O(1) lookups,
    # instead of re-reading the list for every token.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(
        token for token in tokens if token not in english_stopwords
    )
    
def correct_spell(text):
    """Return ``text`` with each misspelled word replaced by its top suggestion.

    Args:
        text: The string to spell-check.

    Returns:
        The corrected text. Words for which the checker offers no suggestion
        are left unchanged.
    """
    # Only the language tag is passed. SpellChecker's second positional
    # parameter is the text to check, not an additional language, and it is
    # replaced by set_text() below anyway.
    chkr = SpellChecker("en-US")
    chkr.set_text(text)
    for err in chkr:
        # Ask for suggestions once per error; the lookup is not free.
        suggestions = err.suggest()
        if suggestions:
            err.replace(suggestions[0])

    return chkr.get_text()

def normalize(notes):
    """Normalize every note in ``notes`` and return the results as a list.

    Steps applied to each note, in order:
      1. expand contractions using ``CONTRACTION_MAP``
      2. lowercase
      3. remove punctuation characters
      4. remove English stopwords

    Spell correction (``correct_spell``, before lowercasing) and
    stemming/lemmatization (``stemWord`` / ``lemmatize``, after stopword
    removal) exist in this file but are currently not part of the pipeline.
    """
    def clean(raw_note):
        expanded = expand_contractions(raw_note, CONTRACTION_MAP)
        without_punctuation = remove_special_chars(expanded.lower())
        return remove_stopwords(without_punctuation)

    return [clean(raw_note) for raw_note in notes]
    

# Run the text-normalization pipeline over the raw notes.
normalized_notes = normalize(notes)
# Debug aid (Python 2 print syntax): compare the number of notes and labels.
#print len(normalized_notes)
#print len(labels)

# Build the feature matrix: unigram TF-IDF over the normalized notes,
# keeping every term (min_df=1), converted to a dense float array.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,1))
feature_matrix = vectorizer.fit_transform(normalized_notes).astype(float).toarray()

# Number of distinct terms learned by the vectorizer; used as the width of
# the network's input layer.
vocab_size = len(vectorizer.vocabulary_)

# Split data 70:30 into train/test, stratified on the labels, with a fixed
# random_state so the split is reproducible.
x_train , x_test , y_train , y_test = train_test_split(feature_matrix, labels, test_size=0.3, random_state=21, stratify=labels)

# Set up the neural network: input -> two fully connected layers of 30 units
# -> 2-unit softmax output, wrapped in a regression (training) layer.
# The default graph is reset first so re-running this cell starts clean.
# NOTE(review): no activation is passed to the two hidden layers; tflearn's
# default appears to be linear -- confirm that is intended.
tf.reset_default_graph()
net = tflearn.input_data(shape=[None, vocab_size])
net = tflearn.fully_connected(net , 30)
net = tflearn.fully_connected(net , 30)
net = tflearn.fully_connected(net , 2 , activation='softmax')
net = tflearn.regression(net)

# Train for 10 epochs with mini-batches of 25, reporting metrics during
# training.
model = tflearn.DNN(net)
model.fit(x_train, y_train, n_epoch=10, batch_size=25, show_metric=True)

# Evaluate on the held-out test split. The return value is not stored; it is
# only visible when this is the last expression of a notebook cell.
model.evaluate(x_test, y_test)


Training Step: 89  | total loss: 0.29977 | time: 0.058s
| Adam | epoch: 010 | loss: 0.29977 - acc: 0.8798 -- iter: 200/209
Training Step: 90  | total loss: 0.28263 | time: 0.066s
| Adam | epoch: 010 | loss: 0.28263 - acc: 0.8918 -- iter: 209/209
--


[0.7222222089767456]