# Regex-ML Hybrid Classifier Model


### An attempt at combining regex searches for targetted patterns with ML models

Starting with a simple majority vote idea -- votes from DT, SGD, and regex classifiers. 
ML models pushed through pipelines to get best params, then test set is classified with best estimator given by randomizedSearchCV.
As of now, each classifiers vote is weighted equally. Would like to tweak this, as well as give more weight to specific regex patterns over others

In [13]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('maxent_treebank_pos_tagger')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

import pandas as pd
import numpy as np
import scipy 
import sklearn
import random
from pprint import pprint
import string
import re


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_treebank_pos_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aubrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
def getData():
    
    data = pd.read_csv("BookReviews.csv", encoding = 'latin1')
    
    data = data.loc[:, ~data.columns.str.contains('^Unnamed')]  #had several (about 4) columns titles "Unnamed"
    transltr = str.maketrans('', '', string.punctuation)
    for i in range(data['text'].size):
        data['text'].iloc[i] = data['text'].iloc[i].lower()
        data['text'].iloc[i] = data['text'].iloc[i].replace("%", "PERCENT")
        data['text'].iloc[i] = data['text'].iloc[i].translate(transltr)
        if data['label'].iloc[i] == "i":
            data['label'].iloc[i] = 1
        else: 
            data['label'].iloc[i] = 0
            
    data = data.sample(frac = 1).reset_index(drop = True)
    return data

In [15]:
def partitionData(df):
    
    txtSeries = df.loc[:,"cleanTxt"]
    labelSeries = df.loc[:,"label"]
    
    txtList = [txtSeries.iloc[i] for i in range(txtSeries.size)]
    labelList = [labelSeries.iloc[i] for i in range(labelSeries.size)]
           
    piv = (len(txtSeries) // 3) * 2

    trainTxt = txtList[:piv]
    testTxt = txtList[piv:]

    trTarg = labelList[:piv]
    tsTarg = labelList[piv:]
    
    return [[trainTxt, trTarg],[testTxt, tsTarg]]

In [16]:
def regexCLF(txt):
    reg1 = r'dnf'
    reg2 = r'g[ai]ve up'
    reg3 = r'(finish|listen|read|dnf)[\w\s]*(chapter|p(age|g)|PERCENT|half ?(way)?)'
    reg4 = r'(have|could|did|never|can(not|t))[\w\s]*(finish|g[eo]t (into|through)|go on)'


    reg = reg1 + r'|' + reg2 + r'|'+ reg3 + r'|'+ reg4

    match = []
    
    for sample in txt:
        matchBool = 0
        if re.findall(reg, sample):
            matchBool = 1
        
        match.append(matchBool)
        
    return match

In [17]:
def cleanTxt(df, ps):
    
    cleanTxt = []
    
    for i in df.index:
        
        txt = str(df.iloc[i])  #I had problems where this was a float, which doesn't make much sense
        tggd = tag_txt(txt, ps)
        stemTxt = ""
        for w in tggd:
            stemTxt += str(w[0])
            stemTxt += " "
        cleanTxt.append(stemTxt)
    
    column = pd.Series(cleanTxt)
    
    return column

In [18]:
def tag_txt(txt, ps):
    #is passed a string of raw text
    
    #this is just nltk's English Stop words with a couple things deleted, like "didn't", "couldn't" and their variations    
    custStopWords = ['to', 'further', 'ma', 'a', 'no',  'or', 'ours', 'once', 'before', 'out', "doesn't", 
                      've', 'm', "you've", "needn't", 'you', 'not', 'so', 'off', 'under', 'most', 'which', 
                      'more', 'ourselves', 'about', 'down', 'isn', 'they', 'his', "she's", 'only', 'how', 
                      'had', 'again', 'by', 'after', 'shan', 'their', 'some', "hasn't", 'mustn', 'yours', 
                      'is', 'who', 'we', 'because', "you'll", 'it', 'has', 'both', 'here', "don't", 'than', 
                      'through', 'any', 'did', 'its', 'own', 'being', 'all', 'yourself', 'needn', 'd', 'o', 
                      "weren't",  'itself', 'what', 're', 'my', 'there', 'ain', 'i', "isn't", "aren't", 'if', 
                      'll', 'wasn', 'of', 'your', 'an',  'over', 'wouldn', 'y', "mightn't", 'between', 'mightn',
                      "hadn't", 's', 'on', 'while', 'from', 'have', "shan't", 'then', "mustn't", 'will', 'below',
                      'where', 'been', 'same', 'don', 'myself', 'until', 'other', 'doesn', 'but', 'above', 'can', 
                      'for', 'and', 'against', "you'd", 'him', 'does', 'into', 'are', 'these', 'few', 'himself', 
                      'aren', "wasn't", 'at', 'too', "should've", 'should', 'those', "that'll", 'me', 'hasn', 'shouldn',
                      'themselves', 'weren', 'our', 'as', 'be', "it's", 'the', 'was', 'up', 'hadn', 'am',
                      'this', 'yourselves', 'that', "you're", 'having', 'each', 'do', 'she', 'them', 'very',
                      'nor', 'he', 'whom', 'now', 'won', 'during', 'her', 'hers', 'were', 'just', 'with', 
                      'why', "wouldn't", 'when', 'herself', "won't", "shouldn't", 'such', 'doing', 'in', 'theirs']
    
    trashPOStags = ['NNP', 'NNPS', 'PRP', 'PRP$', 'WP$', 'WP', 'WDT']

    taggedLst= []
    toked = PunktSentenceTokenizer().tokenize(txt)  #tokenized into sentences
    
    for s in toked: 
        
        words = word_tokenize(s)  #tokenize into words
        
        words = [words[i] for i in range(len(words)) if words[i] not in string.punctuation]
        
        words = nltk.pos_tag(words) #creates a list of (word, posTag) pairs
        
        words = [(ps.stem(word[0]), word[1]) for word in words if word[0] not in custStopWords] #stem the words.

        for word in words:  #word -> (word, pos)
            if  word[1] not in trashPOStags: #if not a proper noun, etc
                taggedLst.append(word)
    

    return taggedLst

In [19]:
def getDTclf(trTxt, trTrgs):

    dtParams =  {'vect__ngram_range': [(1, i+2) for i in range(4)],
                'clf__max_depth': [i+2 for i in range(10)],
                "clf__min_samples_leaf": [(i+1) for i in range(10)],
                "clf__max_features": [(i+5) for i in range(100)]}

    dt_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf', 
                          DecisionTreeClassifier())])


    randSearch = RandomizedSearchCV(estimator = dt_clf, 
                       param_distributions = dtParams,
                       n_iter = 100, cv = 3, verbose=2,
                       random_state=42, n_jobs = -1)
    # Fit the random search model
    randSearch.fit(trTxt, trTrgs)

    best_random = randSearch.best_estimator_

    best = best_random.fit(trTxt, trTrgs)

    
    print("     DT Training Summary")
    print("******************************")
    print("Best score from randSearch:")
    print(randSearch.best_score_)
    print()
    print("Best params:")
    print(randSearch.best_params_)
    print()
    
    return best


In [20]:
def getSGDclf(trTxt, trTrgs):
    sgdParams = {'vect__max_features': [i+5 for i in range(15)]+[None],
                 'vect__ngram_range': [(1, i+2) for i in range(4)],
                  'tfidf__use_idf': (True, False),
                  'clf__alpha': (1e-2, 1e-3),
                "clf__loss": ["hinge", "log", "squared_hinge"],
                "clf__penalty": ["none", "elasticnet"]}
    sgd_clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier())])

    randSearch = RandomizedSearchCV(estimator = sgd_clf, 
                       param_distributions = sgdParams,
                       n_iter = 100, cv = 3, verbose=2,
                       random_state=42, n_jobs = -1)
    # Fit the random search model
    randSearch.fit(trTxt, trTrgs)

    best_random = randSearch.best_estimator_

    best = best_random.fit(trTxt, trTrgs)
    
    print("   SGD Training Summary")
    print("**************************")
    print("Best score from randSearch:")
    print(randSearch.best_score_)
    print()
    print("Best params:")
    print(randSearch.best_params_)
    print()
    
    return best
    

In [21]:
def majorityClassify(dt, sgd, re):
    
    votes = []
    
    for i in range(len(dt)):
        total = dt[i] + sgd[i] + re[i]
        votes.append(total)
        
    for n in range(len(votes)):
        if votes[n] > 1:
            votes[n] = 1
        else: votes[n] = 0
    return votes
            

In [27]:
def metrics(trueLabel, predict):
    
    tpi = 0
    fpi = 0
    fni = 0
    
    tpc = 0
    fpc = 0
    fnc = 0


    for i in range(len(trueLabel)):
        if predict[i] == 1:
            if trueLabel[i] == 1:
                tpi += 1
            else: fpi += 1 #fni
        else:
            if trueLabel[i] == 1:
                fni += 1 #fpc
            else:
                tpc += 1

    pi = tpi/(tpi + fpi)
    ri = tpi/(tpi + fni)
    f1i = 2*(pi*ri)/(pi+ri)
    
    pc = tpc/(tpc + fni)
    rc = tpi/(tpc + fpi)
    f1c = 2*(pc*rc)/(pc+rc)

    print("   For incomplete reviews:")
    print("*******************************")
    print()
    print("precision")
    print(pi)
    print("recall")
    print(ri)
    print("f1")
    print(f1i)
    print()
    print()
    print("   For complete reviews:")
    print("*******************************")
    print()
    print("precision")
    print(pc)
    print("recall")
    print(rc)
    print("f1")
    print(f1c)

In [28]:
def main():
    
    
    reviews = getData()
    ps = PorterStemmer()
    reviews.loc[:, "cleanTxt"] = cleanTxt(reviews.loc[:, 'text'], ps)
    trn, tst = partitionData(reviews)
    
    dtCLF = getDTclf(trn[0], trn[1])
    sgdCLF = getSGDclf(trn[0], trn[1])
    
    dtVotes = dtCLF.predict(tst[0])
    sgdVotes = sgdCLF.predict(tst[0])
    regexVotes = regexCLF(tst[0])
    
    votes = majorityClassify(dtVotes, sgdVotes, regexVotes)
    
    metrics(tst[1], votes)

In [29]:
main()

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   30.1s finished


     DT Training Summary
******************************
Best score from randSearch:
0.688442211055

Best params:
{'vect__ngram_range': (1, 2), 'clf__min_samples_leaf': 10, 'clf__max_features': 93, 'clf__max_depth': 2}

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   27.4s finished


   SGD Training Summary
**************************
Best score from randSearch:
0.839195979899

Best params:
{'vect__ngram_range': (1, 4), 'vect__max_features': None, 'tfidf__use_idf': False, 'clf__penalty': 'elasticnet', 'clf__loss': 'hinge', 'clf__alpha': 0.001}

   For incomplete reviews:
*******************************

precision
0.8928571428571429
recall
0.373134328358209
f1
0.5263157894736842


   For complete reviews:
*******************************

precision
0.7543859649122807
recall
0.1893939393939394
f1
0.3027742571468808
