In [1]:
import pandas as pd
import nltk

In [52]:

import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

dfneg = data2df('./NonPro/', 0) # NEG
dfpos = data2df('./Pro/', 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)
df.sample(frac=0.005)
df.count() # check all files were read into df

file     3661
text     3661
class    3661
dtype: int64

In [53]:
X, y = df['text'], df['class']
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3)

In [54]:
import re
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
def preprocess(text):
    regex = re.compile(r"\s+")                               
    text = regex.sub(' ', text)    
    text = text.lower()          
    regex = re.compile(r"[%s%s]" % (string.punctuation, string.digits))
    text = regex.sub(' ', text)           
    sw = stopwords.words('english')
    text = text.split()                                              
    text = ' '.join([w for w in text if w not in sw]) 
    ' '.join([w for w in text.split() if len(w) >= 2])
    ps = PorterStemmer()
    text = ' '.join([ps.stem(w) for w in text.split()])
    return text

In [45]:
#build Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
clf = Pipeline(steps=[
    ('tfid', TfidfVectorizer(
        preprocessor=preprocess,
        lowercase=True, stop_words='english', 
        use_idf=True, smooth_idf=True, norm='l2',
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1), sublinear_tf=True)),
    ('mdl',MultinomialNB())])
    

In [46]:
# setup grid search
from sklearn.model_selection import GridSearchCV
param_grid = {
    'mdl__alpha':[0.01, 0.1, 0.2, 0.5, 1],
    
    'tfid__norm':['l1', 'l2', None]
}
gscv = GridSearchCV(clf, param_grid, iid=False, cv=5, return_train_score=False)

In [60]:
# search for best parameters/estimator
gscv.fit(Xtrain, ytrain)
print(gscv.best_score_, "\n")
print(gscv.best_params_, "\n")


  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: 

In [58]:
# evaluate best_estimator_ on test data
ypred = gscv.best_estimator_.predict(Xtest)
from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9599636032757052
[[514  42]
 [  2 541]]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       556
           1       0.93      1.00      0.96       543

   micro avg       0.96      0.96      0.96      1099
   macro avg       0.96      0.96      0.96      1099
weighted avg       0.96      0.96      0.96      1099



In [59]:
#Try Voting Classifier
clf = Pipeline(steps=[
    ('tfid', TfidfVectorizer(
        preprocessor=preprocess,
        lowercase=True, stop_words='english', 
        use_idf=True, smooth_idf=True, norm='l2',
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1), sublinear_tf=True)),
    ('vc', VotingClassifier(estimators=[('mdl', BaggingClassifier(MultinomialNB(alpha=.1),max_features=.5,max_samples=.5)),
    ('rf', RandomForestClassifier(n_estimators=100))]))
    ])

clf.fit(Xtrain,ytrain)
ypred = clf.predict(Xtest)
from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

  'stop_words.' % sorted(inconsistent))


0.9736123748862603
[[539  17]
 [ 12 531]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       556
           1       0.97      0.98      0.97       543

   micro avg       0.97      0.97      0.97      1099
   macro avg       0.97      0.97      0.97      1099
weighted avg       0.97      0.97      0.97      1099

