In [53]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [19]:
englishReviews = pd.read_csv('UKReviewsWordCounted.csv')

In [44]:
def wordSplit(rev, stop_words = stopwords.words('English')):
    s = [item for sublist in [rev.lower().split() for f in re.findall('\d+|\D+',rev.lower())] for item in sublist]
    s = [x for x in s if x not in stop_words]
    return s

def bagOfWordsShingles(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    if len(s) < k:
        tokens = ['']
        for s_ in s:
            tokens[0] += s_ + ' '
        tokens[0] = tokens[0][:-1]
        return tokens
        
    tokens = []
    for i in range(len(s) - k + 1):
        bag = ''
        for j in range(i, i + k):
            bag += s[j]+' '
        tokens.append(bag[:-1])
    return tokens

def bagOfWordsShingles2(review, k=3, stop_words = stopwords.words('English')):
    s = wordSplit(review, stop_words)
    tokens = []
    for K in range(0,k):
        for i in range(len(s) - (k-K) + 1):
            bag = ''
            for j in range(i, i + k-K):
                bag += s[j]+' '
            tokens.append(bag[:-1])

    return tokens

In [82]:
def separateData(inputReviews, testSize = .33, wordThreshold = 3):
    thrPosEngRevs = inputReviews.loc[inputReviews['Positive_Processed_Word_Count'] > wordThreshold]['Positive_Review']
    thrNegEngRevs = inputReviews.loc[inputReviews['Negative_Processed_Word_Count'] > wordThreshold]['Negative_Review']

    allReviews = pd.concat([thrPosEngRevs, thrNegEngRevs])
    labels = np.hstack([np.ones(thrPosEngRevs.shape[0]), np.zeros(thrNegEngRevs.shape[0])])
    train, test, train_labels, test_labels = train_test_split(range(len(labels)),labels,test_size=testSize,random_state=42)
    
    return allReviews.iloc[train], allReviews.iloc[test], train_labels, test_labels

def extractFeatures(trData, teData, featureType=1):
    if featureType == 1:
        tfidf = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles, lowercase=True)
    elif featureType == 2:
        tfidf = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles2, lowercase=True)
    else:
        print "Please select a feature extractor type"
        return -1,-1
    print "        Extracting Training Features"
    trFeatures = tfidf.fit_transform(trData)
    
    print "        Extracting Testing Features"
    if featureType == 1:
        teFeatures = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles, lowercase=True, vocabulary = tfidf.vocabulary_).fit_transform(teData)
    elif featureType == 2:
        teFeatures = TfidfVectorizer(stop_words=stopwords.words('English'),tokenizer=bagOfWordsShingles2, lowercase=True, vocabulary = tfidf.vocabulary_).fit_transform(teData)
    else:
        print "Please select a feature extractor type"
        return -1,-1

    return trFeatures, teFeatures

In [83]:
def train_test_logistic_regression(englishReviews, wordThreshold, featureType):
    print "wordThreshold = ", wordThreshold, "featureType = ", featureType
    trD, teD, trL, teL = separateData(englishReviews, wordThreshold)
    trFeatures, teFeatures = extractFeatures(trD, teD, featureType)
    print "    Fitting the regressor"
    lreg = LogisticRegression(tol=0.001)
    lreg.fit(trFeatures,trL)
    print "    Testing the fitting"
    test_predict = lreg.predict(teFeatures)
    confMat = confusion_matrix(teL,test_predict).astype(float)
    normConfMat = confMat.copy()
    normConfMat[0,:] /= confMat[0,:].sum()
    normConfMat[1,:] /= confMat[1,:].sum()
    return confMat, normConfMat, lreg.score(teFeatures, teL)

In [84]:
results = []

cm,ncm,scr = train_test_logistic_regression(englishReviews,wordThreshold=3,featureType=1)
results.append([cm,ncm,scr])

cm,ncm,scr = train_test_logistic_regression(englishReviews,wordThreshold=0,featureType=1)
results.append([cm,ncm,scr])

cm,ncm,scr = train_test_logistic_regression(englishReviews,wordThreshold=3,featureType=2)
results.append([cm,ncm,scr])

cm,ncm,scr = train_test_logistic_regression(englishReviews,wordThreshold=0,featureType=2)
results.append([cm,ncm,scr])

for r in results:
    print
    print
    print r[0]
    print r[1]
    print r[2]

wordThreshold =  3 featureType =  1
        Extracting Training Features
        Extracting Testing Features
    Fitting the regressor
    Testing the fitting
wordThreshold =  0 featureType =  1
        Extracting Training Features
        Extracting Testing Features


ValueError: Found array with 0 sample(s) (shape=(0, 2704964)) while a minimum of 1 is required by the normalize function.

In [85]:
for r in results:
    print
    print
    print r[0]
    print r[1]
    print r[2]



[[ 2.  0.]
 [ 0.  1.]]
[[ 1.  0.]
 [ 0.  1.]]
1.0


In [5]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    train, test, train_labels, test_labels = train_test_split(featMap,trueLabels,test_size=testSize,random_state=42)
    train = MaxAbsScaler().fit_transform(train)
    test = MaxAbsScaler().fit_transform(test)
    
    lreg = LogisticRegression(tol=0.001)
    lreg.fit(train,train_labels)
    test_predict = lreg.predict(test)
    confMat = confusion_matrix(test_labels,test_predict).astype(float)
    normConfMat = confMat.copy()
    normConfMat[0,:] /= confMat[0,:].sum()
    normConfMat[1,:] /= confMat[1,:].sum()
    return confMat, normConfMat, lreg.score(test, test_labels)

In [3]:
datasets=['3WordThresholdedEnglishReviews', '3WordThresholdedEnglishReviews_k123', 'EnglishReviews', 'EnglishReviews_k123']

In [6]:
results = []
for i in range(len(datasets)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets[i]
    featureMatrix = sps.load_npz('./Datasets/'+datasets[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load('./Datasets/'+datasets[i]+'_Labels.npz')['arr_0']
    print "LR on "+ datasets[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets[i], cm.copy(), ncm.copy(), sc])




Loading and normalizing dataset: 3WordThresholdedEnglishReviews
LR on 3WordThresholdedEnglishReviews
    Normalized Confusion Matrix:
     0.899443090067 0.100556909933
     0.118478540221 0.881521459779
    Accuracy:  0.889295277271



Loading and normalizing dataset: 3WordThresholdedEnglishReviews_k123
LR on 3WordThresholdedEnglishReviews_k123
    Normalized Confusion Matrix:
     0.960390060611 0.0396099393885
     0.0506981924098 0.94930180759
    Accuracy:  0.954111529357



Loading and normalizing dataset: EnglishReviews
LR on EnglishReviews
    Normalized Confusion Matrix:
     0.786778375431 0.213221624569
     0.0475134582456 0.952486541754
    Accuracy:  0.869883790613



Loading and normalizing dataset: EnglishReviews_k123
LR on EnglishReviews_k123
    Normalized Confusion Matrix:
     0.958877624371 0.041122375629
     0.0634416152358 0.936558384764
    Accuracy:  0.947684152648
