In [None]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    train, test, train_labels, test_labels = train_test_split(featMap,trueLabels,test_size=testSize,random_state=42)
    train = MaxAbsScaler().fit_transform(train)
    test = MaxAbsScaler().fit_transform(test)
    
    lreg = LogisticRegression(tol=0.001)
    lreg.fit(train,train_labels)
    test_predict = lreg.predict(test)
    confMat = confusion_matrix(test_labels,test_predict).astype(float)
    normConfMat = confMat.copy()
    normConfMat[0,:] /= confMat[0,:].sum()
    normConfMat[1,:] /= confMat[1,:].sum()
    return confMat, normConfMat, lreg.score(test, test_labels)

In [7]:
prefix = './Datasets/'
datasets1 = [prefix + '3WordThresholdedEnglishReviews', 
             prefix + '3WordThresholdedEnglishReviews_k123', 
             prefix + 'EnglishReviews', 
             prefix + 'EnglishReviews_k123']

prefix = './Datasets_2/'
datasets2 = [prefix + 'UkReviews_Thresholded_BOW3', 
             prefix + 'UkReviews_Thresholded_BOW123', 
             prefix + 'UkReviews_Raw_BOW3', 
             prefix + 'UkReviews_Raw_BOW123']

In [8]:
results = []
for i in range(len(datasets)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets[i]
    featureMatrix = sps.load_npz(datasets2[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load(datasets2[i]+'_Labels.npz')['arr_0']
    print "LR on "+ datasets[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets[i], cm.copy(), ncm.copy(), sc])




Loading and normalizing dataset: ./Datasets/3WordThresholdedEnglishReviews
LR on ./Datasets/3WordThresholdedEnglishReviews
    Normalized Confusion Matrix:
     0.900411136435 0.0995888635653
     0.122102592237 0.877897407763
    Accuracy:  0.887671950486



Loading and normalizing dataset: ./Datasets/3WordThresholdedEnglishReviews_k123
LR on ./Datasets/3WordThresholdedEnglishReviews_k123
    Normalized Confusion Matrix:
     0.960696250614 0.0393037493855
     0.0522562062817 0.947743793718
    Accuracy:  0.953367222211



Loading and normalizing dataset: ./Datasets/EnglishReviews
LR on ./Datasets/EnglishReviews
    Normalized Confusion Matrix:
     0.786778375431 0.213221624569
     0.0475134582456 0.952486541754
    Accuracy:  0.869883790613



Loading and normalizing dataset: ./Datasets/EnglishReviews_k123
LR on ./Datasets/EnglishReviews_k123
    Normalized Confusion Matrix:
     0.958877624371 0.041122375629
     0.0634416152358 0.936558384764
    Accuracy:  0.947684152648
