In [83]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [197]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    """Fit a logistic regression on a random split and evaluate it on the rest.

    The data is split with ``train_test_split`` using a fixed
    ``random_state=42``, so repeated calls on the same input produce the
    same partition.

    Parameters
    ----------
    featMap : feature matrix accepted by ``train_test_split`` and
        ``LogisticRegression.fit`` (the caller in this notebook passes a
        scipy sparse matrix).
    trueLabels : labels aligned with the rows of ``featMap``.
    testSize : forwarded to ``train_test_split`` as ``test_size``
        (default 0.33).

    Returns
    -------
    confMat : float ndarray
        Confusion matrix of the held-out split (rows: true labels,
        columns: predicted labels), as returned by ``confusion_matrix``.
    normConfMat : float ndarray
        ``confMat`` with every row divided by its row total.  Rows whose
        total is zero are returned as all zeros instead of NaN.
    score : value of ``lreg.score(test, test_labels)`` on the held-out split.
    """
    train, test, train_labels, test_labels = train_test_split(
        featMap, trueLabels, test_size=testSize, random_state=42)

    lreg = LogisticRegression(tol=0.001)
    lreg.fit(train, train_labels)
    test_predict = lreg.predict(test)

    confMat = confusion_matrix(test_labels, test_predict).astype(float)

    # Normalize every row, not just rows 0 and 1: the previous hard-coded
    # indexing left extra classes un-normalized and raised IndexError when the
    # matrix had a single row.  `where=` skips rows with zero total so they
    # stay at 0 rather than turning into NaN through 0/0.
    rowTotals = confMat.sum(axis=1, keepdims=True)
    normConfMat = np.divide(confMat, rowTotals,
                            out=np.zeros_like(confMat),
                            where=rowTotals > 0)

    return confMat, normConfMat, lreg.score(test, test_labels)

In [166]:
# Names of the dataset variants to evaluate, one entry per variant.
datasets = [
    '3WordThresholdedEnglishReviews',
    '3WordThresholdedEnglishReviews_k123',
    'EnglishReviews',
    'EnglishReviews_k123',
]

In [198]:
results = []
for i in range(len(datasets)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets[i]
    featureMatrix = sps.load_npz('./Datasets/'+datasets[i]+'_Features.npz')
    featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load('./Datasets/'+datasets[i]+'_Labels.npz')['arr_0']
    print "LR on "+ datasets[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets[i], cm.copy(), ncm.copy(), sc])




Loading and normalizing dataset: 3WordThresholdedEnglishReviews
LR on 3WordThresholdedEnglishReviews
    Normalized Confusion Matrix:
     0.900292992776 0.0997070072242
     0.122076586996 0.877923413004
    Accuracy:  0.887626605611



Loading and normalizing dataset: 3WordThresholdedEnglishReviews_k123
LR on 3WordThresholdedEnglishReviews_k123
    Normalized Confusion Matrix:
     0.960993938852 0.0390060611483
     0.0526685513578 0.947331448642
    Accuracy:  0.953257790368



Loading and normalizing dataset: EnglishReviews
LR on EnglishReviews
    Normalized Confusion Matrix:
     0.771038346181 0.228961653819
     0.0432881234833 0.956711876517
    Accuracy:  0.864156725132



Loading and normalizing dataset: EnglishReviews_k123
LR on EnglishReviews_k123
    Normalized Confusion Matrix:
     0.959658428972 0.0403415710284
     0.0631829212708 0.936817078729
    Accuracy:  0.948203110038
