In [1]:
import numpy as np
import scipy.sparse as sps
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn import linear_model

In [24]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    train, test, train_labels, test_labels = train_test_split(featMap,trueLabels,test_size=testSize,random_state=42)
    train = MaxAbsScaler().fit_transform(train)
    test = MaxAbsScaler().fit_transform(test)
    
    clf = linear_model.SGDClassifier()#(verbose=True)
    clf.fit(train, train_labels)
    test_predict = clf.predict(test)
    
    confMat = confusion_matrix(test_labels,test_predict).astype(float)
    normConfMat = confMat.copy()
    normConfMat[0,:] /= confMat[0,:].sum()
    normConfMat[1,:] /= confMat[1,:].sum()
    return confMat, normConfMat, clf.score(test, test_labels)

In [3]:
city = 'London_'
prefix = './Datasets/'
datasets1 = [prefix + '3WordThresholdedEnglishReviews', 
             prefix + '3WordThresholdedEnglishReviews_k123', 
             prefix + 'EnglishReviews', 
             prefix + 'EnglishReviews_k123']

prefix = './Datasets_City/'+city
datasets2 = [prefix + 'UkReviews_Thresholded_BOW3', 
             prefix + 'UkReviews_Thresholded_BOW123', 
             prefix + 'UkReviews_Raw_BOW3', 
             prefix + 'UkReviews_Raw_BOW123']

# For the Entire Dataset

In [26]:
results = []
for i in range(len(datasets1)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets2[i]
    featureMatrix = sps.load_npz(datasets2[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load(datasets2[i]+'_Labels.npz')['arr_0']
    print "SGD on "+ datasets2[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets2[i], cm.copy(), ncm.copy(), sc])




Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW3
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW3
    Normalized Confusion Matrix:
     0.651887717966 0.348112282034
     0.0410298282769 0.958970171723
    Accuracy:  0.821895747987



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW123
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW123
    Normalized Confusion Matrix:
     0.963448210386 0.0365517896136
     0.0563139496313 0.943686050369
    Accuracy:  0.952507416302



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Raw_BOW3
SGD on ./Datasets_City/London_UkReviews_Raw_BOW3
    Normalized Confusion Matrix:
     0.455207963029 0.544792036971
     0.01412123685 0.98587876315
    Accuracy:  0.720800149162



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Raw_BOW123
SGD on ./Datasets_City/London_UkReviews_Raw_BOW123
    Normalized Confusion Matrix:
     0.958229648063

# For Cities

In [25]:
results = []
for i in range(len(datasets2)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets2[i]
    featureMatrix = sps.load_npz(datasets2[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load(datasets2[i]+'_Labels.npz')['arr_0']
    print "SGD on "+ datasets2[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets2[i], cm.copy(), ncm.copy(), sc])




Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW3
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW3
    Normalized Confusion Matrix:
     0.630652868762 0.369347131238
     0.0374575795463 0.962542420454
    Accuracy:  0.814394688515



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW123
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW123
    Normalized Confusion Matrix:
     0.963131744675 0.0368682553245
     0.0568753030032 0.943124696997
    Accuracy:  0.952055375053



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Raw_BOW3
SGD on ./Datasets_City/London_UkReviews_Raw_BOW3
    Normalized Confusion Matrix:
     0.457767507999 0.542232492001
     0.0141921978392 0.985807802161
    Accuracy:  0.722043168278



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Raw_BOW123
SGD on ./Datasets_City/London_UkReviews_Raw_BOW123
    Normalized Confusion Matrix:
     0.957891930