In [16]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sps
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from pylab import rcParams

In [35]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    #rcParams['figure.figsize'] = 6, 6
    
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    #plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [7]:
def train_test_LR(featMap, trueLabels, testSize=.33):
    train, test, train_labels, test_labels = train_test_split(featMap,trueLabels,test_size=testSize,random_state=42)
    train = MaxAbsScaler().fit_transform(train)
    test = MaxAbsScaler().fit_transform(test)
    
    lreg = LogisticRegression(tol=0.001)
    lreg.fit(train,train_labels)
    test_predict = lreg.predict(test)
    
    confMat = confusion_matrix(test_labels,test_predict).astype(float)
    normConfMat = confMat.copy()
    normConfMat[0,:] /= confMat[0,:].sum()
    normConfMat[1,:] /= confMat[1,:].sum()
    return confMat, normConfMat, lreg.score(test, test_labels)

In [41]:
city = 'Barcelona_'
cname ="Barcelona"
prefix = './Datasets/'
datasets1 = [prefix + '3WordThresholdedEnglishReviews', 
             prefix + '3WordThresholdedEnglishReviews_k123', 
             prefix + 'EnglishReviews', 
             prefix + 'EnglishReviews_k123']

prefix = './Datasets_City/'+city
datasets2 = [prefix + 'UkReviews_Thresholded_BOW3', 
             prefix + 'UkReviews_Thresholded_BOW123', 
             prefix + 'UkReviews_Raw_BOW3', 
             prefix + 'UkReviews_Raw_BOW123']
figNames = [ 'Uk Reviews Thresholded BOW3', 
             'Uk Reviews Thresholded BOW123', 
             'Uk Reviews Raw BOW3', 
             'Uk Reviews Raw BOW123']

# For the Entire Dataset

In [42]:
results = []
for i in range(len(datasets1)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets2[i]
    featureMatrix = sps.load_npz(datasets2[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load(datasets2[i]+'_Labels.npz')['arr_0']
    print "LR on "+ datasets2[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    #print "    Normalized Confusion Matrix:"
    #print "    ", ncm[0,0], ncm[0,1]
    #print "    ", ncm[1,0], ncm[1,1]
    #print "    Accuracy: ", sc
    results.append([datasets2[i], cm.copy(), ncm.copy(), sc])
    
    #plt.figure()
    plot_confusion_matrix(cm, classes=['Positive', 'Negative'], normalize=True,title='Normalized confusion matrix '+ figNames[i])
    #plt.show()
    plt.savefig(figNames[i]+'.png', bbox_inches='tight')
    #plt.show()
    plt.clf()




Loading and normalizing dataset: ./Datasets_City/Barcelona_UkReviews_Thresholded_BOW3
LR on ./Datasets_City/Barcelona_UkReviews_Thresholded_BOW3
Normalized confusion matrix
[[ 0.2771855   0.7228145 ]
 [ 0.01171439  0.98828561]]



Loading and normalizing dataset: ./Datasets_City/Barcelona_UkReviews_Thresholded_BOW123
LR on ./Datasets_City/Barcelona_UkReviews_Thresholded_BOW123
Normalized confusion matrix
[[ 0.93949893  0.06050107]
 [ 0.05243585  0.94756415]]



Loading and normalizing dataset: ./Datasets_City/Barcelona_UkReviews_Raw_BOW3
LR on ./Datasets_City/Barcelona_UkReviews_Raw_BOW3
Normalized confusion matrix
[[ 0.54112177  0.45887823]
 [ 0.02000584  0.97999416]]



Loading and normalizing dataset: ./Datasets_City/Barcelona_UkReviews_Raw_BOW123
LR on ./Datasets_City/Barcelona_UkReviews_Raw_BOW123
Normalized confusion matrix
[[ 0.9405696   0.0594304 ]
 [ 0.05724299  0.94275701]]


# For Cities

In [40]:
results = []
for i in range(len(datasets2)):
    min_max_scaler = MaxAbsScaler()
    print "\n\n"
    print "Loading and normalizing dataset: "+ datasets2[i]
    featureMatrix = sps.load_npz(datasets2[i]+'_Features.npz')
    #featureMatrix = min_max_scaler.fit_transform(featureMatrix)
    labels = np.load(datasets2[i]+'_Labels.npz')['arr_0']
    print "SGD on "+ datasets2[i]
    cm, ncm, sc = train_test_LR(featureMatrix, labels)
    print "    Normalized Confusion Matrix:"
    print "    ", ncm[0,0], ncm[0,1]
    print "    ", ncm[1,0], ncm[1,1]
    print "    Accuracy: ", sc
    results.append([datasets2[i], cm.copy(), ncm.copy(), sc])
    
    #plt.figure()
    plot_confusion_matrix(cm, classes=['Positive', 'Negative'], normalize=True,title=cname + ' Normalized confusion matrix '+ figNames[i])
    #plt.show()
    plt.savefig(cname + ' Normalized confusion matrix '+ figNames[i] + '.png', bbox_inches='tight')
    #plt.show()
    plt.clf()




Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW3
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW3
    Normalized Confusion Matrix:
     0.693756131523 0.306243868477
     0.0505728356 0.9494271644
    Accuracy:  0.835301596271
Normalized confusion matrix
[[ 0.69375613  0.30624387]
 [ 0.05057284  0.94942716]]



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Thresholded_BOW123
SGD on ./Datasets_City/London_UkReviews_Thresholded_BOW123
    Normalized Confusion Matrix:
     0.963068451533 0.0369315484667
     0.0572580439387 0.942741956061
    Accuracy:  0.95181522814
Normalized confusion matrix
[[ 0.96306845  0.03693155]
 [ 0.05725804  0.94274196]]



Loading and normalizing dataset: ./Datasets_City/London_UkReviews_Raw_BOW3
SGD on ./Datasets_City/London_UkReviews_Raw_BOW3
    Normalized Confusion Matrix:
     0.749555634554 0.250444365446
     0.0432862034097 0.95671379659
    Accuracy:  0.853234957249
Normalized confusion 