In [1]:
import os
from collections import Counter
import numpy as np
import scipy.sparse as sps
from scipy.linalg import pinv
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.utils.class_weight import compute_sample_weight

def save_array(array, filename, sep=',', subdir='data'):
    """Writes a Numpy array to disk as delimited text.

    The target path is built as ``<cwd>/../<subdir>/<filename>`` and every
    value is written in fixed-point notation with 20 decimal places.
    Args:
        array (Numpy.Array): Array to write.
        filename (str): Output file name.
        sep (str): Column delimiter.
        subdir (str): Directory, relative to the parent of the current
            working directory, that receives the file.
    """
    parent_dir = os.path.join(os.getcwd(), os.pardir)
    out_path = os.path.join(parent_dir, subdir, filename)
    np.savetxt(out_path, array, fmt='%.20f', delimiter=sep)

def save_dataset(df, filename, sep=',', subdir='data', header=True):
    """Writes a Pandas data frame to disk as a delimited text file.

    The target path is built as ``<cwd>/../<subdir>/<filename>``. The row
    index is never written.
    Args:
        df (Pandas.DataFrame): Data frame to write.
        filename (str): Output file name.
        sep (str): Column delimiter.
        subdir (str): Directory, relative to the parent of the current
            working directory, that receives the file.
        header (Boolean): Whether to write the column names as first row.
    """
    parent_dir = os.path.join(os.getcwd(), os.pardir)
    out_path = os.path.join(parent_dir, subdir, filename)
    df.to_csv(out_path, sep=sep, header=header, index=False)

def get_abspath(filename, filepath):
    """Builds the absolute path of a file located in a subdirectory of the
    project directory, where the project directory is taken to be the
    parent of the current working directory.
    Args:
        filename (str): Name of specified file.
        filepath (str): Subdirectory of file.
    Returns:
        fullpath (str): Absolute filepath.
    """
    # The project root is one level above the working directory.
    project_root = os.path.abspath(os.pardir)
    return os.path.join(project_root, filepath, filename)

def balanced_accuracy(labels, predictions):
    """Computes an accuracy score in which every sample is weighted by the
    'balanced' class weight of its true label, to account for potential
    imbalances in class distributions.
    Args:
        labels (numpy.array): Actual class labels.
        predictions (numpy.array): Predicted class labels.
    Returns:
        Sample-weighted accuracy score.
    """
    sample_weights = compute_sample_weight(class_weight='balanced', y=labels)
    return accuracy_score(
        y_true=labels,
        y_pred=predictions,
        sample_weight=sample_weights,
    )

def cluster_acc(Y, clusterY):
    """Scores a clustering against the actual labels by assigning every
    cluster the most frequent actual label among its members and then
    computing the balanced accuracy of that assignment.
    Args:
        Y (Numpy.Array): Actual labels.
        clusterY (Numpy.Array): Cluster assignment of each sample.
    Returns:
        score (float): Accuracy score for given cluster labels.
    """
    assert Y.shape == clusterY.shape
    majority_pred = np.empty_like(Y)
    for cluster_id in set(clusterY):
        members = clusterY == cluster_id
        # most_common(1) yields [(label, count)]; only the label is needed.
        majority_label, _count = Counter(Y[members]).most_common(1)[0]
        majority_pred[members] = majority_label
    return balanced_accuracy(Y, majority_pred)

def reconstruction_error(projections, x):
    """Calculates the mean squared reconstruction error obtained when the
    data is projected with the given components and mapped back with their
    pseudo-inverse.
    Args:
        projections: Object exposing the projection matrix as a
            `components_` attribute (dense or scipy sparse), e.g. the
            fitted random projection passed in by the cells of this file.
        x (Numpy.Array or Pandas.DataFrame): Original dataset, one sample
            per row.
    Returns:
        error (float): Mean of the squared element-wise differences
            between `x` and its reconstruction, ignoring NaNs.
    """
    W = projections.components_
    if sps.issparse(W):
        # toarray() yields a plain ndarray; todense() would return a
        # numpy.matrix and leak matrix semantics into the math below.
        W = W.toarray()
    W = np.asarray(W)
    data = np.asarray(x)

    # pinv(W) @ W maps a sample to its reconstruction in the original space.
    projector = np.dot(pinv(W), W)
    reconstructed = np.dot(projector, data.T).T  # Unproject projected data
    errors = np.square(data - reconstructed)
    return np.nanmean(errors)

def pairwise_dist_corr(x1, x2):
    """Calculates the correlation between the pairwise distances of two
    arrays that hold the same number of rows.
    Args:
        x1 (Numpy.Array): First array.
        x2 (Numpy.Array): Second array.
    Returns:
        Correlation coefficient between the flattened pairwise distance
        matrices of `x1` and `x2`.
    """
    assert x1.shape[0] == x2.shape[0]

    flat_d1 = pairwise_distances(x1).ravel()
    flat_d2 = pairwise_distances(x2).ravel()
    corr_matrix = np.corrcoef(flat_d1, flat_d2)
    # Off-diagonal entry is the correlation between the two inputs.
    return corr_matrix[0, 1]


In [2]:
from collections import defaultdict
from itertools import product
import timeit
import pandas as pd
import numpy as np
from sklearn.random_projection import SparseRandomProjection
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import seaborn


In [None]:
# MUSHROOM: load the training data and set up the random-projection sweep
np.random.seed(0)

mushroom_train = pd.read_csv('mushroom_train.csv')
df = mushroom_train.apply(pd.to_numeric)

# Separate the 'classp' target column from the feature columns
y_train_mushroom = df['classp']
X_train_mushroom = df.drop(columns='classp')

mushroomX, mushroomY = X_train_mushroom, y_train_mushroom

# Candidate component counts: 1 up to the number of feature columns
mDims = mushroomX.shape[1]
dims = range(1, mDims + 1)
name = 'mushroom'
X = mushroomX

# Result stores, filled by the sweep as store[dim][seed] = value
re, pdc = defaultdict(dict), defaultdict(dict)



In [None]:
# For each of 10 random seeds, fit a sparse random projection at every
# candidate component count and record both quality metrics.
for i in range(10):
    for dim in dims:
        rp = SparseRandomProjection(n_components=dim, random_state=i).fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

In [None]:
# Average each metric over the seeds, giving one row per component count.
# NOTE: `re` and `pdc` are rebound from dicts to data frames here.
re = pd.DataFrame(re).T.mean(axis=1).to_frame(name='recon_error')
pdc = pd.DataFrame(pdc).T.mean(axis=1).to_frame(name='pairwise_dc')

# Combine both metrics and expose the component count as column 'n'
metrics = pd.concat([re, pdc], axis=1)
metrics['n'] = metrics.index
df = metrics

In [None]:
def generate_plots(name, df=None):
    """Plots reconstruction error and pairwise distance correlation as a
    function of number of components and saves the figure as
    '<name>RP.png' in the 'Assignment 3' results directory.
    Args:
        name (str): Dataset name; used in the panel titles and file name.
        df (Pandas.DataFrame): Table with the columns 'n', 'recon_error'
            and 'pairwise_dc'. Defaults to the notebook-level `metrics`.
    """
    if df is None:
        # Backward-compatible default: use the table built by the notebook.
        df = metrics

    # get figure and axes
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
    try:
        # plot metrics
        x = df['n']
        ax1.plot(x, df['recon_error'], marker='.', color='g')
        ax1.set_title('Reconstruction Error ({})'.format(name))
        ax1.set_ylabel('Reconstruction error')
        ax1.set_xlabel('# Components')
        ax1.grid(color='grey', linestyle='dotted')

        ax2.plot(x, df['pairwise_dc'], marker='.', color='b')
        ax2.set_title('Pairwise Distance Correlation ({})'.format(name))
        ax2.set_ylabel('Pairwise distance correlation')
        ax2.set_xlabel('# Components')
        ax2.grid(color='grey', linestyle='dotted')

        # change layout size, font size and width
        fig.tight_layout()
        for ax in fig.axes:
            ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
            for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
                item.set_fontsize(8)

        # save figure
        resdir = 'Assignment 3'
        plotpath = get_abspath('{}RP.png'.format(name), resdir)
        fig.savefig(plotpath)
    finally:
        # Clearing the figure would leave it open; close it so repeated
        # calls do not accumulate open figures.
        plt.close(fig)



In [None]:
# Render and save the metric plots for the mushroom data
generate_plots('mushroom')

# Project the mushroom features once with a fixed seed, keeping as many
# components as there are feature columns
name = 'mushroom'
dims = mushroomX.shape[1]
rp = SparseRandomProjection(n_components=dims, random_state=0)
res = rp.fit_transform(mushroomX)

# Save results file. `resfile` comes from get_abspath, so it is already an
# absolute path when handed to save_array.
resdir = 'Assignment 3'
resfile = get_abspath('{}_RP.csv'.format(name), resdir)
save_array(array=res, filename=resfile, subdir=resdir)

In [3]:
# UCI_Credit_Card: load the training data and set up the random-projection sweep
np.random.seed(0)

cc_train = pd.read_csv('cc_train.csv')
df = cc_train.apply(pd.to_numeric)

# Separate the target column from the feature columns
y_train_cc = df['data.default.payment.next.month']
X_train_cc = df.drop(columns='data.default.payment.next.month')

ccX, ccY = X_train_cc, y_train_cc

# Candidate component counts: 1 up to the number of feature columns
mDims = ccX.shape[1]
dims = range(1, mDims + 1)
name = 'cc'
X = ccX

# Result stores, filled by the sweep as store[dim][seed] = value
re, pdc = defaultdict(dict), defaultdict(dict)

In [4]:
# For each of 10 random seeds, fit a sparse random projection at every
# candidate component count and record both quality metrics.
for i in range(10):
    for dim in dims:
        rp = SparseRandomProjection(n_components=dim, random_state=i).fit(X)
        re[dim][i] = reconstruction_error(rp, X)
        pdc[dim][i] = pairwise_dist_corr(rp.transform(X), X)

In [5]:
# Average each metric over the seeds, giving one row per component count.
# NOTE: `re` and `pdc` are rebound from dicts to data frames here.
re = pd.DataFrame(re).T.mean(axis=1).to_frame(name='recon_error')
pdc = pd.DataFrame(pdc).T.mean(axis=1).to_frame(name='pairwise_dc')

# Combine both metrics and expose the component count as column 'n'
metrics = pd.concat([re, pdc], axis=1)
metrics['n'] = metrics.index
df = metrics

In [6]:
def generate_plots(name, df=None):
    """Plots reconstruction error and pairwise distance correlation as a
    function of number of components and saves the figure as
    '<name>RP.png' in the 'Assignment 3' results directory.
    Args:
        name (str): Dataset name; used in the panel titles and file name.
        df (Pandas.DataFrame): Table with the columns 'n', 'recon_error'
            and 'pairwise_dc'. Defaults to the notebook-level `metrics`.
    """
    if df is None:
        # Backward-compatible default: use the table built by the notebook.
        df = metrics

    # get figure and axes
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(8, 3))
    try:
        # plot metrics
        x = df['n']
        ax1.plot(x, df['recon_error'], marker='.', color='g')
        ax1.set_title('Reconstruction Error ({})'.format(name))
        ax1.set_ylabel('Reconstruction error')
        ax1.set_xlabel('# Components')
        ax1.grid(color='grey', linestyle='dotted')

        ax2.plot(x, df['pairwise_dc'], marker='.', color='b')
        ax2.set_title('Pairwise Distance Correlation ({})'.format(name))
        ax2.set_ylabel('Pairwise distance correlation')
        ax2.set_xlabel('# Components')
        ax2.grid(color='grey', linestyle='dotted')

        # change layout size, font size and width
        fig.tight_layout()
        for ax in fig.axes:
            ax_items = [ax.title, ax.xaxis.label, ax.yaxis.label]
            for item in ax_items + ax.get_xticklabels() + ax.get_yticklabels():
                item.set_fontsize(8)

        # save figure
        resdir = 'Assignment 3'
        plotpath = get_abspath('{}RP.png'.format(name), resdir)
        fig.savefig(plotpath)
    finally:
        # Clearing the figure would leave it open; close it so repeated
        # calls do not accumulate open figures.
        plt.close(fig)



In [8]:
# Render and save the metric plots for the credit-card data
generate_plots('cc')

# Project the credit-card features once with a fixed seed, keeping as many
# components as there are feature columns
name = 'cc'
dims = ccX.shape[1]
rp = SparseRandomProjection(n_components=dims, random_state=0)
res = rp.fit_transform(ccX)

# Save results file. `resfile` comes from get_abspath, so it is already an
# absolute path when handed to save_array.
resdir = 'Assignment 3'
resfile = get_abspath('{}_RP.csv'.format(name), resdir)
save_array(array=res, filename=resfile, subdir=resdir)