Build knn-based cancer type classifier using gdc data.
Use the knn hyperparameters of the best classifier to draw the knn cell lines for PMDs.

**Dimensionality reduction:**<br>
- https://colah.github.io/posts/2014-10-Visualizing-MNIST/
- https://jlmelville.github.io/smallvis/mmds.html
- https://www.cs.toronto.edu/~hinton/csc2535/notes/lec11new.pdf

1. Input data
    - (we want dim-reduction methods that preserve distance proximity)
    - raw data lincs1000
    - pca
    - mds
    - sammon - emphesizes more the local rather the global structure
    - ae (up to 8 dims)
    - som (https://ieeexplore.ieee.org/document/5551813/) - preserves distance and proximity ()
2. Distance metric

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys

In [3]:
utils_path = os.path.abspath(os.path.join('..', 'utils_py'))
sys.path.append(utils_path)

In [4]:
from pilot1_imports import *
from utils import *
SEED = 0

In [5]:
from sklearn.datasets.mldata import fetch_mldata
import tempfile
test_data_home = tempfile.mkdtemp()

In [6]:
mnist = fetch_mldata('MNIST original', data_home='.')
xdata = mnist['data']
ydata = mnist['target']

In [7]:
print(xdata.shape)
print(ydata.shape)

(70000, 784)
(70000,)


### Utils

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import MDS
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix, f1_score

In [9]:
def plot_confusion_matrix(y_true, y_pred, labels=None, title=None, savefig=True, img_name='confusion'):
    """ Create a confusion matrix for a classification results.
    Args:
        labels : list of label names
    """
    np_conf = confusion_matrix(y_true, y_pred)
    df_conf = pd.DataFrame(np_conf, index=labels, columns=labels)

    m = df_conf.shape[0]

    fontsize=25  # font size of labels (not in table numbers)
    plt.figure(figsize=(m, m))
    sns.set(font_scale=2.0)
    sns.heatmap(df_conf, annot=True, fmt='d', linewidths=0.9, cmap='Greens', linecolor='white')
    plt.ylabel('True label', fontsize=fontsize)
    plt.xlabel('Predicted label', fontsize=fontsize)
    if title:
        plt.title(title, fontsize=fontsize)

#     if savefig:
#         plt.savefig(img_name, bbox_inches='tight')

    return df_conf

### Dim-reduction

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA, FastICA
from sklearn.manifold import MDS
from sklearn.metrics import classification_report, precision_recall_fscore_support, confusion_matrix, f1_score

In [11]:
# from minisom import MiniSom

#### Create different classifiers

In [12]:
# Create different classifier
classifiers = [('minkowski (p=2)', KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                                                        metric='minkowski', p=2, metric_params=None, n_jobs=-1)),
               ('minkowski (p=1)', KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                                                        metric='minkowski', p=1, metric_params=None, n_jobs=-1)),
               ('chebyshev', KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                                                  metric='chebyshev', metric_params=None, n_jobs=-1)),
               ('cosine', KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', 
                                               metric='cosine', metric_params=None, n_jobs=-1)),
              ]

#### Create different datasets

In [13]:
idx = np.random.permutation(range(xdata.shape[0]))
xdata = xdata[idx[:3000], :]
ydata = ydata[idx[:3000]].astype(np.int16)

In [14]:
np.bincount(ydata)

array([302, 351, 313, 308, 276, 291, 266, 321, 294, 278])

In [15]:
n_components=8

In [16]:
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
pca_obj = PCA(n_components=n_components, copy=True, whiten=False, svd_solver='auto', tol=0.0,
              iterated_power='auto', random_state=SEED)
xdata_pca = pca_obj.fit_transform(xdata.copy())
print(xdata_pca.shape)

(3000, 8)


In [17]:
pca_obj.explained_variance_ratio_.sum()

0.4430519713015967

In [18]:
# http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html
ica_obj = FastICA(n_components=n_components, algorithm='parallel', whiten=True, fun='logcosh',
                  fun_args=None, max_iter=800, tol=0.001, w_init=None, random_state=SEED)
xdata_ica = ica_obj.fit_transform(xdata.copy())
print(xdata_ica.shape)

(3000, 8)


In [19]:
# # http://scikit-learn.org/stable/modules/generated/sklearn.manifold.MDS.html
# mds_obj = MDS(n_components=2, metric=True, n_init=1, max_iter=100,
#               verbose=1, eps=0.001, n_jobs=-1, random_state=SEED, dissimilarity='euclidean')
# rna_mds = mds_obj.fit_transform(df_rna.copy())
# print(rna_mds.shape)

In [20]:
# Create different classifier
datasets = [('original', xdata), ('pca', xdata_pca), ('ica', xdata_ica)]
# datasets = [('pca', xdata_pca), ('ica', xdata_ica)]

#### Iterate

In [21]:
for ci, c in enumerate(classifiers):
    # Iter over classifiers
    cls_name = classifiers[ci][0]
    cls_obj = classifiers[ci][1]
    print('\n', cls_name)
    print('----------------')
    
    for di, d in enumerate(datasets):
        # Iter over datasets
        data_name = datasets[di][0]
        data = datasets[di][1].copy()
        print('  ', data_name)
        
        # Split dataset
        xtr, xte, ytr, yte = train_test_split(xdata, ydata, test_size=0.2,
                                              stratify=ydata, random_state=SEED, shuffle=True)
        y_true_classes = yte
        
        # Train kNN
        cls_obj.fit(xtr, ytr)
        
        # Compute class predictions
        y_pred_classes = cls_obj.predict(xte)
        
        # Compute scores
        scores = precision_recall_fscore_support(yte, y_pred_classes, average='micro')
        print('     precision={:.2f}, recall={:.2f}, f_beta={:.2f}'.format(scores[0], scores[1], scores[2]))
        


 minkowski (p=2)
----------------
   original
     precision=0.89, recall=0.89, f_beta=0.89
   pca
     precision=0.89, recall=0.89, f_beta=0.89
   ica
     precision=0.89, recall=0.89, f_beta=0.89

 minkowski (p=1)
----------------
   original
     precision=0.87, recall=0.87, f_beta=0.87
   pca
     precision=0.87, recall=0.87, f_beta=0.87
   ica
     precision=0.87, recall=0.87, f_beta=0.87

 chebyshev
----------------
   original
     precision=0.55, recall=0.55, f_beta=0.55
   pca
     precision=0.55, recall=0.55, f_beta=0.55
   ica
     precision=0.55, recall=0.55, f_beta=0.55

 cosine
----------------
   original
     precision=0.90, recall=0.90, f_beta=0.90
   pca
     precision=0.90, recall=0.90, f_beta=0.90
   ica
     precision=0.90, recall=0.90, f_beta=0.90
