In [91]:
import pandas as pd
import plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode
import plotly.graph_objs as go

from sklearn.preprocessing import MinMaxScaler

import numpy as np

init_notebook_mode(connected=True)

ModuleNotFoundError: No module named 'numpy.linal'

In [66]:
cancer_df = pd.read_csv('cancer.csv', sep=',', header=0)
spam_df   = pd.read_csv('spam.csv'  , sep=',', header=0)

cancer_df['label'] = cancer_df['label'].apply(lambda x: 1 if x == 'M' else 0)

def find_k_nearest(df, point, k):
    return df.sub(np.array(point)).pow(2).sum(1).pow(0.5).sort_values()[:k]

class kNN:
    def __init__(self, k):
        self.k = k
    
    def learn(self, df):
        self.data_set = df
    
    def predict(self, point):
        nearest = find_k_nearest(self.data_set.drop(columns=['label']), point, self.k)
        zeros_cnt = 0
        ones_cnt  = 0
        last_is_one = None
        for i in range(len(nearest.index)):
            value = self.data_set.at[nearest.index[i], 'label']
            assert value == 0 or value == 1
            if value:
                ones_cnt += 1; last_is_one = True
            else:
                zeros_cnt += 1; last_is_one = False
        assert last_is_one != None
        if zeros_cnt > ones_cnt:
            return 0
        if zeros_cnt < ones_cnt:
            return 1
        assert zeros_cnt == ones_cnt
        # As distances were sorted, let's pick the set that would win for k - 1
        return 0 if last_is_one else 1

def loo(df, alg):
    sum = 0
    for index in df.index:
        alg.learn(df.drop([index]))
        sum += 1 if alg.predict(df.loc[index][1:]) != df.at[index, 'label'] else 0
    return sum / len(df.index)

rang = list(range(1, 11))
cancer_knn_loos = [loo(cancer_df, kNN(k)) for k in rang]
smap_knn_loos = [loo(spam_df, kNN(k)) for k in rang]

In [73]:
cancer_knn_plot = go.Scatter(
    y = cancer_knn_loos,
    x = rang,
    mode = 'lines+markers',
    name = 'lines+markers'
)

plotly.offline.iplot([cancer_knn_plot], filename='line-mode')

In [75]:
spam_knn_plot = go.Scatter(
    y = smap_knn_loos,
    x = rang,
    mode = 'lines+markers',
    name = 'lines+markers'
)

plotly.offline.iplot([spam_knn_plot], filename='line-mode')

In [79]:
scaler = MinMaxScaler()

normalized_cancer_df = pd.DataFrame(scaler.fit_transform(cancer_df), columns=cancer_df.columns)
normalized_spam_df   = pd.DataFrame(scaler.fit_transform(spam_df), columns=spam_df.columns)

norm_cancer_knn_loos = [loo(normalized_cancer_df, kNN(k)) for k in rang]
norm_smap_knn_loos = [loo(normalized_spam_df, kNN(k)) for k in rang]


Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.


Data with input dtype int64, float64 were all converted to float64 by MinMaxScaler.



In [80]:
norm_cancer_knn_plot = go.Scatter(
    y = norm_cancer_knn_loos,
    x = rang,
    mode = 'lines+markers',
    name = 'lines+markers'
)

plotly.offline.iplot([norm_cancer_knn_plot], filename='line-mode')

In [83]:
norm_spam_knn_plot = go.Scatter(
    y = norm_smap_knn_loos,
    x = rang,
    mode = 'lines+markers',
    name = 'lines+markers'
)

plotly.offline.iplot([norm_spam_knn_plot], filename='line-mode')

In [84]:
blobs_df = pd.read_csv('blobs.csv', sep=',', header=0)

In [158]:
def closest_to(centers, point):
    cluster = -1
    mi = 0
    k = len(centers)
    for j in range(k):
        distance = np.linalg.norm(point - centers[j])
        if cluster == -1 or mi > distance:
            mi = distance
            cluster = j
    assert cluster != -1
    return cluster
        

def kMeans(df, k, iters=1000):
    dim = len(df.columns)
    maxvals = np.array([df[col].max() for col in df.columns])
    minvals = np.array([df[col].min() for col in df.columns])
    centers = [np.random.uniform(minvals, maxvals, dim) for _ in range(k)]
    clusters = None
    clusters_cnt = None
    for _ in range(iters):
        clusters = [np.zeros(dim) for _ in range(k)]
        clusters_cnt = [0] * k
        for i in range(len(df.index)):
            cluster = closest_to(centers, np.array(df.iloc[i]))
            clusters[cluster] += np.array(df.iloc[i])
            clusters_cnt[cluster] += 1
        for i in range(k):
            if clusters_cnt[i] == 0:
                centers[i] = np.random.uniform(minvals, maxvals, dim)
                continue
            centers[i] = clusters[i] / clusters_cnt[i]
    return centers

centers = []
for clusters in range(2, 6):
    centers.append(kMeans(blobs_df, clusters, iters=1000))

[[array([-0.94880142, -0.81221688]), array([0.05259619, 0.76408884])], [array([0.76083225, 0.68789107]), array([-0.92132022,  0.82542778]), array([-0.88085452, -0.92275922])], [array([-1.02914579,  0.95728066]), array([0.9459667 , 0.92504565]), array([-0.0858727 , -0.07315934]), array([-1.12007817, -1.11793824])], [array([-0.09641996, -0.11090372]), array([-1.07107738,  0.95697078]), array([-1.12007817, -1.11793824]), array([1.18367001, 0.79811533]), array([0.49005657, 1.04830909])]]


In [159]:
for i, cntrs in enumerate(centers):
    i += 2
    kMeans_clusters = []
    for j in range(i):
        mask = pd.Series([closest_to(cntrs, np.array(blobs_df.iloc[k])) == j for k in range(len(blobs_df.index))])
        kMeans_clusters.append(go.Scatter(
            y = blobs_df[mask]['Y'],
            x = blobs_df[mask]['X'],
            mode = 'markers',
            name = 'cluster' + str(j)
        ))
        
    plotly.offline.iplot(kMeans_clusters, filename='line-mode')

In [355]:
def dbscan(df, eps, minpts):
    vs = []
    clust = [-1] * len(df.index)
    for i in range(len(df.index)):
        if find_k_nearest(df, df.iloc[i], minpts + 1).iloc[minpts] < eps:
            vs.append(i)
    def dfs(u, col):
        clust[u] = col
        for v in vs:
            if clust[v] != -1:
                continue
            if np.linalg.norm(np.array(df.iloc[u]) - np.array(df.iloc[v])) < eps:
                dfs(v, col)
            
    emp = 0      
    for v in vs:
        if clust[v] != -1:
            continue
        dfs(v, emp)
        emp += 1
    return clust

dbscan_clusters = [
    dbscan(blobs_df, 0.5, 40),
    dbscan(blobs_df, 0.5, 43),
    dbscan(blobs_df, 0.25, 10),
    dbscan(blobs_df, 0.2, 10),
]

In [356]:
for cluster in dbscan_clusters:
    dbscan_clusters_plot = []
    for j in range(-1, 6):
        mask = pd.Series([cluster[k] == j for k in range(len(blobs_df.index))])
        dbscan_clusters_plot.append(go.Scatter(
            y = blobs_df[mask]['Y'],
            x = blobs_df[mask]['X'],
            mode = 'markers',
            name = 'cluster' + str(j)
        ))

    plotly.offline.iplot(dbscan_clusters_plot, filename='line-mode')

In [359]:
def agglomerative_clusterisation_step(df, clusters):
    def cluster_dists(df, a, b):
        dist = -1
        for i in a:
            for j in b:
                dist = max(dist, np.linalg.norm(np.array(df.iloc[a]) - np.array(df.iloc[b])))
        return dist
        
    sz = len(clusters)
    a = -1
    b = -1
    dist = -1
    cluster = [[i] for i in range(sz)]
    for i in range(sz):
        for j in range(i + 1, sz):
            new_dist = cluster_dists(df, cluster[i], cluster[j]) 
            if dist == -1 or new_dist < dist:
                a = i; b = j
                dist = new_dist
    assert dist != -1 and a != -1 and b != -1
    new_clusters = []
    for i, clust in enumerate(clusters):
        if i != a and i != b:
            new_clusters.append(clust)
        if i == a:
            new_clusters.append(clust + clusters[b])
    return new_clusters
    
def agglomerative_clusterisation(df, k):
    sz = len(df.index)
    clusters = [[i] for i in range(sz)]
    for _ in range(sz - k):
        clusters = agglomerative_clusterisation_step(df, clusters)

agglomerative_clusters5 = agglomerative_clusterisation(blobs_df, 5)
agglomerative_clusters4 = agglomerative_clusterisation_step(blobs_df, agglomerative_clusters5)
agglomerative_clusters3 = agglomerative_clusterisation_step(blobs_df, agglomerative_clusters4)
agglomerative_clusters2 = agglomerative_clusterisation_step(blobs_df, agglomerative_clusters3)

agglomerative_clusters = [
    agglomerative_clusters2,
    agglomerative_clusters3,
    agglomerative_clusters4,
    agglomerative_clusters5,
]

KeyboardInterrupt: 

In [360]:
for cluster in agglomerative_clusters:
    agglomerative_clusters_plot = []
    for j in range(2, 6):
        mask = pd.Series([k in cluster[j] for k in range(len(blobs_df.index))])
        agglomerative_clusters_plot.append(go.Scatter(
            y = blobs_df[mask]['Y'],
            x = blobs_df[mask]['X'],
            mode = 'markers',
            name = 'cluster' + str(j)
        ))

    plotly.offline.iplot(agglomerative_clusters_plot, filename='line-mode')

NameError: name 'agglomerative_clusters' is not defined

In [370]:
def split_dataset(df, train_frac):
    # Shuffle
    df.sample(frac=1).reset_index(drop=True)
    sz = len(df.index)
    return df[:int(sz * train_frac)], df[int(sz * train_frac):]

spam_train_set, spam_test_set = split_dataset(spam_df, 0.8)
cancer_train_set, cancer_test_set = split_dataset(cancer_df, 0.8)

In [372]:
print("Spam M proportion: in training set: {} ({} out of {}), in validation set: {} ({} out of {})"
      .format(spam_train_set.sum().at['label'] / len(spam_train_set.index), spam_train_set.sum().at['label'], len(spam_train_set.index), 
              spam_test_set.sum().at['label'] / len(spam_test_set.index), spam_test_set.sum().at['label'], len(spam_test_set.index)))


print("Cancer 1 proportion: in training set: {} ({} out of {}), in validation set: {} ({} out of {})"
      .format(cancer_train_set.sum().at['label'] / len(cancer_train_set.index), cancer_train_set.sum().at['label'], len(cancer_train_set.index), 
              cancer_test_set.sum().at['label'] / len(cancer_test_set.index), cancer_test_set.sum().at['label'], len(cancer_test_set.index)))


Spam M proportion: in training set: 0.49266304347826084 (1813.0 out of 3680), in validation set: 0.0 (0.0 out of 921)
Cancer 1 proportion: in training set: 0.4087912087912088 (186.0 out of 455), in validation set: 0.22807017543859648 (26.0 out of 114)


In [None]:
def roc_features(df):
    for column in df.drop(columns=['label']):
        borders = sorted(df[column].tolist())
        for border in borders:
            df.query("{}<{}".format(column, border))['label'].sum().at['label']