In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### How can I identify similar Marathon performances?

In [2]:
df = pd.read_csv('results.csv')

In [3]:
# Data Cleaning

# Drop ctz and state column
df = df.drop(['ctz', 'state'], 1)

# Drop row in city where there's a null value
df = df.dropna(subset=['city'])

# Replace values that represent null
df = df.replace('-', np.nan)

# Convert types to numeric
df[['10k', '25k', '35k', '30k', '5k', 'half', '20k', '40k']] = df[['10k', '25k', '35k', '30k', '5k', 'half', '20k', '40k']].apply(pd.to_numeric)

# Impute null with means
df = df.fillna(df.mean())

# Create features
features = df.drop(['city', 'division', 'name','bib', 'overall', 'country', 'genderdiv'], 1)

# Make categorical variables numeric using label encoder
labelencoder = LabelEncoder()
labelencoder.fit(features['gender'])
features['gender'] = labelencoder.transform(features['gender'])

In [4]:
# Break into a set of features and a variable for the known outcome.
X = features.iloc[:, :11]
y = features.iloc[:, 11]

# Normalize features
X_norm = normalize(X)

In [5]:
# Data frame to store features and predicted cluster memberships.
ypred = pd.DataFrame()

# Create PCAS
X_pca = PCA(2).fit_transform(X)

# Split the data into four equally-sized samples. First we break it in half:
X_half1, X_half2, X_pcahalf1, X_pcahalf2 = train_test_split(
    X_norm,
    X_pca,
    test_size=0.5,
    random_state=42)

# Then we halve the halves.

X1, X2, X_pca1, X_pca2 = train_test_split(
    X_half1,
    X_pcahalf1,
    test_size=0.5,
    random_state=42)

X3, X4, X_pca3, X_pca4 = train_test_split(
    X_half2,
    X_pcahalf2,
    test_size=0.5,
    random_state=42)

# Pass a list of tuples and a counter that increments each time we go
# through the loop. The tuples are the data to be used by k-means,
# and the PCA-derived features for graphing. We use k-means to fit a
# model to the data, then store the predicted values and the two-feature
# PCA solution in the data frame.
for counter, data in enumerate([
    (X1, X_pca1),
    (X2, X_pca2),
    (X3, X_pca3),
    (X4, X_pca4)]):
    
    # Put the features into ypred.
    ypred['pca_f1' + '_sample' + str(counter)] = data[1][:, 0]
    ypred['pca_f2' + '_sample' + str(counter)] = data[1][:, 1]
    
    # Generate cluster predictions and store them for clusters 2 to 4.
    for nclust in range(2, 5):
        pred = KMeans(n_clusters=nclust, random_state=42).fit_predict(data[0])
        ypred['clust' + str(nclust) + '_sample' + str(counter)] = pred

# For each  number of clusters, plot the clusters using the
# pca features for each sample.
for cluster in range(2, 5):
    
    # Make a grid of subplots.
    f, axarr = plt.subplots(2, 2)
    
    # Make a plot for each sample.
    for i in range(4):
        
        # PCA-created features.
        x_sub = ypred['pca_f1_sample{}'.format(i)]
        y_sub = ypred['pca_f2_sample{}'.format(i)]
        
        # Cluster assignments.
        c = ypred['clust{}_sample{}'.format(cluster, i)]
        
        # Assign the subplot to its place on the grid.
        rows = int(np.floor(i / 2))
        cols = i % 2
        axarr[rows, cols].scatter(x_sub, y_sub, c=c)
        axarr[rows, cols].set_title('sample {}'.format(i))
        axarr[rows, cols].set_xlim([-.3, .3])
        axarr[rows, cols].set_ylim([-.3, .3])
    
    # Space out the plots so that the headings don't overlap axis values.
    plt.suptitle('{} Clusters'.format(cluster), fontsize=20)
    plt.tight_layout()
    plt.show()
    print('\n')


ValueError: Length of values does not match length of index

In [None]:
#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
#x_train = x_train.values
#x_train

In [None]:
# Instantiate KMeans Cluster Algorithm
prediction = KMeans(n_clusters=3, random_state=42).fit_predict(X_pca)

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(X_pca[:,0], X_pca[:,1], c = prediction)
plt.show()

In [None]:
from sklearn.cluster import MeanShift, estimate_bandwidth

# Here we set the bandwidth. This function automatically derives a bandwidth
# number based on an inspection of the distances among points in the data.
bandwidth = estimate_bandwidth(X_pca, quantile=0.2, n_samples=500)

# Declare and fit the model.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_pca)

# Extract cluster assignments for each data point.
labels = ms.labels_

# Coordinates of the cluster centers.
cluster_centers = ms.cluster_centers_

# Count our clusters.
n_clusters_ = len(np.unique(labels))

print("Number of estimated clusters: {}".format(n_clusters_))

In [None]:
plt.figure(figsize=(8,8))
plt.scatter(X_pca[:,0], X_pca[:,1], c = labels)
plt.show()

Optimize clusters
    - Use elbow method to find optimum K value for k-means cluster algorithm
    
Evaluate the clusters
    - Compare K-means algorithm with means-shift algorithm by some metric

Explore the meaning behind the ismiliraity of clusters, annotating obervations
    - Identify Labels
    
Creat visualizations of similarities and variable relationship
    - Watch some youtube videos about visualisations to help guide your data
    
Describe findings in a report
    - Write up report