# Utilities

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so try the new name first and fall
# back to the legacy one on older installations.
try:
    plt.style.use('seaborn-v0_8-whitegrid')
except OSError:
    plt.style.use('seaborn-whitegrid')
import pandas as pd
import numpy as np
from collections import defaultdict
import json
import random

# Some useful utilities

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one Laplace-distributed noise sample.

    The noise is centred at 0 and its scale is `sensitivity / epsilon`.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one zero-mean Gaussian noise sample.

    The standard deviation is
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(v, sensitivity, epsilon, delta):
    """Return `v` plus an independent zero-mean Gaussian sample per element.

    Every element uses the same standard deviation,
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`; `len(v)` samples
    are drawn.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma, size=len(v))
    return v + noise

def pct_error(orig, priv):
    """Return the absolute error of `priv` relative to `orig`, as a percentage."""
    absolute_error = np.abs(orig - priv)
    relative_error = absolute_error / orig
    return relative_error * 100.0

def z_clip(xs, b):
    """Return a list in which every element of `xs` larger than `b` is replaced by `b`.

    Only the upper end is clipped; values at or below `b` pass through unchanged.
    """
    clipped = []
    for x in xs:
        clipped.append(b if b < x else x)
    return clipped

def g_clip(v, b=5):
    """Clip vector `v` so that its L2 norm is at most `b`.

    Vectors whose norm is already at most `b` are returned unchanged; longer
    vectors are rescaled to have norm exactly `b`, keeping their direction.

    Parameters:
        v: array-like vector.
        b: clipping bound on the L2 norm (default 5, the threshold that was
           previously hard-coded).
    """
    n = np.linalg.norm(v, ord=2)
    if n > b:
        # Scale to norm b. Dividing by n alone would shrink the vector to norm 1,
        # making a vector just over the bound far shorter than one just under it.
        return b * (v / n)
    return v
    
from sklearn.linear_model import LogisticRegression

In [2]:
# Prediction: take a model (theta) and a single example (xi) and return its predicted label
def predict(theta, xi):
    """Return the sign of the dot product of example `xi` with model `theta`."""
    return np.sign(xi @ theta)

# The loss function measures how good our model is. The training goal is to minimize the loss.
# This is the average logistic loss function.
def loss(theta, x, y, lambda_param=0):
    """Average logistic loss of model `theta` on (x, y), plus an L2 penalty.

    Parameters:
        theta: weight vector.
        x: 2-D array of examples, one per row.
        y: vector of labels, one per row of `x`.
        lambda_param: weight of the penalty `(lambda_param / 2) * sum(theta**2)`
            (default 0, i.e. no penalty).

    Returns:
        `mean(log(1 + exp(-y * (x @ theta))))` plus the penalty term.
    """
    exponent = - y * (x.dot(theta))
    regularization = (lambda_param/2) * np.sum(theta*theta)
    # logaddexp(0, e) equals log(1 + exp(e)) but does not overflow for large e.
    data_loss = np.sum(np.logaddexp(0, exponent)) / x.shape[0]
    # The penalty was previously computed but dropped; adding it keeps the
    # objective consistent with gradient(), which includes lambda_param * theta.
    return data_loss + regularization

# This is the average gradient of the logistic loss
# The gradient is a vector that indicates in which direction the loss function is increasing fastest
def gradient(theta, x, y, lambda_param=0):
    """Average gradient of the logistic loss at `theta`, plus `lambda_param * theta`.

    `x` holds one example per row and `y` the matching labels.
    """
    n_examples = x.shape[0]
    margins = y * (x.dot(theta))
    # Per-example weight y / (1 + exp(margin)).
    weights = y / (1+np.exp(margins))
    data_term = - (np.transpose(x) @ weights) / n_examples
    penalty_term = lambda_param * theta
    return data_term + penalty_term

# Our measure of accuracy is the fraction of examples in (X, y) whose predicted label matches the true label
def accuracy(theta, X, y):
    """Return the fraction of rows of `X` whose predicted label equals the entry in `y`."""
    predicted = [predict(theta, xi) for xi in X]
    n_correct = np.sum(predicted == y)
    return n_correct / y.shape[0]

# This is gradient descent with a *learning rate* "eta"
def gradient_descent(X_train, y_train, iterations, status = False, eta = 1.0, lambda_param = 0.001):
    """Fit a weight vector by plain gradient descent, starting from zeros.

    Parameters:
        X_train: 2-D array of training examples, one per row.
        y_train: vector of training labels.
        iterations: number of gradient steps to take.
        status: when True, print the loss roughly five times during training.
        eta: learning rate (default 1.0, the value previously hard-coded).
        lambda_param: regularization weight passed to gradient() and loss()
            (default 0.001, the value previously hard-coded).

    Returns:
        The weight vector after `iterations` steps.
    """
    theta = np.zeros(X_train.shape[1])
    # Floor the reporting interval at 1: with fewer than five iterations the
    # old int(iterations / 5) was 0 and the modulo raised ZeroDivisionError.
    report_every = max(1, iterations // 5)

    for i in range(iterations):
        theta = theta - eta * gradient(theta, X_train, y_train, lambda_param)
        if status and i % report_every == 0:
            print(f"Iteration {i}: loss = {loss(theta, X_train, y_train, lambda_param)}")

    return theta

# Load Data

In [3]:
# Read the call records from "fire-data.csv" in the working directory and
# preview the first rows.
fire_data = pd.read_csv("fire-data.csv")
fire_data.head()

Unnamed: 0,ALS Unit,Final Priority,Call Type Group,Original Priority,Priority,City,Unit Type,Fire Prevention District,Battalion,Supervisor District,...,Hospital DtTm,Location - Lng,Number of Alarms,Available DtTm,Unit sequence in call dispatch,Location - Lat,Call Date,Unit ID,Box,Address
0,0,0,0,0,0,0,0,0,0,0,...,,-122.426379,1,1451604000.0,2,37.776688,1451602800,432,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1451606000.0,-122.426379,1,1451607000.0,1,37.776688,1451602800,418,0,0
2,1,0,1,0,0,0,1,1,1,1,...,,-122.394748,1,1451603000.0,1,37.79448,1451602800,245,1,1
3,1,1,2,1,1,0,2,2,2,2,...,,-122.409572,1,1451604000.0,1,37.747553,1451602800,87,2,2
4,0,1,3,1,1,0,3,3,3,3,...,,-122.488371,1,1451604000.0,3,37.73115,1451602800,97,3,3


In [4]:
# Load the per-column schema. Later cells read specs[col]['type'] ('enum',
# 'integer' or 'float') along with 'count' for enums and 'min'/'max' for
# numeric columns.
with open('fire-data-specs.json') as data_file:    
    specs = json.load(data_file)

specs

{'Unit ID': {'type': 'enum', 'count': 742},
 'Call Type': {'type': 'enum', 'count': 32},
 'Call Date': {'type': 'integer',
  'optional': False,
  'min': 955490400,
  'max': 1539122400},
 'Watch Date': {'type': 'integer',
  'optional': False,
  'min': 1451516400,
  'max': 1483225200},
 'Received DtTm': {'type': 'integer',
  'optional': False,
  'min': 1451602843,
  'max': 1483311589},
 'Entry DtTm': {'type': 'integer',
  'optional': False,
  'min': 1451602962,
  'max': 1483311743},
 'Dispatch DtTm': {'type': 'integer',
  'optional': False,
  'min': 1451603006,
  'max': 1483311791},
 'Response DtTm': {'type': 'integer',
  'optional': True,
  'min': 1451603122,
  'max': 1483311883},
 'On Scene DtTm': {'type': 'integer',
  'optional': True,
  'min': 1451603285,
  'max': 1483311990},
 'Transport DtTm': {'type': 'integer',
  'optional': True,
  'min': 1451604148,
  'max': 1483312891},
 'Hospital DtTm': {'type': 'integer',
  'optional': True,
  'min': 1451604963,
  'max': 1483314738},
 'Call 

In [5]:
from IPython.display import HTML
# Drop every column that contains at least one missing value, then render the
# first rows as an HTML table so all remaining columns are shown.
nan_removed = fire_data.dropna(axis='columns')

HTML(nan_removed.head().to_html())

Unnamed: 0,ALS Unit,Final Priority,Call Type Group,Original Priority,Priority,City,Unit Type,Fire Prevention District,Battalion,Supervisor District,Call Final Disposition,Zipcode of Incident,Call Type,Neighborhooods - Analysis Boundaries,Station Area,Watch Date,Received DtTm,Entry DtTm,Dispatch DtTm,Location - Lng,Number of Alarms,Unit sequence in call dispatch,Location - Lat,Call Date,Unit ID,Box,Address
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1451516400,1451602843,1451602962,1451603006,-122.426379,1,2,37.776688,1451602800,432,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1451516400,1451602843,1451602962,1451603410,-122.426379,1,1,37.776688,1451602800,418,0,0
2,1,0,1,0,0,0,1,1,1,1,1,1,0,1,1,1451516400,1451603027,1451603027,1451603129,-122.394748,1,1,37.79448,1451602800,245,1,1
3,1,1,2,1,1,0,2,2,2,2,2,2,8,2,2,1451516400,1451602977,1451603039,1451603051,-122.409572,1,1,37.747553,1451602800,87,2,2
4,0,1,3,1,1,0,3,3,3,3,3,3,1,3,3,1451516400,1451602982,1451603075,1451603080,-122.488371,1,3,37.73115,1451602800,97,3,3


# Marginals

In [6]:
# Print the schema entry for each column, in the DataFrame's column order.
for column in fire_data:
    print(f"{column}: \t{specs[column]}")

ALS Unit: 	{'type': 'enum', 'count': 2}
Final Priority: 	{'type': 'enum', 'count': 2}
Call Type Group: 	{'type': 'enum', 'count': 5}
Original Priority: 	{'type': 'enum', 'count': 8}
Priority: 	{'type': 'enum', 'count': 8}
City: 	{'type': 'enum', 'count': 9}
Unit Type: 	{'type': 'enum', 'count': 10}
Fire Prevention District: 	{'type': 'enum', 'count': 11}
Battalion: 	{'type': 'enum', 'count': 11}
Supervisor District: 	{'type': 'enum', 'count': 12}
Call Final Disposition: 	{'type': 'enum', 'count': 15}
Zipcode of Incident: 	{'type': 'enum', 'count': 28}
Call Type: 	{'type': 'enum', 'count': 32}
Neighborhooods - Analysis Boundaries: 	{'type': 'enum', 'count': 42}
Station Area: 	{'type': 'enum', 'count': 46}
Watch Date: 	{'type': 'integer', 'optional': False, 'min': 1451516400, 'max': 1483225200}
Received DtTm: 	{'type': 'integer', 'optional': False, 'min': 1451602843, 'max': 1483311589}
Entry DtTm: 	{'type': 'integer', 'optional': False, 'min': 1451602962, 'max': 1483311743}
Dispatch DtTm

In [7]:
# Exact (non-private) histogram of the 'Priority' column.
fire_data['Priority'].value_counts()

1    202016
0     88995
2     13213
4       798
3        82
5        23
6         4
7         2
Name: Priority, dtype: int64

In [8]:
# Add Laplace noise (sensitivity 1, epsilon 1) to every bucket of the
# histogram. Clamp at 0: a small count can otherwise turn negative, and the
# probabilities derived from it in the following cells would then be rejected
# by np.random.choice. The cells that build `marginals` clamp the same way.
dp_counts = fire_data['Priority'].value_counts().map(lambda x: max(laplace_mech(x, 1, 1), 0))
dp_counts

1    202016.781648
0     88993.124462
2     13209.238078
4       798.925432
3        84.150111
5        20.198234
6         4.399958
7         3.628825
Name: Priority, dtype: float64

In [9]:
# Normalize the noisy counts so they sum to 1 and can be used as sampling
# probabilities.
probs = dp_counts / np.sum(dp_counts)
probs

1    0.662067
0    0.291656
2    0.043290
4    0.002618
3    0.000276
5    0.000066
6    0.000014
7    0.000012
Name: Priority, dtype: float64

In [10]:
# Draw 10 synthetic 'Priority' values from the noisy distribution.
[np.random.choice(list(probs.index), p=list(probs)) for i in range(10)]

[1, 0, 1, 2, 1, 0, 1, 1, 1, 0]

In [11]:
# Categorical columns are the ones the schema tags as 'enum'.
enum_cols = [col for col in fire_data if specs[col]['type'] == 'enum']

display(enum_cols)

# For each categorical column: noisy histogram (Laplace, sensitivity 1,
# epsilon 1, clamped at 0), normalized into sampling probabilities.
marginals = {}
for col in enum_cols:
    dp_counts = fire_data[col].value_counts().map(
        lambda x: max(laplace_mech(x, 1, 1), 0)
    )
    probs = dp_counts / np.sum(dp_counts)
    marginals[col] = probs

marginals

['ALS Unit',
 'Final Priority',
 'Call Type Group',
 'Original Priority',
 'Priority',
 'City',
 'Unit Type',
 'Fire Prevention District',
 'Battalion',
 'Supervisor District',
 'Call Final Disposition',
 'Zipcode of Incident',
 'Call Type',
 'Neighborhooods - Analysis Boundaries',
 'Station Area',
 'Unit ID',
 'Box',
 'Address']

{'ALS Unit': 1    0.642531
 0    0.357469
 Name: ALS Unit, dtype: float64, 'Final Priority': 1    0.705449
 0    0.294551
 Name: Final Priority, dtype: float64, 'Call Type Group': 0    0.492488
 1    0.240238
 3    0.234771
 2    0.031300
 4    0.001204
 Name: Call Type Group, dtype: float64, 'Original Priority': 1    0.649585
 0    0.323234
 3    0.020485
 4    0.005894
 5    0.000327
 2    0.000305
 6    0.000160
 7    0.000010
 Name: Original Priority, dtype: float64, 'Priority': 1    0.662058
 0    0.291662
 2    0.043303
 4    0.002614
 3    0.000265
 5    0.000077
 6    0.000014
 7    0.000006
 Name: Priority, dtype: float64, 'City': 0    0.985741
 2    0.005164
 1    0.004371
 3    0.002534
 7    0.000735
 5    0.000609
 4    0.000579
 6    0.000253
 8    0.000016
 Name: City, dtype: float64, 'Unit Type': 2    0.380049
 1    0.300766
 0    0.105952
 4    0.098267
 3    0.068454
 6    0.025013
 7    0.015282
 5    0.004079
 9    0.001185
 8    0.000952
 Name: Unit Type, dtype: fl

In [12]:
def gen_row():
    """Return one synthetic row: an independent draw per column in `enum_cols`.

    Each value is sampled from that column's entry in the global `marginals`.
    """
    return [
        np.random.choice(list(marginals[col].index), p=list(marginals[col]))
        for col in enum_cols
    ]

# Show how many categorical columns each row has, then generate 1000
# synthetic rows (the whole list is echoed as the cell output).
display(len(enum_cols))
[gen_row() for i in range(1000)]

18

[[1, 1, 0, 1, 1, 0, 4, 5, 0, 0, 3, 10, 6, 7, 11, 333, 744, 3613],
 [0, 0, 1, 1, 1, 0, 1, 6, 0, 2, 0, 1, 6, 16, 41, 176, 358, 582],
 [1, 1, 0, 1, 1, 0, 2, 4, 10, 2, 0, 5, 0, 4, 34, 8, 775, 10730],
 [0, 1, 3, 0, 1, 0, 1, 2, 0, 8, 0, 0, 0, 4, 0, 328, 946, 10888],
 [1, 1, 0, 1, 0, 0, 2, 9, 4, 4, 0, 2, 0, 36, 17, 404, 722, 2134],
 [1, 1, 0, 1, 0, 0, 0, 0, 6, 1, 1, 7, 8, 24, 29, 62, 16, 38],
 [0, 1, 3, 1, 1, 0, 2, 0, 7, 2, 6, 15, 0, 6, 0, 0, 133, 1688],
 [1, 1, 3, 4, 1, 0, 2, 1, 0, 4, 3, 8, 0, 16, 39, 133, 328, 103],
 [0, 1, 0, 1, 0, 0, 2, 5, 10, 5, 2, 10, 1, 6, 7, 32, 1472, 2440],
 [1, 1, 0, 0, 1, 0, 2, 10, 4, 7, 0, 6, 0, 1, 14, 321, 573, 4585],
 [1, 1, 3, 0, 1, 0, 2, 6, 2, 4, 0, 0, 0, 6, 7, 321, 1449, 7701],
 [1, 1, 0, 1, 1, 0, 2, 9, 7, 4, 0, 12, 0, 17, 4, 7, 1506, 11378],
 [0, 1, 1, 1, 0, 0, 2, 2, 10, 4, 5, 16, 12, 1, 43, 53, 878, 501],
 [1, 0, 1, 1, 1, 0, 2, 2, 4, 4, 3, 8, 0, 26, 29, 18, 139, 1667],
 [1, 1, 0, 0, 1, 0, 4, 2, 7, 0, 0, 24, 0, 4, 9, 67, 316, 7750],
 [1, 1, 0, 1, 2, 0, 2, 1,

# Marginals for Floats

In [13]:
# Counting exact float values yields a very long histogram with many
# single-occurrence entries, which motivates binning in the next cell.
fire_data['Location - Lat'].value_counts()

37.786117    1750
37.777624    1368
37.784346     991
37.781146     988
37.785025     932
37.781865     916
37.781119     902
37.784206     877
37.779531     746
37.783386     720
37.786005     696
37.783834     694
37.781537     685
37.765051     642
37.781914     636
37.785081     632
37.784807     631
37.764373     627
37.780348     617
37.784091     611
37.784299     608
37.780479     602
37.757308     600
37.762744     588
37.777712     576
37.783163     569
37.808050     560
37.779944     557
37.783839     542
37.784396     535
             ... 
37.741562       1
37.773683       1
37.775693       1
37.720401       1
37.801223       1
37.767536       1
37.738286       1
37.743698       1
37.755053       1
37.795058       1
37.754935       1
37.756236       1
37.718890       1
37.734455       1
37.787665       1
37.791652       1
37.785383       1
37.768897       1
37.777609       1
37.773457       1
37.732882       1
37.791963       1
37.798937       1
37.786979       1
37.711457 

In [14]:
# Bucket the latitudes into 10 intervals with pd.cut and count rows per interval.
binned_data = pd.cut(fire_data['Location - Lat'], 10)
binned_data.value_counts()

(37.759, 37.783]    118330
(37.783, 37.807]     99529
(37.736, 37.759]     40886
(37.712, 37.736]     36604
(37.807, 37.831]      4923
(37.688, 37.712]      4395
(37.617, 37.641]       366
(37.831, 37.854]       100
(37.664, 37.688]         0
(37.641, 37.664]         0
Name: Location - Lat, dtype: int64

In [15]:
# Noisy per-interval counts (Laplace, sensitivity 1, epsilon 1, clamped at 0),
# then normalized into sampling probabilities over the intervals.
dp_counts = binned_data.value_counts().map(lambda x: max(laplace_mech(x, 1, 1),0))
display(dp_counts)
probs = dp_counts / np.sum(dp_counts)
display(probs)

# pick an interval and then uniform within it
def gen_float(probs):
    """Sample a float: choose an interval by probability, then a uniform point in it.

    `probs` is indexed by interval objects exposing `.left` and `.right`; its
    values are the probabilities of the intervals.
    """
    candidates = list(probs.index)
    weights = list(probs)
    chosen = np.random.choice(candidates, p=weights)
    low, high = chosen.left, chosen.right
    return np.random.uniform(low, high)

# Draw 10 synthetic latitudes from the noisy binned distribution.
[gen_float(probs) for i in range(10)]

(37.759, 37.783]    118327.231290
(37.783, 37.807]     99529.805019
(37.736, 37.759]     40885.836285
(37.712, 37.736]     36603.586100
(37.807, 37.831]      4922.418523
(37.688, 37.712]      4394.466171
(37.617, 37.641]       366.266056
(37.831, 37.854]        99.413400
(37.664, 37.688]         0.198799
(37.641, 37.664]         0.573251
Name: Location - Lat, dtype: float64

(37.759, 37.783]    3.877931e-01
(37.783, 37.807]    3.261884e-01
(37.736, 37.759]    1.339949e-01
(37.712, 37.736]    1.199607e-01
(37.807, 37.831]    1.613221e-02
(37.688, 37.712]    1.440196e-02
(37.617, 37.641]    1.200361e-03
(37.831, 37.854]    3.258069e-04
(37.664, 37.688]    6.515231e-07
(37.641, 37.664]    1.878713e-06
Name: Location - Lat, dtype: float64

[37.75570149844197,
 37.75434550876283,
 37.797101486910634,
 37.77865659578286,
 37.77978695173382,
 37.770240785086884,
 37.77204426098983,
 37.81160535237951,
 37.71777875418119,
 37.752253810602966]

In [16]:
def gen_marginals(epsilon=1, bins=20):
    """Build a noisy one-way marginal for every column of `fire_data`.

    Enum columns are histogrammed by value; float and integer columns are first
    bucketed into `bins` intervals with pd.cut. Each count gets Laplace noise
    (sensitivity 1, privacy parameter `epsilon`) and is clamped at 0; the counts
    are then normalized into probabilities.

    Parameters:
        epsilon: privacy parameter spent on each column (default 1, the value
            previously hard-coded).
        bins: number of intervals for numeric columns (default 20, the value
            previously hard-coded).

    Returns:
        dict mapping column name to a Series of probabilities, indexed by value
        (enum) or by interval (numeric). Columns of any other type are skipped.
    """
    marginals = {}

    for col in fire_data:
        col_type = specs[col]['type']
        if col_type == 'enum':
            counts = fire_data[col].value_counts()
        elif col_type == 'float' or col_type == 'integer':
            # NOTE(review): pd.cut takes the bin edges from the data itself
            # rather than from specs[col]['min'/'max'] - confirm this is
            # acceptable for the intended privacy guarantee.
            counts = pd.cut(fire_data[col], bins).value_counts()
        else:
            continue

        dp_counts = counts.map(lambda x: max(laplace_mech(x, 1, epsilon), 0))
        total = np.sum(dp_counts)
        if total > 0:
            marginals[col] = dp_counts / total
        else:
            # Every noisy count was clamped to 0; dividing would give NaNs that
            # np.random.choice rejects, so fall back to a uniform distribution.
            marginals[col] = pd.Series(1.0 / max(len(dp_counts), 1),
                                       index=dp_counts.index, dtype=float)

    return marginals

# Privacy cost: each column's histogram is released with epsilon = 1, so by
# sequential composition the total cost is len(cols) * epsilon.
marginals = gen_marginals()

In [17]:
def gen_row(marginals):
    """Return one synthetic row with an independently sampled value per column.

    Enum columns are sampled directly from their marginal. Float and integer
    columns first sample an interval from the marginal, then a uniform float or
    a random integer between the interval's edges. Columns of any other type
    contribute nothing to the row.
    """
    row = []
    for col in fire_data:
        probs = marginals[col]
        kind = specs[col]['type']
        if kind == 'enum':
            row.append(np.random.choice(list(probs.index), p=list(probs)))
        elif kind in ('float', 'integer'):
            interval = np.random.choice(list(probs.index), p=list(probs))
            low, high = interval.left, interval.right
            if kind == 'float':
                row.append(np.random.uniform(low, high))
            else:
                row.append(np.random.randint(low, high))
    return row

# Generate two full synthetic rows from the noisy marginals.
[gen_row(marginals) for i in range(2)]

[[1,
  0,
  0,
  1,
  1,
  0,
  1,
  0,
  4,
  3,
  4,
  4,
  0,
  6,
  16,
  1476772075,
  1451659438,
  1457759540,
  1460134660,
  1459175664,
  1467266510,
  1457294180,
  1481336009,
  -122.43604717402086,
  0,
  1470406528,
  2,
  37.762234403446314,
  1474150365,
  454,
  546,
  4103],
 [1,
  1,
  3,
  1,
  1,
  0,
  1,
  1,
  0,
  1,
  0,
  2,
  1,
  37,
  17,
  1456926031,
  1480135058,
  1478911475,
  1468631521,
  1457334176,
  1475446601,
  1478920129,
  1479850453,
  -122.39187273101462,
  0,
  1469139230,
  2,
  37.78812655931625,
  1464785387,
  43,
  425,
  370]]

# Classification

In [18]:
# Print the schema entry for each column that survived the NaN filter.
for column in nan_removed:
    print(f"{column}: \t{specs[column]}")

ALS Unit: 	{'type': 'enum', 'count': 2}
Final Priority: 	{'type': 'enum', 'count': 2}
Call Type Group: 	{'type': 'enum', 'count': 5}
Original Priority: 	{'type': 'enum', 'count': 8}
Priority: 	{'type': 'enum', 'count': 8}
City: 	{'type': 'enum', 'count': 9}
Unit Type: 	{'type': 'enum', 'count': 10}
Fire Prevention District: 	{'type': 'enum', 'count': 11}
Battalion: 	{'type': 'enum', 'count': 11}
Supervisor District: 	{'type': 'enum', 'count': 12}
Call Final Disposition: 	{'type': 'enum', 'count': 15}
Zipcode of Incident: 	{'type': 'enum', 'count': 28}
Call Type: 	{'type': 'enum', 'count': 32}
Neighborhooods - Analysis Boundaries: 	{'type': 'enum', 'count': 42}
Station Area: 	{'type': 'enum', 'count': 46}
Watch Date: 	{'type': 'integer', 'optional': False, 'min': 1451516400, 'max': 1483225200}
Received DtTm: 	{'type': 'integer', 'optional': False, 'min': 1451602843, 'max': 1483311589}
Entry DtTm: 	{'type': 'integer', 'optional': False, 'min': 1451602962, 'max': 1483311743}
Dispatch DtTm

In [19]:
# Feature columns used by the classifiers below. A wider alternative feature
# set is kept here, commented out:
#training_cols = ['Priority', 'Location - Lng', 'Number of Alarms', 'Unit sequence in call dispatch',
#                'Location - Lat', 'Call Date', 'Unit ID', 'Box']
training_cols = ['Priority', 'Number of Alarms']

def gen_row():
    """Return one row sampled uniformly from the schema ranges of `training_cols`.

    Only `specs` is consulted, never the data: enums draw an integer in
    [0, count - 1], floats a uniform value in [min, max], and integers an
    integer in [min, max]. Columns of any other type contribute nothing.
    This definition replaces the earlier `gen_row` functions in the notebook.
    """
    row = []
    for col in training_cols:
        kind = specs[col]['type']

        if kind == 'enum':
            value = random.randint(0, specs[col]['count']-1)
        elif kind == 'float':
            value = random.uniform(specs[col]['min'], specs[col]['max'])
        elif kind == 'integer':
            value = random.randint(specs[col]['min'], specs[col]['max'])
        else:
            continue
        row.append(value)

    return row

# Preview 10 uniformly sampled feature rows as an array.
np.array([gen_row() for i in range(10)])

array([[5, 5],
       [2, 3],
       [4, 3],
       [2, 5],
       [5, 3],
       [5, 1],
       [0, 3],
       [0, 1],
       [7, 1],
       [5, 3]])

In [20]:
# Convert the NaN-free frame to a plain array and split it by position:
# the first 80% of rows for training, the remainder for testing.
X = nan_removed.values
#display(nan_removed.head())

training_size = int(X.shape[0] * 0.8)

X_train = X[:training_size]
X_test = X[training_size:]
# Spot-check one value of the first training row (column index 19).
display(X_train[0][19])

-122.426379489125

In [21]:
def restrict_data(cols, label_col):
    """Return `(X, y)` arrays taken from the global `nan_removed` frame.

    `y` is the `label_col` column and `X` holds the columns listed in `cols`,
    in that order.
    """
    selected = nan_removed[[label_col]+cols].values
    labels = selected[:,0]
    features = selected[:,1:]
    return features, labels

# Features are `training_cols`; the label to predict is 'Final Priority'.
X, y = restrict_data(training_cols, 'Final Priority')
print (X[0])

def split_data(X, y):
    """Split `X` and `y` by position at the global `training_size`.

    Returns `(X_train, X_test, y_train, y_test)`: the first `training_size`
    entries form the training part, the rest the test part.
    """
    cut = training_size
    return X[:cut], X[cut:], y[:cut], y[cut:]

# Positional train/test split of the restricted features and labels.
X_train, X_test, y_train, y_test = split_data(X, y)

[0 1]


In [22]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the real training data.
model = LogisticRegression().fit(X_train,y_train)
# Compare the mean predicted label on the test set with the mean training label.
display(np.mean(model.predict(X_test)))
display(np.mean(y_train))

# Fraction of test rows whose predicted label matches the true label.
print(f"Accuracy: {np.sum(model.predict(X_test) == y_test) / X_test.shape[0]}")

0.71094761335146739

0.70484953258010863

Accuracy: 0.9969030101430514


In [30]:
# Build a synthetic training set: 100000 uniformly sampled feature rows,
# labelled by the model that was trained on the real data.
# NOTE(review): the recorded execution count (In [30]) is out of sequence and
# the recorded mean label (about 22.4) is not a plausible mean of 0/1 labels,
# so this output was presumably produced after `model` and `training_cols` were
# rebound by the later 'Unit ID' cell. Re-run the notebook top to bottom to confirm.
synth_X = np.array([gen_row() for i in range(100000)])
synth_y = model.predict(synth_X)
np.mean(synth_y)

22.39687

In [24]:
# Train a second classifier on the synthetic rows and their model-assigned labels only.
synth_model = LogisticRegression().fit(synth_X, synth_y)

# use the model trained on SYNTHETIC data to predict on the REAL TEST SET
print(f"Accuracy: {np.sum(synth_model.predict(X_test) == y_test) / X_test.shape[0]}")

Accuracy: 0.9969030101430514


In [25]:
# Try predicting 'Unit ID' from longitude and latitude.
# Note: this cell rebinds the globals training_cols, X, y, the train/test
# splits and `model`, so cells that use them behave differently if run afterwards.
training_cols = ['Location - Lng', 'Location - Lat']
X, y = restrict_data(training_cols, 'Unit ID')
X_train, X_test, y_train, y_test = split_data(X, y)
print("Done with data, fitting model...")
model = LogisticRegression().fit(X_train,y_train)
print(f"Accuracy: {np.sum(model.predict(X_test) == y_test) / X_test.shape[0]}")

Done with data, fitting model...
Accuracy: 0.04284988611598145


# Clustering

In [26]:
# Rebuild the NaN-free frame and the positional 80/20 split for clustering.
# These repeat statements from earlier cells and reset X, X_train and X_test
# to all remaining columns.
nan_removed = fire_data.dropna(axis='columns')
X = nan_removed.values
display(nan_removed.head())

training_size = int(X.shape[0] * 0.8)

X_train = X[:training_size]
X_test = X[training_size:]

Unnamed: 0,ALS Unit,Final Priority,Call Type Group,Original Priority,Priority,City,Unit Type,Fire Prevention District,Battalion,Supervisor District,...,Entry DtTm,Dispatch DtTm,Location - Lng,Number of Alarms,Unit sequence in call dispatch,Location - Lat,Call Date,Unit ID,Box,Address
0,0,0,0,0,0,0,0,0,0,0,...,1451602962,1451603006,-122.426379,1,2,37.776688,1451602800,432,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1451602962,1451603410,-122.426379,1,1,37.776688,1451602800,418,0,0
2,1,0,1,0,0,0,1,1,1,1,...,1451603027,1451603129,-122.394748,1,1,37.79448,1451602800,245,1,1
3,1,1,2,1,1,0,2,2,2,2,...,1451603039,1451603051,-122.409572,1,1,37.747553,1451602800,87,2,2
4,0,1,3,1,1,0,3,3,3,3,...,1451603075,1451603080,-122.488371,1,3,37.73115,1451602800,97,3,3


In [27]:
from sklearn.cluster import KMeans

# Fit k-means with scikit-learn's default settings (the recorded repr shows
# n_clusters=8) on the raw training rows.
# NOTE(review): the features are not rescaled, so the large timestamp columns
# presumably dominate the distances - confirm whether that is intended.
estimator = KMeans()
estimator.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [28]:
centers = estimator.cluster_centers_
# Cluster index assigned to every training row.
picked = estimator.predict(X_train)
# Mean Euclidean distance from each row to the centre of its assigned cluster.
# Computed as one row-wise norm instead of a Python loop whose loop variable
# reused the name `picked`.
avg_dist = np.mean(np.linalg.norm(centers[picked] - X_train, ord=2, axis=1))
print(f"Average distance for training set: {avg_dist}")

Average distance for training set: 1800584.495211353


In [29]:
# Cluster index assigned to every test row.
picked = estimator.predict(X_test)
# Mean Euclidean distance from each row to the centre of its assigned cluster.
# Computed as one row-wise norm instead of a Python loop whose loop variable
# reused the name `picked`.
avg_dist = np.mean(np.linalg.norm(centers[picked] - X_test, ord=2, axis=1))
print(f"Average distance for test set: {avg_dist}")

Average distance for test set: 10564109.425820721
