# Loading data (same as first week)

In [1]:
import pandas as pd
import numpy as np
# Read the possum measurements from a CSV file; the relative path is resolved
# against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='possum.csv')
# A bare name on the last line makes the notebook render the frame as output.
df

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,3,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,4,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,5,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0


In [2]:
from sklearn.metrics.pairwise import euclidean_distances
# Inspect what euclidean_distances returns for the two measurement columns:
# the cell output is the type of the result, not the distances themselves.
type(euclidean_distances(X=df.loc[:, ['skullw', 'totlngth']]))

numpy.ndarray

In [3]:
# Pairwise Euclidean distances computed from skull width and total length only.
# Both axes are labelled with the 'case' values, so a distance can be looked up
# by case number instead of by row position.
distanceMatrix = pd.DataFrame(
    data=euclidean_distances(df[['skullw', 'totlngth']]),
    index=df['case'],
    columns=df['case'],
)

The resulting distance matrix is stored in a Pandas DataFrame with the indices and columns set to the 'case' column of the original df DataFrame. This allows us to easily reference the distance between any two data points in the matrix.


In [4]:
# Distances from case 3 to every case. Here 3 is a 'case' column label,
# not a positional offset.
distanceMatrix[3]

case
1       6.512296
2       4.664762
3       0.000000
4       4.545327
5      10.662551
         ...    
100    14.560220
101    14.038875
102     8.200610
103    13.857850
104     6.500769
Name: 3, Length: 104, dtype: float64

# Using KNN with scikit-learn

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Fixed, positional split: rows at positions 2..19 (18 rows) are the training
# set, and every remaining row of df becomes the test set.
training = df.iloc[slice(2, 20)]
testing = df.drop(index=training.index)

# Label to predict, and the measurement columns used as model inputs.
target_col = 'sex'
features_col = ['hdlngth', 'skullw', 'totlngth', 'taill']

In [8]:
# Nearest-neighbours classifier configured with n_neighbors=3.
classifier = KNeighborsClassifier(n_neighbors=3)
# Train on the selected feature columns against the 'sex' label.
classifier.fit(X=training[features_col], y=training[target_col])


In [9]:
# Wrap the predicted labels in a Series for a compact, truncated display.
# The Series gets a fresh 0..n-1 index, not the row labels of `testing`.
pd.Series(data=classifier.predict(X=testing[features_col]))

0     m
1     f
2     f
3     f
4     f
     ..
81    m
82    m
83    f
84    m
85    f
Length: 86, dtype: object

In [10]:
# Store the model's prediction next to each test row so it can be compared
# with the true 'sex' value.
testing['predictionUsingKnn'] = classifier.predict(X=testing[features_col])
# Display the test frame including the new column.
testing

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,predictionUsingKnn
0,1,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0,m
1,2,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0,f
20,21,1,Vic,f,3.0,95.9,58.1,96.5,39.5,77.9,52.9,14.2,30.0,40.0,f
21,22,1,Vic,m,3.0,96.3,58.5,91.0,39.5,73.5,52.1,16.2,28.0,36.0,f
22,23,1,Vic,f,4.0,92.5,56.1,89.0,36.0,72.8,53.3,15.4,28.0,35.0,f
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,100,7,other,m,1.0,89.5,56.0,81.5,36.5,66.0,46.8,14.8,23.0,27.0,m
100,101,7,other,m,1.0,88.6,54.7,82.5,39.0,64.4,48.0,14.0,25.0,33.0,m
101,102,7,other,f,6.0,92.4,55.0,89.0,38.0,63.5,45.4,13.0,25.0,30.0,f
102,103,7,other,m,4.0,91.5,55.2,82.5,36.5,62.9,45.9,15.4,25.0,29.0,m


In [32]:
# Reuse the classifier that was already created with n_neighbors=3 and fitted
# on `training` earlier in the notebook. This cell used to re-create and re-fit
# it with exactly the same arguments, which only duplicated that earlier cell.
# Cell output: the predicted label for every row of `testing`.
classifier.predict(testing[features_col])

array(['m', 'f', 'f', 'f', 'f', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'f',
       'f', 'f', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm', 'm',
       'm', 'm', 'f', 'm', 'm', 'f', 'm', 'f', 'f', 'f', 'm', 'm', 'f',
       'f', 'f', 'f', 'm', 'f', 'f', 'f', 'f', 'm', 'm', 'f', 'f', 'f',
       'f', 'm', 'm', 'm', 'f', 'f', 'm', 'm', 'm', 'm', 'f', 'm', 'm',
       'm', 'f', 'f', 'f', 'f', 'm', 'f', 'm', 'm', 'm', 'm', 'f', 'm',
       'm', 'm', 'f', 'm', 'm', 'f', 'm', 'f'], dtype=object)

In [33]:
# Keep the predicted labels for the test rows in a variable.
prediction = classifier.predict(X=testing[features_col])

The predictions are evaluated in the next section, starting with `sklearn.metrics.confusion_matrix`.

# Using metrics in scikit-learn

In [30]:
# Confusion matrix of true vs. predicted sex on the test rows.
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

# Fix the class order explicitly (sorted union of true and predicted labels)
# and reuse it for the row/column headers, so the table shows which class each
# cell refers to and which axis is the actual vs. the predicted label.
sex_labels = sorted(set(testing['sex']) | set(testing['predictionUsingKnn']))
pd.DataFrame(
    confusion_matrix(testing['sex'], testing['predictionUsingKnn'], labels=sex_labels),
    index=pd.Index(sex_labels, name='actual'),
    columns=pd.Index(sex_labels, name='predicted'),
)


Unnamed: 0,0,1
0,18,13
1,18,37


In [31]:
# Accuracy of the k-NN predictions on the test rows.
# Scored against the stored 'predictionUsingKnn' column, the same predictions
# the confusion-matrix and F1 cells evaluate, instead of the separate
# `prediction` variable, so the three metrics always describe the same
# predictions even when cells are re-run out of order.
accuracy_score(testing[target_col], testing['predictionUsingKnn'])

0.4418604651162791

In [None]:
# F1 score of the stored predictions, combined across the classes with
# average='weighted'.
f1_score(
    y_true=testing['sex'],
    y_pred=testing['predictionUsingKnn'],
    average='weighted',
)

0.5376522702104097