In [140]:
import numpy as np
import pandas as pd
data = pd.read_csv("resources/postcodes_sampled.csv")

In [49]:
data

Unnamed: 0,postcode,sector,easting,northing,localAuthority,altitude,soilType,riskLabel,medianPrice,latitude,longitude
0,PO7 8PR,PO7 8,469395.0,108803.0,Havant,30,Planosols,1,233500.0,50.873839,-1.013701
1,SO17 1NS,SO17 1,442771.0,114321.0,Southampton,20,Unsurveyed/Urban,1,291800.0,50.926041,-1.391427
2,TN28 8XN,TN28 8,606861.0,124689.0,Folkestone and Hythe,10,Cambisols,1,326500.0,50.983674,0.947249
3,KT3 4JW,KT3 4,521649.0,168848.0,Kingston upon Thames,20,Unsurveyed/Urban,1,875200.0,51.404896,-0.251032
4,CT2 8AA,CT2 8,614532.0,158074.0,Canterbury,10,Unsurveyed/Urban,10,303500.0,51.280645,1.076249
...,...,...,...,...,...,...,...,...,...,...,...
39995,SE22 8BE,SE22 8,533403.0,175417.0,Southwark,20,Unsurveyed/Urban,1,674300.0,51.461288,-0.079666
39996,SW10 0JB,SW10 0,526500.0,177609.0,Kensington and Chelsea,0,Unsurveyed/Urban,1,884900.0,51.482573,-0.178194
39997,HP21 9QS,HP21 9,482072.0,211761.0,Buckinghamshire,90,Cambisols,1,302000.0,51.797778,-0.809831
39998,TN15 8NY,TN15 8,560877.0,157522.0,Tonbridge and Malling,90,Luvisols,1,190000.0,51.293367,0.307404


In [42]:
def nearest_localAuthority(sample, easting, northing):
    """Return the nearest station (by 2d Euclidean distance) and the tide at that station

    Parameters
    ----------

    sample: dataframe
        sample data
    easting: list of floats
        OSGB36 Easting
    northing: list of floats
        OSGB36 Northing

    Returns
    -------

    pandas.Series
            Series of localAuthority indexed by postcodes.
    """

    data = sample
    nearest_authority = pd.DataFrame()
    #calculate 2d Euclidean distance
    for i in range(len(easting)):
        distance = []
        for j in range(len(data)):
            d = np.sqrt((easting[i]-data["easting"][j])**2+(northing[i]-data["northing"][j])**2)            
            distance.append(d)
            
        index = np.argmin(distance)
        nearest_authority = pd.concat([nearest_authority, pd.DataFrame(data.iloc[index]).T])
    
    nearest_authority.set_index(["postcode"],inplace = True)
    
    return nearest_authority[["localAuthority"]]



In [45]:
easting = [442771.0,521649.0,533403.0]
northing = [114321.0,168848.0,175417.0]

In [47]:
nearest_localAuthority(data, easting, northing)

Unnamed: 0_level_0,localAuthority
postcode,Unnamed: 1_level_1
SO17 1NS,Southampton
KT3 4JW,Kingston upon Thames
SE22 8BE,Southwark


In [81]:
X = data[["easting","northing"]]
y = data["localAuthority"]

In [82]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

array([41, 72, 30, ..., 13, 83, 14])

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7,random_state=42)

In [126]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn_model = KNeighborsClassifier(n_neighbors = 3)
grid = {"n_neighbors":[1,3,5,7,9]}

search1 = GridSearchCV(knn_model, grid, cv=5, scoring="accuracy")
search1.fit(X, y)

In [137]:
model1 = search1.best_estimator_
search1.best_score_

0.9823999999999999

In [132]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
grid = {"n_estimators":[15,17,20,25]}

search2 = GridSearchCV(rf_model, grid, cv=5, scoring="accuracy")
search2.fit(X, y)

In [133]:
model2 = search2.best_estimator_
search2.best_score_

0.9776

In [134]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
grid = {"max_depth":[1,3,5,7,9,11,13]}

search3 = GridSearchCV(tree, grid, cv=5, scoring="accuracy")
search3.fit(X_train,y_train)

In [135]:
model3 = search3.best_estimator_
search2.best_score_

0.9776

In [138]:
y_pred = model1.predict(X)

In [139]:
prediction = encoder.inverse_transform(y_pred)
prediction

array(['Havant', 'Southampton', 'Folkestone and Hythe', ...,
       'Buckinghamshire', 'Tonbridge and Malling', 'Camden'], dtype=object)