In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score

In [43]:
df = pd.read_pickle('categorised_accessible_pois.pkl')

In [44]:
df.columns

Index(['id', 'access', 'barrier', 'bicycle', 'motor_vehicle', 'opening_hours',
       'wheelchair', 'lon', 'lat', 'barrier_cat', 'wheelchair_cat',
       'access_cat'],
      dtype='object')

In [79]:
df_x = df[['barrier_cat', 'access_cat', 'bicycle', 'motor_vehicle']]

In [80]:
dummy_categories = {
    'yes': 1,
    'no': 0,
    'limited': 0,
    'designated': 1
}

In [81]:
df['wheelchair_dummy'] = df['wheelchair_cat'].map(dummy_categories)

In [82]:
y = df.wheelchair_dummy.values

In [83]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [84]:
df_x.head()

Unnamed: 0,barrier_cat,access_cat,bicycle,motor_vehicle
99878,gate,permissive,no,no
104734,,,,
106213,,,,
108042,,,,
251191,,,,


In [85]:
enc = OneHotEncoder(handle_unknown = 'ignore')
x_encoded = enc.fit_transform(df_x)

In [86]:
x_encoded

<29282x30 sparse matrix of type '<class 'numpy.float64'>'
	with 117128 stored elements in Compressed Sparse Row format>

In [87]:
x_train, x_test, y_train, y_test = train_test_split(x_encoded, y)

In [88]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(21961, 30)
(7321, 30)
(21961,)
(7321,)


In [89]:
y_train

array([1, 1, 0, ..., 1, 1, 1])

In [90]:
svm = SVC()

parameters = {'C':[0.1, 1, 10], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}
searcher = GridSearchCV(svm, parameters)
searcher.fit(x_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10],
                         'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]})

In [91]:
print("Best CV parameters", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
# Report the test accuracy using these best parameters
print("Test accuracy of best grid search hypers:", searcher.score(x_test, y_test))

Best CV parameters {'C': 0.1, 'gamma': 0.1}
Best CV accuracy 0.6863982980043181
Test accuracy of best grid search hypers: 0.684196148067204


In [115]:
Classifier=SVC(kernel='linear', C=0.1, gamma= 0.1)
Classifier.fit(x_train, y_train)
predicted=Classifier.predict(x_test)
print(accuracy_score(y_test, predicted))

0.6848791148750171


In [104]:
#add tagged data
tagged_df = pd.read_excel('UK_accessibility_Data.xlsx')

In [108]:
tagged_df.head()

Unnamed: 0,place_tags,id,latitude,longitude,tags
0,gate_barrier,99878,51.524358,-0.152985,"{'access': 'permissive', 'barrier': 'gate', 'b..."
1,railway station,104734,51.565653,-1.785876,"{'name': 'Swindon', 'network': 'National Rail'..."
2,crossing,106213,51.52566,-0.142942,"{'crossing': 'unmarked', 'wheelchair': 'yes', ..."
3,pub,108042,51.523561,-0.135513,"{'addr:housenumber': '31', 'addr:postcode': 'W..."
4,tourism_viewpoint,251191,51.115444,-0.715918,"{'name': 'Gibbet Hill', 'tourism': 'viewpoint'..."


In [107]:
tagged_df['place_tags'].unique()

array(['gate_barrier', 'railway station', 'crossing', 'pub',
       'tourism_viewpoint', 'hotel', 'taxi_rank', 'parking',
       'railway station,subway', 'attraction', 'railway_stop',
       'tram_stop', 'post_office', 'restaurant', 'museum',
       'convenience shop', nan, 'fuel', 'natural_peak',
       'crossing,raised_kerb', 'bar', 'toilets', 'library',
       'place_of_worship', 'hospital', 'atm', 'crossing,lowered_kerb',
       'community_centre', 'newsagent', 'bicycle_shop', 'police_station',
       'bus_station', 'bank', 'cinema', 'cafe',
       'public_transport_station', 'pharmacy', 'supermarket',
       'cycle_barrier', 'school', 'hostel', 'historic_memorial',
       'artwork_tourism', 'subway_entrance', 'fast_food', 'theatre',
       'elevator', 'disused_amenity_pub', 'hotel,restaurant', 'bus_stop',
       'garden_centre_shop', 'alcohol_shop', 'department_store',
       'pub,hotel', 'archaeological_site_historic', 'courthouse',
       'clothing_shop', '***NO PLACE TAG***', 

In [110]:
merged_df = pd.merge(df, tagged_df, left_on = 'id', right_on = 'id')

In [111]:
merged_df.head()

Unnamed: 0,id,access,barrier,bicycle,motor_vehicle,opening_hours,wheelchair,lon,lat,barrier_cat,wheelchair_cat,access_cat,wheelchair_dummy,place_tags,latitude,longitude,tags
0,99878,permissive,gate,no,no,dawn-dusk,yes,-0.152985,51.524358,gate,yes,permissive,1,gate_barrier,51.524358,-0.152985,"{'access': 'permissive', 'barrier': 'gate', 'b..."
1,104734,,,,,,yes,-1.785876,51.565653,,yes,,1,railway station,51.565653,-1.785876,"{'name': 'Swindon', 'network': 'National Rail'..."
2,106213,,,,,,yes,-0.142942,51.52566,,yes,,1,crossing,51.52566,-0.142942,"{'crossing': 'unmarked', 'wheelchair': 'yes', ..."
3,108042,,,,,Mo-We 16:00-23:30; Th-Fr 16:00-01:00; Sa 16:00...,limited,-0.135513,51.523561,,limited,,0,pub,51.523561,-0.135513,"{'addr:housenumber': '31', 'addr:postcode': 'W..."
4,251191,,,,,,limited,-0.715918,51.115444,,limited,,0,tourism_viewpoint,51.115444,-0.715918,"{'name': 'Gibbet Hill', 'tourism': 'viewpoint'..."


In [112]:
merged_df.columns

Index(['id', 'access', 'barrier', 'bicycle', 'motor_vehicle', 'opening_hours',
       'wheelchair', 'lon', 'lat', 'barrier_cat', 'wheelchair_cat',
       'access_cat', 'wheelchair_dummy', 'place_tags', 'latitude', 'longitude',
       'tags'],
      dtype='object')

In [126]:
tagged_subset = merged_df[['barrier_cat', 'place_tags', 'access_cat', 'bicycle', 'motor_vehicle']] 

In [128]:
tagged_y = merged_df.wheelchair_dummy.values

In [129]:
tagged_x = enc.fit_transform(tagged_subset)

In [130]:
tagged_x_train, tagged_x_test, tagged_y_train, tagged_y_test = train_test_split(tagged_x, tagged_y)

In [132]:
svm = SVC()

parameters = {'C':[0.1, 1, 10], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}
searcher = GridSearchCV(svm, parameters)
searcher.fit(tagged_x_train, tagged_y_train)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10],
                         'gamma': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]})

In [133]:
print("Best CV parameters", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
# Report the test accuracy using these best parameters
print("Test accuracy of best grid search hypers:", searcher.score(tagged_x_test, tagged_y_test))

Best CV parameters {'C': 10, 'gamma': 0.01}
Best CV accuracy 0.7144481388464924
Test accuracy of best grid search hypers: 0.7078267996175386


In [134]:
Classifier=SVC(kernel='linear', C=10, gamma= 0.01)
Classifier.fit(tagged_x_train, tagged_y_train)
tagged_predicted=Classifier.predict(tagged_x_test)
print(accuracy_score(tagged_y_test, tagged_predicted))

0.7090561398716022
