In [4]:
import json

import numpy as np
import pandas as pd
import plotly.express as px
from firebase import firebase

In [5]:
# Tutorial we followed for reading data in Firebase from Python: https://morioh.com/p/4dca3ded4cea

# Import the module under an alias inside this cell. The assignment below
# rebinds the name `firebase` from the module to the application object, so
# without the alias a second run of this cell would look up
# `FirebaseApplication` on the application object and fail.
from firebase import firebase as firebase_module

# Second argument is the authentication object; None means unauthenticated access.
firebase = firebase_module.FirebaseApplication('https://safer-walks-default-rtdb.firebaseio.com/', None)

# Fetch everything stored under the /RegisteredParents node.
parents_data = firebase.get('/RegisteredParents', '')

In [6]:
# Load the schools CSV, which contains all schools in Sweden
# (longitude, latitude, OSM id and name per school).
schools_file = 'data/schools_data.csv'
df_schools = pd.read_csv(filepath_or_buffer=schools_file)
df_schools

Unnamed: 0,school_long,school_lat,osm_id,school_name
0,11.939127,57.694679,4742340,Fjällskolan
1,11.945730,57.696740,4768432,Oscar Fredriksskolan
2,18.116401,59.382723,4875003,Sticklinge skola
3,17.914796,59.482734,6062883,Rotsunda skola
4,17.920795,59.465465,13862875,Stallets skola
...,...,...,...,...
3105,14.499669,59.316717,871653648,Skrantaskolan
3106,14.077498,57.894733,871990590,I Ur och Skur Oxdragaren
3107,14.082188,57.919182,872880398,Förskolan Äventyret
3108,13.931429,55.456706,874056551,Köpingebro skola


In [7]:
# Creating a dataframe from parents_data: one row per registered parent.
# The frame is built in a single constructor call from the list of per-parent
# records. The previous row-by-row DataFrame.append no longer exists in
# pandas 2.0 and re-copied the whole frame on every iteration.
# The resulting index is 0..n-1, as ignore_index=True produced before.
df_parents = pd.DataFrame(list(parents_data.values()))

In [8]:
# Split each "lat,long" string in userLocation once, then store the two
# pieces as float columns: user_lat (first piece) and user_long (second piece).
location_parts = [location.split(',') for location in df_parents['userLocation']]
df_parents['user_lat'] = [float(parts[0]) for parts in location_parts]
df_parents['user_long'] = [float(parts[1]) for parts in location_parts]

In [9]:
# Join every parent with the row of their school: schoolId (df_parents)
# is matched against osm_id (df_schools). Inner join, so parents whose
# schoolId has no match in df_schools are dropped.
df = df_parents.merge(
    df_schools,
    how='inner',
    left_on='schoolId',
    right_on='osm_id',
)

In [38]:
# (rows, columns) of the merged parents/schools frame.
df.shape

(109, 10)

In [11]:
# Display the merged parents/schools frame.
df


Unnamed: 0,parentName,schoolId,timeRegistration,userLocation,user_lat,user_long,school_long,school_lat,osm_id,school_name
0,Johan Eriksson,299737854,"24, Dec, 2020, 12:05","59.2764315,17.8900085",59.276432,17.890009,17.899835,59.273127,299737854,Västerholms friskola
1,Sofia Andersson,299737854,"24, Dec, 2020, 14:05","59.2770355,17.8899913",59.277035,17.889991,17.899835,59.273127,299737854,Västerholms friskola
2,Manpreet Singh,299737854,"24, Dec, 2020, 15:28","59.2793401,17.8918366",59.279340,17.891837,17.899835,59.273127,299737854,Västerholms friskola
3,Petra Johansson,299737854,"24, Dec, 2020, 15:36","59.2761426,17.8854605",59.276143,17.885461,17.899835,59.273127,299737854,Västerholms friskola
4,Joe Tribiani,299737854,"01, Jan, 2021, 15:24","59.2803346,17.8913531",59.280335,17.891353,17.899835,59.273127,299737854,Västerholms friskola
...,...,...,...,...,...,...,...,...,...,...
104,Julia Roberts,208341119,"04, Jan, 2021, 10:06","59.854693,17.6817418",59.854693,17.681742,17.691929,59.859437,208341119,Livets Ords kristna skola
105,Rina Dey,209257132,"04, Jan, 2021, 10:42","59.8563024,17.6502813",59.856302,17.650281,17.637176,59.862557,209257132,Raoul Wallenbergskolan
106,Erika Skårgard,209257132,"04, Jan, 2021, 10:42","59.8563024,17.6502813",59.856302,17.650281,17.637176,59.862557,209257132,Raoul Wallenbergskolan
107,Tom Cruise,209257132,"04, Jan, 2021, 10:42","59.8563024,17.6502813",59.856302,17.650281,17.637176,59.862557,209257132,Raoul Wallenbergskolan


In [12]:
# Per-school count of non-null values in every column; where no values are
# missing this equals the number of registered parents for that school.
df.groupby("school_name").count()

Unnamed: 0_level_0,parentName,schoolId,timeRegistration,userLocation,user_lat,user_long,school_long,school_lat,osm_id
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Almtunaskolans matsal,3,3,3,3,3,3,3,3,3
Edsbergsskolan,11,11,11,11,11,11,11,11,11
Gärdeskolan,10,10,10,10,10,10,10,10,10
Hagalundsskolan,2,2,2,2,2,2,2,2,2
Kärrdalsskolan,5,5,5,5,5,5,5,5,5
Livets Ords kristna skola,5,5,5,5,5,5,5,5,5
Nordiska musikgymnasiet,2,2,2,2,2,2,2,2,2
Pysslingen,1,1,1,1,1,1,1,1,1
Pärlan,3,3,3,3,3,3,3,3,3
Raoul Wallenbergskolan,4,4,4,4,4,4,4,4,4


In [13]:
# Keep only the parents registered at the school whose id is 70223549.
school_mask = df["schoolId"].eq(70223549)
dff = df[school_mask]


In [14]:
# Feature columns used for clustering: each parent's longitude and latitude.
numerical = ["user_long", "user_lat"]

# Feature matrix with exactly the columns listed above, in that order.
X = dff.reindex(columns=numerical)

# Last column of the filtered frame (school_name in the merged frame).
y = dff.iloc[:, -1]

In [15]:
from sklearn.cluster import KMeans
from sklearn import metrics

In [16]:

# K-Means with 3 clusters. The seed is fixed because centroid initialisation
# is random; without it the cluster numbering (and possibly the clustering)
# can change between kernel restarts.
kmeans = KMeans(n_clusters=3, random_state=42)

# Fit the model and get the cluster index of each sample in one step.
# The fitted estimator stays available as `kmeans` (e.g. kmeans.labels_).
y_kmeans = kmeans.fit_predict(X)

In [17]:
# Cluster index assigned to each sample by K-Means.
y_kmeans

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0])

In [18]:
def evaluation_metrics(X, labels_pred, metric_name):
    """Score a clustering with the silhouette coefficient.

    Parameters
    ----------
    X : feature data passed to ``metrics.silhouette_score``.
    labels_pred : predicted cluster label for each sample in ``X``.
    metric_name : distance metric name, forwarded as ``metric=``.

    Returns
    -------
    pandas.DataFrame
        A single row with columns ``Metric`` (the string "Silh_S") and
        ``Value`` (the silhouette score).
    """
    silhouette = metrics.silhouette_score(X, labels_pred, metric=metric_name)

    # One (name, value) record per metric; only the silhouette score for now.
    rows = [("Silh_S", silhouette)]
    return pd.DataFrame(rows, columns=['Metric', 'Value'])


In [19]:
# Silhouette score (Euclidean distance) of the K-Means labels.
evaluation_metrics(X, kmeans.labels_, "euclidean" )

Unnamed: 0,Metric,Value
0,Silh_S,0.708996


In [20]:
import hdbscan

In [21]:
# The haversine metric expects (latitude, longitude) pairs in radians, but X
# holds (user_long, user_lat) in degrees. Reorder the columns and convert
# before fitting; otherwise the distances are not great-circle distances.
X_latlon_rad = np.radians(X[["user_lat", "user_long"]])

clusterer = hdbscan.HDBSCAN(min_cluster_size=3, metric='haversine')
clusterer.fit(X_latlon_rad)

HDBSCAN(metric='haversine', min_cluster_size=3)

In [22]:
# Cluster label per sample from the fitted HDBSCAN model.
hdbscan_result = clusterer.labels_


In [23]:
# Display the HDBSCAN labels.
hdbscan_result

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0], dtype=int32)

In [24]:
# Silhouette score (Euclidean distance) of the HDBSCAN labels.
evaluation_metrics(X, clusterer.labels_, "euclidean" )

Unnamed: 0,Metric,Value
0,Silh_S,0.708996


In [25]:
from sklearn.cluster import AgglomerativeClustering

In [26]:
# Ward-linkage agglomerative clustering into 3 groups. The distance is left at
# its default (Euclidean, the only one Ward linkage supports); the old
# `affinity` keyword is deprecated since scikit-learn 1.2 and removed in 1.4.
model = AgglomerativeClustering(n_clusters=3, linkage='ward')


In [27]:
# Fit the agglomerative model on the coordinate features.
model.fit(X)


AgglomerativeClustering(n_clusters=3)

In [28]:
# Cluster label per sample from the fitted agglomerative model.
labels = model.labels_

In [29]:
# Display the agglomerative clustering labels.
labels


array([2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1], dtype=int32)

In [30]:
# Silhouette score (Euclidean distance) of the agglomerative clustering labels.
evaluation_metrics(X, model.labels_, "euclidean" )

Unnamed: 0,Metric,Value
0,Silh_S,0.708996


In [31]:
from sklearn.cluster import SpectralClustering

In [32]:
# Spectral clustering on a nearest-neighbours affinity graph. The seed is
# fixed because the label-assignment step is randomly initialised (n_init
# restarts), so results are otherwise not reproducible between runs.
# NOTE(review): this uses 2 clusters while the K-Means and agglomerative
# models use 3 — confirm that is intended before comparing silhouette scores.
sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', n_init=100, random_state=42)


In [33]:
# Fit the spectral clustering model on the coordinate features.
sc.fit(X)

SpectralClustering(affinity='nearest_neighbors', n_clusters=2, n_init=100)

In [34]:
 # Cluster label assigned to each sample by spectral clustering.
 sc.labels_


array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1])

In [35]:
# Silhouette score (Euclidean distance) of the spectral clustering labels.
evaluation_metrics(X, sc.labels_, "euclidean" )

Unnamed: 0,Metric,Value
0,Silh_S,0.597806
