In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Read the CSV (path is relative to the notebook's working directory) into a
# DataFrame that the later cells build on.
new_data = pd.read_csv(filepath_or_buffer="modified_dataset.csv")

In [3]:
# Keep only the four columns used in this analysis: the fatality count, the
# size category and the coordinates of each record.
df = pd.DataFrame(new_data)[
    [
        "fatalities",
        "landslide_size",
        "latitude",
        "longitude",
    ]
]

In [4]:
# One-hot encode the non-numeric columns; here that turns 'landslide_size'
# into one indicator (dummy) column per size category.
df = pd.get_dummies(data=df)

In [5]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,fatalities,latitude,longitude,landslide_size_Large,landslide_size_Medium,landslide_size_Small
0,10.0,33.7033,73.038,0,1,0
1,0.0,49.0022,-122.7576,0,0,1
2,0.0,41.549,-124.0611,0,0,1
3,13.0,30.6945,79.0544,1,0,0
4,0.0,34.2266,-118.0306,0,0,1


In [6]:
# Two-column (latitude, longitude) frame used as the input for KMeans.
practice_dataset = df[
    [
        "latitude",
        "longitude",
    ]
]
# (number of rows, number of columns)
practice_dataset.shape

(100, 2)

In [7]:
# Preview the first five coordinate pairs.
practice_dataset.head(n=5)

Unnamed: 0,latitude,longitude
0,33.7033,73.038
1,49.0022,-122.7576
2,41.549,-124.0611
3,30.6945,79.0544
4,34.2266,-118.0306


In [8]:
from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster centers and the cluster numbering can change on every run. Pin the
# seed so a fresh "Restart & Run All" reproduces the same clusters.
N_CLUSTERS = 5
KMEANS_SEED = 42

# Partition the (latitude, longitude) points into N_CLUSTERS clusters.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED).fit(practice_dataset)
kmeans.cluster_centers_  # one (latitude, longitude) row per cluster center

array([[  -6.45669121,  -59.94776733],
       [  -4.10468333,  134.67504167],
       [  17.99423435,   25.45409223],
       [  39.91839171, -112.51914052],
       [  25.06989259,   82.35702963]])

In [9]:
# Assign every row of the coordinate frame to its nearest cluster center;
# the result is an array holding one cluster number per row.
cluster_array = kmeans.predict(
    practice_dataset
)
cluster_array

array([4, 3, 3, 4, 3, 3, 4, 4, 0, 2, 1, 3, 3, 4, 1, 0, 0, 1, 4, 4, 2, 3, 4,
       0, 4, 2, 4, 4, 1, 0, 1, 3, 3, 0, 4, 4, 4, 1, 1, 3, 3, 0, 3, 0, 0, 0,
       4, 4, 3, 1, 4, 3, 4, 3, 3, 3, 1, 4, 2, 4, 4, 1, 3, 4, 0, 2, 1, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 4, 4, 4, 2, 2, 3, 3, 2, 3, 3, 2, 0, 2, 2, 4, 2,
       0, 4, 0, 3, 3, 1, 0, 3], dtype=int32)

In [10]:
# Attach each row's cluster label as the 'cluster_number' column.
# Plain column assignment appends the column at the end on the first run and
# simply overwrites it on a re-run, whereas df.insert() raises ValueError once
# the column already exists and needs a hard-coded column position.
df["cluster_number"] = cluster_array

In [11]:
# Preview the first five rows, now including 'cluster_number'.
df.head(n=5)

Unnamed: 0,fatalities,latitude,longitude,landslide_size_Large,landslide_size_Medium,landslide_size_Small,cluster_number
0,10.0,33.7033,73.038,0,1,0,4
1,0.0,49.0022,-122.7576,0,0,1,3
2,0.0,41.549,-124.0611,0,0,1,3
3,13.0,30.6945,79.0544,1,0,0,4
4,0.0,34.2266,-118.0306,0,0,1,3


In [12]:
# Group the 'fatalities' values by the cluster each row was assigned to.
groupby_cluster = df.groupby("cluster_number")["fatalities"]
groupby_cluster

<pandas.core.groupby.SeriesGroupBy object at 0x10cb14f50>

In [13]:
# Total number of fatalities within each cluster.
groupby_cluster.agg("sum")

cluster_number
0     20.0
1     10.0
2     42.0
3      1.0
4    106.0
Name: fatalities, dtype: float64

In [15]:
# Number of records per cluster. count() only counts rows whose 'fatalities'
# value is not missing.
groupby_cluster.agg("count")

cluster_number
0    15
1    12
2    13
3    33
4    27
Name: fatalities, dtype: int64