<img src="../squeemos_blk.png" width=200 height=200 />

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # makes the notebook fill the whole window

import numpy as np
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns; sns.set()
import csv

from PIL import Image, ImageDraw
import os
import shutil
import glob

from mpl_toolkits.basemap import Basemap

from BetterMap import CreateMapBackground

import math

import functools
import operator

import datetime

from pathlib import Path

from Utils import *

#there is currently a bug in panda's scatter_matrix that produces a warning that has no affect on what I'm doin :)
import warnings
warnings.filterwarnings("ignore")

## Clustering functions

In [2]:
# Return kmeans cluster centers of the given dataframe
def run_kmeans(df, num_clusters=5):
    if df.shape[0] > 0:
        #TODO: use dbscan for optimal cluster number
        if df.shape[0] < num_clusters:
            num_clusters = df.shape[0]
        kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=10, algorithm='full')
        kmeans.fit(df[['Lat', 'Long']])
        df['cluster_label'] = kmeans.fit_predict(df[['Lat', 'Long']])
        centers = kmeans.cluster_centers_
        return centers
    result = np.empty((2,1))
    result[:] = np.NaN
    return result

def cluster_centers(df, num_clusters=5, section=''):
    centers = []
    df_temp = df.copy()
    if(section == 'inner'):
        df_temp = df_temp[(df_temp['distance'] <= 100)]
    elif(section == 'rainband'):
        df_temp = df_temp[(df_temp['distance'] >= 200) & (df_temp['distance'] <= 400)]
    elif(section == 'outer'):
        df_temp = df_temp[(df_temp['distance'] >= 400) & (df_temp['distance'] <= 600)]
        
    centers.append(run_kmeans(df_temp, num_clusters))
    
    #flatten the list
    centers = functools.reduce(operator.iconcat, centers, [])
    #drop nan
    centers = [x for x in centers if str(x) != 'nan']
    return centers

def segment_df(df, minutes_between):
    cluster = 0
    df_time = pd.DataFrame()
    df_time['segment'] = 0
    for x in range(len(df['Month'].unique())):
        month = df['Month'].unique()[x]
        for y in range(len(df[df['Month'] == df['Month'].unique()[x]]['Day'].unique())):
            day = df[df['Month'] == df['Month'].unique()[x]]['Day'].unique()[y]
            for z in range(len(df[df['Day'] == df[df['Month'] == df['Month'].unique()[x]]['Day'].unique()[y]]['Hour'].unique())):
                hour = df[df['Day'] == df[df['Month'] == df['Month'].unique()[x]]['Day'].unique()[y]]['Hour'].unique()[z]
                for w in range(0,60,minutes_between):
                    minute = w
                    temp = df[(df['Month'] == month) & (df['Day'] == day) & (df['Hour'] == hour) & (df['Min'] >= minute) & (df['Min'] < minute + minutes_between)]
                    temp['segment'] = cluster
                    df_time = df_time.append(temp, ignore_index=True)
                    cluster+=1
    return df_time

## Data loading and verification

In [3]:
#load in the data
df = pd.read_csv('./Irma Storm centered/ATL_17_11_Irma_WWLLN_Locations.txt',header=None,names=["Year","Month","Day","Hour","Min","Sec","Lat","Long","Dist_East_West","Dist_North_South"],low_memory=False, delim_whitespace=True)
df.dropna(axis=0,how='any',subset=['Lat','Long'],inplace=True)

df_center = pd.read_csv('./Irma Storm centered/ATL_17_11_Irma_Reduced_Trackfile.txt',header=None,names=["Year","Month","Day","Hour","Lat","Long","Min_Pressure","Max_Winds","Unused"],low_memory=False,sep='\t')
df_center.dropna(axis=0,how='any',subset=['Lat','Long'],inplace=True)

In [4]:
display(df.info())
display(df_center.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226789 entries, 0 to 226788
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Year              226789 non-null  int64  
 1   Month             226789 non-null  int64  
 2   Day               226789 non-null  int64  
 3   Hour              226789 non-null  int64  
 4   Min               226789 non-null  int64  
 5   Sec               226789 non-null  float64
 6   Lat               226789 non-null  float64
 7   Long              226789 non-null  float64
 8   Dist_East_West    226789 non-null  float64
 9   Dist_North_South  226789 non-null  float64
dtypes: float64(5), int64(5)
memory usage: 19.0 MB


None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61 entries, 0 to 60
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          61 non-null     int64  
 1   Month         61 non-null     int64  
 2   Day           61 non-null     int64  
 3   Hour          61 non-null     int64  
 4   Lat           61 non-null     float64
 5   Long          61 non-null     float64
 6   Min_Pressure  61 non-null     int64  
 7   Max_Winds     61 non-null     int64  
 8   Unused        61 non-null     int64  
dtypes: float64(2), int64(7)
memory usage: 4.8 KB


None

## Proof of concept/test

In [5]:
def test(df):
    #make a dataframe that denotes clusters based on a time interval - a new cluster every x amount of minutes
    minutes_between = 30
    num_clusters=5

    df['distance'] = np.sqrt((df['Dist_East_West'] * df['Dist_East_West']) + 
                                    (df['Dist_North_South'] * df['Dist_North_South']))

    #apply date restriction
    year, month, day, hour, minute = 2017, 9, 7, 16, 30

    temp = df[(df['Year'] == year) &
                       (df['Month'] == month) &
                       (df['Day'] == day) &
                       (df['Hour'] == hour) &
                       (df['Min'] == minute)]

    inner_clusters = cluster_centers(temp, num_clusters, 'inner')
    inner_clusters = [x for x in inner_clusters if str(x[0]) != 'nan']
    rainband_clusters = cluster_centers(temp, num_clusters, 'rainband')
    rainband_clusters = [x for x in rainband_clusters if str(x[0]) != 'nan']
    outer_clusters = cluster_centers(temp, num_clusters, 'outer')
    outer_clusters = [x for x in outer_clusters if str(x[0]) != 'nan']

    inner_centers = pd.DataFrame(data=inner_clusters, columns=["Lat", "Long"])
    rainband_centers = pd.DataFrame(data=rainband_clusters, columns=["Lat", "Long"])
    outer_centers = pd.DataFrame(data=outer_clusters, columns=["Lat", "Long"])

    ln = pd.read_csv('./Irma Storm centered/ATL_17_11_Irma_WWLLN_Locations.txt',header=None,names=["Year","Month","Day","Hour","Min","Sec","Lat","Long","Dist_East_West","Dist_North_South"],low_memory=False,sep=' ')
    minLong_, minLat_, maxLong_, maxLat_ = ln['Long'].min(), ln['Lat'].min(), ln['Long'].max(), ln['Lat'].max()
    buffer = 10

    plt.figure(figsize=(20,20))
    m = Basemap(llcrnrlon=minLong_-buffer, llcrnrlat=minLat_-buffer,urcrnrlon=maxLong_+buffer,urcrnrlat=maxLat_+buffer,lon_0=0,lat_0=0)
    m.drawmapboundary(fill_color='#A6CAE0', linewidth=0)
    m.fillcontinents(color='grey', alpha=0.7, lake_color='grey')
    m.drawcoastlines(linewidth=0.1, color="white")

    # Plot the lightning data
    m.plot(temp['Long'], temp['Lat'], linestyle='none', marker="o", markersize=15, alpha=0.3, c="yellow", markeredgecolor="black", markeredgewidth=1)
    m.plot(inner_centers['Long'], inner_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="red", markeredgecolor="black", markeredgewidth=1)
    m.plot(rainband_centers['Long'], rainband_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="green", markeredgecolor="black", markeredgewidth=1)
    m.plot(outer_centers['Long'], outer_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="blue", markeredgecolor="black", markeredgewidth=1)

    plt.show()

In [6]:
#test(df)

## Clustering to create a gif

In [7]:
minutes_between = 30

ln = pd.read_csv('./Irma Storm centered/ATL_17_11_Irma_WWLLN_Locations.txt',header=None,names=["Year","Month","Day","Hour","Min","Sec","Lat","Long","Dist_East_West","Dist_North_South"],low_memory=False,sep=' ')
minLong_, minLat_, maxLong_, maxLat_ = ln['Long'].min(), ln['Lat'].min(), ln['Long'].max(), ln['Lat'].max()
buffer = 10

df_time = segment_df(df, minutes_between)
df_time['distance'] = np.sqrt((df_time['Dist_East_West'] * df_time['Dist_East_West']) + 
                                (df_time['Dist_North_South'] * df_time['Dist_North_South']))
display(df_time.info())
df_time.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226789 entries, 0 to 226788
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   segment           226789 non-null  int64  
 1   Year              226789 non-null  float64
 2   Month             226789 non-null  float64
 3   Day               226789 non-null  float64
 4   Hour              226789 non-null  float64
 5   Min               226789 non-null  float64
 6   Sec               226789 non-null  float64
 7   Lat               226789 non-null  float64
 8   Long              226789 non-null  float64
 9   Dist_East_West    226789 non-null  float64
 10  Dist_North_South  226789 non-null  float64
 11  distance          226789 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 20.8 MB


None

Unnamed: 0,segment,Year,Month,Day,Hour,Min,Sec,Lat,Long,Dist_East_West,Dist_North_South,distance
0,1,2017.0,8.0,27.0,17.0,56.0,9.7207,12.7452,-13.496,271.571,138.46,304.831067
1,1,2017.0,8.0,27.0,17.0,56.0,19.4011,12.0782,-16.2206,-23.9866,64.2929,68.621673
2,1,2017.0,8.0,27.0,17.0,56.0,36.891,9.6958,-8.5119,820.729,-200.618,844.892699
3,1,2017.0,8.0,27.0,17.0,56.0,42.7955,10.879,-10.62,587.47,-69.052,591.514311
4,1,2017.0,8.0,27.0,17.0,56.0,46.1723,9.9119,-12.6214,370.074,-176.589,410.046876


In [14]:
def DfToGif_kmeans(df, num_clusters=5, di='./data/kmeans/final_gif/'):
    year = 2017
    edges = (minLong_,maxLong_,minLat_,maxLat_)

    end = df['segment'].max()
    if math.isnan(end):
        end = 0
    for current_segment in range(1, int(end)):
        temp = df[(df['segment'] == current_segment)]
        if(temp.shape[0] > 0):
            inner_clusters = cluster_centers(temp, num_clusters, 'inner')
            inner_clusters = [x for x in inner_clusters if str(x[0]) != 'nan']
            rainband_clusters = cluster_centers(temp, num_clusters, 'rainband')
            rainband_clusters = [x for x in rainband_clusters if str(x[0]) != 'nan']
            outer_clusters = cluster_centers(temp, num_clusters, 'outer')
            outer_clusters = [x for x in outer_clusters if str(x[0]) != 'nan']

            inner_centers = pd.DataFrame(data=inner_clusters, columns=["Lat", "Long"])
            rainband_centers = pd.DataFrame(data=rainband_clusters, columns=["Lat", "Long"])
            outer_centers = pd.DataFrame(data=outer_clusters, columns=["Lat", "Long"])

            fig = plt.figure(figsize=(30,15))
            m = CreateMapBackground(edges,buffer)

            # Plot the lightning data
            m.plot(temp['Long'], temp['Lat'], linestyle='none', marker="o", markersize=15, alpha=0.3, c="yellow", markeredgecolor="black", markeredgewidth=1)
            m.plot(inner_centers['Long'], inner_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="red", markeredgecolor="black", markeredgewidth=1)
            m.plot(rainband_centers['Long'], rainband_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="green", markeredgecolor="black", markeredgewidth=1)
            m.plot(outer_centers['Long'], outer_centers['Lat'], linestyle='none', marker=".", markersize=8, alpha=0.3, c="blue", markeredgecolor="black", markeredgewidth=1)

            month, day, hour, minute = int(temp['Month'].iloc[0]), int(temp['Day'].iloc[0]), int(temp['Hour'].iloc[0]), int(temp['Min'].iloc[0])
            #Save and close the figure
            plt.savefig('{}{}_{}_{}_{}_{}'.format(di,year,month,day,hour,minute),bbox_inches='tight')
            plt.close(fig)

In [16]:
%%time

DfToGif_kmeans(df_time)

KeyboardInterrupt: 

In [17]:
MakeGif('./data/kmeans/final_gif', './data/kmeans/', 'kmeans', 100, 'png')

In [18]:
ClearDir('./data/kmeans/final_gif')

Delete files from ./data/kmeans/final_gif y/n?
y


# TODO
- Great Circle Distance option for comparing
- Turn into a class w/ good helper functions
    - KMEANS on storm data
    - Fit, Predict (for any kind of data)
- DBSCAN for optimal k value for each segment

In [None]:
#DEPRECATED
#elbow graph for visual estimation of ideal cluster count
# clusters = range(1,16)

# kmeans = [KMeans(n_clusters=i) for i in clusters]

# y = df[['Lat']]
# x = df[['Long']]

# score = [kmeans[i].fit(y).score(y) for i in range(len(kmeans))]

# plt.plot(clusters, score)

# plt.show()

# #kmeans using the set cluster count
# kmeans = KMeans(n_clusters=4, init='k-means++')
# run_kmeans = coord_df
# kmeans.fit(run_kmeans)
# run_kmeans['segment'] = kmeans.fit_predict(run_kmeans)
# inertia = kmeans.inertia_
# labels = kmeans.predict(run_kmeans[run_kmeans.columns[:2]])
# centers = kmeans.cluster_centers_

# run_kmeans.plot.scatter(x='Lat',y='Long', c=labels, s=50, cmap='spring', figsize=(20,20))
# plt.scatter(centers[:, 0], centers[:, 1], c='black', s= 200, alpha=.5)

# plt.show()

# for n_clusters in range(1,16):
#     kmeans = KMeans(n_clusters=n_clusters, init='k-means++')
#     kmeans.fit(run_kmeans)
#     inertia = kmeans.inertia_
#     print("Cluster count: ", n_clusters, "\tInertia: ", inertia)