In [16]:
# import necessary modules
import math
import os
from collections import Counter
from datetime import datetime, timedelta

import matplotlib
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import netCDF4 as nc
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm as cm
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
from mpl_toolkits.axes_grid1 import make_axes_locatable
from mpl_toolkits.basemap import Basemap
from scipy.spatial import distance_matrix
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

In [17]:
# Build the path of the forecast file to load. The dataset folder is looked up
# in the parent of the current working directory (i.e. it is a sibling of the
# directory the notebook is run from).
foldername = "ECWMF Datasets"
filename = "20121015_00_ecmwf_ensemble_forecast.PRESSURE_LEVELS.EUR_LL10.120.pl.nc"
filepath = os.path.join(
    os.path.split(os.getcwd())[0],
    foldername,
    filename,
)

# Open the forecast file and extract the field that is analysed below:
# the geopotential on one pressure level, converted to geopotential height.
Pressure_Levels_data = nc.Dataset(filepath, "r")
# Standard gravity in m/s^2 (9.80665); dividing the geopotential by it gives
# the geopotential height in metres.
g = 9.80665
# Sizes of the five dimensions, taken from the variable's metadata so that the
# full array is not read from disk just to learn its shape.
nd_1, nd_2, nd_3, nd_4, nd_5 = Pressure_Levels_data.variables['Geopotential_isobaric'].shape
# First entry of axis 0, every entry of axis 1, entry 7 of axis 2, full grid.
# NOTE(review): entry 7 of axis 2 is taken to be the 500 hPa level and axis 1
# the ensemble members, as the original comments state - confirm against the
# file's coordinate variables.
Geopotential_Isobaric_500 = Pressure_Levels_data.variables['Geopotential_isobaric'][0, :, 7, :, :] / g
# Flatten each 2-D field into one row: shape (nd_2, nd_4 * nd_5), which the
# original comment gives as (51, 41 * 101).
Geopotential_Isobaric_500_reshaped = np.reshape(Geopotential_Isobaric_500, (nd_2, nd_4 * nd_5))
# Longitude / latitude coordinate grids used for contour plotting.
longitude = Pressure_Levels_data['lon'][:]
latitude = Pressure_Levels_data['lat'][:]
(lon, lat) = np.meshgrid(longitude, latitude)

# Use PCA to reduce the dimensionality: add components one at a time until
# they explain at least 80% of the variance across the rows of the data.
# A fixed random_state keeps the result reproducible across kernel restarts,
# because PCA's default solver policy may select the randomized SVD for a
# matrix with this many columns.
exp_var = 0
n_pc = 0
while exp_var < 0.8:
    n_pc = n_pc + 1
    pca = PCA(n_components=n_pc, random_state=0)
    pca.fit(Geopotential_Isobaric_500_reshaped)
    exp_var = pca.explained_variance_ratio_.sum()

# Project the raw data into the dimension-reduced space
# (one row per input row, n_pc columns).
pca_transformed_data = pca.transform(Geopotential_Isobaric_500_reshaped)

# Get the time variable of the file.
times = Pressure_Levels_data.variables["time"]
# Convert the numeric time axis into date/time objects.
arrDateEnd = nc.num2date(times[:], units=times.units)
# Format the first time stamp directly instead of converting it to a plain
# date first: a plain date has no time of day, so "%H:%M" would always give
# "00:00", and the conversion fails when num2date returns cftime objects
# (which it may do, depending on the installed netCDF4/cftime version).
dateEndDate = arrDateEnd[0].strftime("%d %b %Y")
dateEndMin = arrDateEnd[0].strftime("%H:%M")
# Start of the forecast: 120 hours before the time stamp above.
# NOTE(review): 120 presumably mirrors the ".120." lead time in the file name - confirm.
dateStart = arrDateEnd[0] - timedelta(hours=120)
dateStartDate = dateStart.strftime("%d %b %Y")
dateStartMin = dateStart.strftime("%H:%M")


In [18]:
def bandwidth_selection(dataset,bandwidth):
    """Cluster ``dataset`` with MeanShift and report the number of significant modes.

    Parameters
    ----------
    dataset : array-like, one row per sample
        Points to cluster (in this notebook the PCA-transformed ensemble).
    bandwidth : float
        Bandwidth handed to ``MeanShift``.

    Returns
    -------
    None
        The result is printed: either the number of significant modes or a
        notice that there are too many small clusters, followed by the
        per-sample cluster labels.
    """
    # A cluster counts as a significant mode when it holds at least 30% of
    # the samples; the threshold follows the size of the data passed in
    # (15 for the 51-member ensemble) instead of a hard-coded 51.
    significanceLower = int(len(dataset) * 0.3)
    # More than this many small ("outlier") clusters is reported instead of
    # the mode count.
    outlierUpper = 2

    meanshift = MeanShift(bandwidth=bandwidth)
    meanshift.fit(dataset)
    labels = meanshift.labels_

    # cluster label -> number of samples assigned to it
    cluster_sizes = Counter(labels)
    sigModeNum = sum(1 for size in cluster_sizes.values() if size >= significanceLower)
    outlierModeNum = len(cluster_sizes) - sigModeNum

    if outlierModeNum > outlierUpper:
        print("Number of outlier exceeds 2")
    else:
        print("Number of sig mode is {0}".format(sigModeNum))
    print(labels)

In [19]:
def gauKern(point,h,isocontourSet):
    """Average a Gaussian-shaped kernel centred on ``point`` over ``isocontourSet``.

    For every sample ``s`` in ``isocontourSet`` the term

        exp(-||point - s||^2 / 2 * h^2) / (sqrt(2*pi) * h^p)

    is evaluated, where ``p = point.shape[0]``; the mean of these terms is
    returned.

    Parameters
    ----------
    point : numpy array of shape (p,)
        Location at which the kernel sum is evaluated.
    h : number
        Scale parameter of the kernel.
    isocontourSet : sequence of points, indexable by 0..len-1
        Samples the kernel is averaged over.

    NOTE(review): the exponent multiplies by h**2, whereas the textbook
    Gaussian kernel divides by 2*h**2, and the normaliser uses sqrt(2*pi)
    rather than (2*pi)**(p/2). With this form a smaller ``h`` makes the kernel
    wider. Get_Cluster_Label calls this function with h=0.001, which relies on
    that behaviour, so the numerics are deliberately left as they are -
    confirm the intended parameterisation before changing the formula.
    """
    n_dims = point.shape[0]
    n_samples = len(isocontourSet)
    # The normalising factor is the same for every sample, so compute it once.
    norm = np.sqrt(2 * math.pi) * (h ** n_dims)

    total = 0
    for idx in range(n_samples):
        sq_dist = np.sum((point - isocontourSet[idx]) ** 2)
        total = total + np.exp(-sq_dist / 2 * (h ** 2)) / norm

    return total / n_samples

In [None]:
def get_limit(number):
    """Move ``number`` away from zero by one unit of its leading decimal place.

    The step is ``10 ** k``, where ``k`` is the number of times the magnitude
    can be divided by ten before its floor-division by ten becomes zero
    (so the step is 1 for magnitudes below 10, 10 below 100, and so on).

    Examples: 345 -> 445, -345 -> -445, 9 -> 10, 0 -> 1, -7.5 -> -8.5.
    """
    def _leading_unit(magnitude):
        # Determine the power of ten of the leading digit of ``magnitude``.
        exponent = 0
        while magnitude // 10 != 0:
            magnitude = magnitude / 10
            exponent += 1
        return 10 ** exponent

    if number < 0:
        # Negative values are pushed further below zero.
        return -1 * (-1 * number + _leading_unit(np.abs(number)))
    return number + _leading_unit(number)

In [23]:
def pointSet(isocontourSet,interval):
    """Build a regular grid of evaluation points covering ``isocontourSet``.

    Parameters
    ----------
    isocontourSet : numpy array of shape (n_points, dim)
        Data whose value range defines the extent of the grid.
    interval : int
        Number of grid values along each dimension.

    Returns
    -------
    numpy array of shape (interval ** dim, dim)
        One grid point per row. The number of rows grows as
        ``interval ** dim``, so keep both small.

    The same value range is used on every axis: the global minimum and maximum
    of the data, each passed through ``get_limit``.

    NOTE(review): ``get_limit`` moves a value away from zero, so a
    non-negative minimum would be shifted upwards, i.e. into the data range -
    confirm the data always has a negative minimum.
    """
    maxlim = get_limit(isocontourSet.max())
    minlim = get_limit(isocontourSet.min())

    dim = isocontourSet.shape[1]
    axis_values = np.linspace(minlim, maxlim, interval)

    # meshgrid yields one coordinate array per dimension; flatten each of them
    # and stack them as columns so that every row is one grid point.
    mesh = np.meshgrid(*([axis_values] * dim))
    flattened = [coords.reshape(interval ** dim) for coords in mesh]

    # The original returned the undefined name `resultlt`, which raised
    # NameError on every call.
    result = np.vstack(flattened).T
    return result

In [13]:
# Choose the kernel-density bandwidth by 20-fold cross-validation.
# Candidates are 1000 evenly spaced values spanning the range of the first
# principal component. A kernel bandwidth must be strictly positive, so the
# non-positive candidates (the range starts at the minimum of the first
# component) are dropped up front instead of being left to fail inside the
# search; the remaining grid points are unchanged.
bandwidth_candidates = np.linspace((pca_transformed_data[:,0]).min(), (pca_transformed_data[:,0]).max(), 1000)
bandwidth_candidates = bandwidth_candidates[bandwidth_candidates > 0]
grid = GridSearchCV(KernelDensity(), {'bandwidth': bandwidth_candidates}, cv=20) # 20-fold cross-validation
grid.fit(pca_transformed_data)
print(grid.best_params_)

{'bandwidth': 67.80435593063766}




In [None]:
def Get_Cluster_Label(data,bandwidth):
    density=[gauKern(data[i],0.001,data) for i in range(len(data))]
    for index,value in enumerate(density):