In [1]:
# import necessary modules
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import netCDF4 as nc
from mpl_toolkits.basemap import Basemap
from sklearn.decomposition import PCA
import matplotlib.gridspec as gridspec
from matplotlib.backends.backend_pdf import PdfPages
from mpl_toolkits.axes_grid1 import make_axes_locatable
from datetime import datetime, timedelta
import pandas as pd
from scipy.spatial import distance_matrix
from matplotlib import cm as cm
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from scipy.spatial.distance import cdist
import math
import debacl as dbc

In [2]:
#get the file path for loading, data file is under the same dir with the notebook
filename="20121015_00_ecmwf_ensemble_forecast.PRESSURE_LEVELS.EUR_LL10.120.pl.nc"
foldername="ECWMF Datasets"
filepath=os.path.join(os.path.dirname(os.getcwd()),foldername,filename)

# read the raw data and extract the needed data
# exrtact the value of Geopotential under the pressure of 500 hPA in the certain
Pressure_Levels_data = nc.Dataset(filepath,"r")
g = 9.80655
# get all the dimension value
nd_1,nd_2,nd_3,nd_4,nd_5 = Pressure_Levels_data.variables['Geopotential_isobaric'][:].shape
# get the necessary raw data
Geopotential_Isobaric_500 = Pressure_Levels_data.variables['Geopotential_isobaric'][0,:,7,:,:]/g
# reshape the dataset into form of (51,41*101)
Geopotential_Isobaric_500_reshaped = np.reshape(Geopotential_Isobaric_500,(nd_2, nd_4 * nd_5))
# prepare the longitude and latitude value for contour
longitude = Pressure_Levels_data['lon'][:]
latitude = Pressure_Levels_data['lat'][:]
(lon, lat) = np.meshgrid(longitude, latitude)

# use PCA to reduce dimensions under the condition of reaching 80% of all the member infomation
exp_var = 0
n_pc = 0
while exp_var < 0.8:
    n_pc = n_pc + 1
    pca = PCA(n_components = n_pc)
    pca.fit(Geopotential_Isobaric_500_reshaped)
    exp_var = sum(pca.explained_variance_ratio_)

# get the transformed raw data in the dimension-reduced space    
pca_transformed_data = pca.transform(Geopotential_Isobaric_500_reshaped)

#get the time variable
times=Pressure_Levels_data.variables["time"]
#get the time number
arrDateEnd=nc.num2date(times[:],units=times.units)
#get the time in date format
dateEndDate = datetime.date(arrDateEnd[0]).strftime("%d %b %Y")
dateEndMin = datetime.date(arrDateEnd[0]).strftime("%H:%M")
dateStart=datetime.date(arrDateEnd[0])-timedelta(hours=120)
dateStartDate=dateStart.strftime("%d %b %Y")
dateStartMin=dateStart.strftime("%H:%M")


In [3]:
def gauKern(point,h,isocontourSet):
    PI=math.pi
    p=point.shape[0]

    n=len(isocontourSet)
    sum=0
    
    for i in range(n):
        diff=point-isocontourSet[i]
        temp =np.exp(-np.sum(diff**2)/2*(h**2))/(np.sqrt(2*PI)*(h**p))
        sum=sum+temp
        
    return sum/n 

In [4]:
def get_limit(number):
    if number<0:
        temp=np.abs(number)
        c=1  
        while (temp//10)!=0:
            temp=temp/10
            c+=1
        number=-1*(-1*number+10**(c-1))
    else:
        temp=number
        c=1  
        while (temp//10)!=0:
            temp=temp/10
            c+=1
        number=number+10**(c-1)
        
    return number

In [5]:
def pointSet(isocontourSet,interval):
    maxnum=isocontourSet.max()
    minnum=isocontourSet.min()
    maxlim=get_limit(maxnum)
    minlim=get_limit(minnum)
    
    dim=isocontourSet.shape[1]
    oneDim=np.linspace(minlim,maxlim,interval)
    
    multiDim=[oneDim for i in range(dim)]
    grid=np.meshgrid(*multiDim)
    
    reshape=[np.reshape(grid[j],(interval**dim)) for j in range(dim)]
    
    result= np.vstack(reshape).T
    return resultlt

In [None]:
def Get_Cluster_Label(data,bandwidth):
    density=[gauKern(data[i],0.001,data) for i in range(len(data))]
    for index,value in enumerate(density):

In [25]:
gauKern(pca_transformed_data[0],600,pca_transformed_data)

7.762089325705055e-28

In [72]:
def Get_Area_Data_General(rawdataToCalc,lon_rawdataToCalc,lat_rawdataToCalc,bottomLeft,topRight):
    #def Get_Subgrid(lon_rawdataToCalc,lat_rawdataToCalc,bottomLeft,topRight):
        lonStart=(np.abs(lon_rawdataToCalc - bottomLeft[0])).argmin()
        lonEnd=(np.abs(lon_rawdataToCalc - topRight[0])).argmin()
        latStart=(np.abs(lat_rawdataToCalc - bottomLeft[1])).argmin()
        latEnd=(np.abs(lat_rawdataToCalc - topRight[1])).argmin()

        areaData=rawdataToCalc[:,latEnd:latStart+1,lonStart:lonEnd+1]
        return areaData

def Reshape_New_Data(areaData):
    dim1,dim2,dim3=areaData.shape
    reshapedData=np.reshape(areaData,(dim1,dim2*dim3))
    return reshapedData

def PCA_Run(reshapedData):
    exp_var = 0
    n_pc = 0
    while exp_var < 0.8:
        n_pc = n_pc + 1
        pca = PCA(n_components = n_pc)
        pca.fit(reshapedData)
        exp_var = sum(pca.explained_variance_ratio_)
        
    return pca

from scipy.stats import scoreatpercentile as sap
from statsmodels.sandbox.nonparametric import kernels
from collections import Counter

def _select_sigma(X):
#    normalize = norm.ppf(.75) - norm.ppf(.25)
    normalize = 1.349
#    IQR = np.subtract.reduce(percentile(X, [75,25],
#                             axis=axis), axis=axis)/normalize
    IQR = (sap(X, 75) - sap(X, 25))/normalize
    return np.minimum(np.std(X, axis=0, ddof=1), IQR)

def bw_scott(x, kernel=kernels.Gaussian):
    A = _select_sigma(x)
    n = len(x)
    return 1.059 * A * n ** (-0.2)

def meanshift(sig,dataset, bandwidth):
    meanshift = MeanShift(bandwidth=bandwidth)  
    meanshift.fit(dataset)
    labels = meanshift.labels_
    dicLabels=Counter(labels)
    innerSigModeNum=0
    innerOutlierModeNum=0

    for i in dicLabels:
        if dicLabels[i] >= sig:
            innerSigModeNum = innerSigModeNum + 1
        else:
            innerOutlierModeNum = innerOutlierModeNum + 1
            
    return innerSigModeNum,innerOutlierModeNum,dicLabels,labels

def bandwidth_selection(dataset,bandwidth):    
    loop=True
    significanceLower = int(len(dataset)*0.3)
    outlierUpper = 2

    sigModeNum=0
    outlierModeNum=0
    n=1

    innerSigModeNum,innerOutlierModeNum,dicLabels,labels=meanshift(significanceLower,dataset,bandwidth)
    if innerOutlierModeNum>outlierUpper:
        while loop==True:
            print("bandwidth: {0}, labels: {1}, exicution times: {2}".format(bandwidth,dicLabels,n))
            n=n+1
            bandwidth=bandwidth+1
            innerSigModeNum,innerOutlierModeNum,dicLabels,labels=meanshift(significanceLower,dataset,bandwidth)
            if innerOutlierModeNum<outlierUpper:
                loop == False
                break
    else:
        while loop==True:
            print("bandwidth: {0}, labels: {1}, exicution times: {2}".format(bandwidth,dicLabels,n))
            n=n+1
            bandwidth=bandwidth-1
            innerSigModeNum,innerOutlierModeNum,dicLabels,labels=meanshift(significanceLower,dataset,bandwidth)
            if innerOutlierModeNum>=outlierUpper:
                print("bandwidth: {0}, labels: {1}, exicution times: {2}".format(bandwidth,dicLabels,n))
                n=n+1
                loop == False
                break

In [7]:
import os
import netCDF4 as nc
import numpy as np
#get the file path for loading, data file is under the same dir with the notebook
filename="20121017_12_ecmwf_forecast.PRESSURE_LEVELS.EUR_LL015.036.pl.nc"
foldername="ECWMF Datasets"
filepath=os.path.join(os.path.dirname(os.getcwd()),foldername,filename)
syn = nc.Dataset(filepath,"r")
#syn.variables
longitude1 = syn['lon'][:]
latitude1 = syn['lat'][:]
(lon1, lat1) = np.meshgrid(longitude1, latitude1)
syndata =syn.variables['Geopotential_isobaric'][:]/9.8
syndataiso1 = syndata[0,:,0,:,:]
syndataiso2 = syndata[0,:,1,:,:]
syndataiso3 = syndata[0,:,2,:,:]
secData1  = Get_Area_Data_General(syndataiso1,longitude1,latitude1,[0,40],[20,60])
reshapeData1=Reshape_New_Data(secData1)
PCA_1=PCA_Run(reshapeData1)
transformedData1=PCA_1.transform(reshapeData1)

In [32]:
from sklearn.cluster import estimate_bandwidth as bwe
from collections import Counter
meanshift = MeanShift(bandwidth=114)
meanshift.fit(transformedData1)
print(meanshift.labels_)
print(Counter(meanshift.labels_))

[2 2 0 1 1 0 1 2 2 0 2 1 0 1 2 1 2 0 0 2 0 1 0 0 0 1 0 2 0 0 1 0 0 0 0 0 2
 0 1 1 0 0 2 0 0 0 2 1 0 0 0]
Counter({0: 27, 2: 12, 1: 12})


In [73]:
bandwidth_selection(transformedData1,120)

bandwidth: 114, labels: Counter({0: 27, 2: 12, 1: 12}), exicution times: 1
