In [2]:
import matplotlib.pyplot as plt
import numpy as np
import re
import os
import pandas as pd


In [10]:
def precipitations(year,month,day):                    # Cette fonction permet simplement de représenter l'évolutiond es précipitations en Ile-de-France sur une journée.
    path = 'data/'+'{:04d}/'.format(year)
    file = 'RR_IDF300x300_{:04d}{:02d}{:02d}.npy'.format(year,month,day)
    file = path + file
    RR = np.load(file)/100.0
    RR[RR < 0]=np.nan
    plt.figure()
    plt.ion()        
    for i in range (288):
        plt.imshow(RR[i,:,:], cmap='Blues')
        #plt.show(block=True)
        #cv2.namedWindow('image', cv2.WINDOW_NORMAL)
        #cv2.imshow('image', RR[i,:,:], cmap='Blues')
        plt.pause(0.2)
        plt.clf()

def segmentation_evenements(year,month,day):        # Cette fonction permet de séparer les différents événements de précipitations.
    path = 'data/'+'{:04d}/'.format(year)
    file = 'RR_IDF300x300_{:04d}{:02d}{:02d}.npy'.format(year,month,day)
    file = path + file
    RR = np.load(file)/100.0
    RR[RR < 0]=np.nan
    RR_seg = np.zeros([288,300,300])        # RR_seg sera une carte simplifiée indiquant simplement où un événement de précpitation est en cours. C'est un tableau binaire.
    for i in range (300):
        for j in range (300):
            t = 0
            t1 = 0
            while (t<288):
                if (RR[t,i,j]>0):
                    RR_seg[t,i,j] = 1
                    t1 = 0      # Compteur du temps à partir du dernier instant de pluie. S'il atteint 6, alors l'événement de pluie est terminé.
                    t = t+1
                    t0 = t      # t0 permet de garder en mémoire le premier instant de l'acalmie.
                    while (t<288 and (RR[t,i,j]==np.nan) and t1<6):      # Deux averses au même endroit font partie du même événement pluvieux si elles adviennent à moins de 30 min d'intervalle.
                        t1 = t1+1
                        t = t+1
                    if (t<288 and t1<6):
                        for t2 in range (t0,t+1):
                            RR_seg[t2,i,j] = 1
                else :
                    t = t+1
    return RR_seg, RR

In [19]:
# each (i, j) in a matrix is either 1 if an event is in progress, or 0 if there is no event. We want to compute the length of events over the whole year.
# the result is a matrix of the same size as the input, where each (i, j) is a list of tuples (time of event, length of event). Each tuple represent an event at (i,j)
def event_length(year, month, day):
    RR_seg, RR = segmentation_evenements(year, month, day)
    length = np.zeros([300,300], dtype=object)
    for i in range(300):
        for j in range(300):
            t = 0
            events = []
            while t < 288:
                if RR_seg[t, i, j] == 1:
                    t0 = t
                    while t < 288 and RR_seg[t, i, j] == 1:
                        t += 1
                    if t - t0 - 1 > 0:
                        events.append((t0, t - t0 - 1)) # because we don't want the final 0 value in the event 
                t += 1
            length[i, j] = events
    return length, RR


In [12]:
# for a given event, we want to compute the maximum intensity, the mean intensity, its variance, and the percentage of null values
def event_stats(RR, year, month, day, i, j, start_time, duration):
    RR_event = RR[start_time:start_time+duration, i, j]
    RR_event = RR_event[~np.isnan(RR_event)]
    return (duration, np.max(RR_event), np.mean(RR_event), np.var(RR_event), 1 - np.count_nonzero(RR_event) / duration)

In [14]:
# iterate through all data/2018 files and compute the stats for each event in each file
# create a dataframe with the results
# columns : year, month, day, i, j, start_time, duration_a, max_intensity, mean_intensity, variance, percentage_null

def get_date_from_file(file):
    match = re.search(r'(\d{4})(\d{2})(\d{2})', file)
    return (int(match.group(1)), int(match.group(2)), int(match.group(3)))

# give me the list of files

files = os.listdir('data/2018')
dates = [get_date_from_file(file) for file in files] # tuples (year, month, day) for each file

In [15]:
def get_matrix(year, month, day):
    path = 'data/'+'{:04d}/'.format(year)
    file = 'RR_IDF300x300_{:04d}{:02d}{:02d}.npy'.format(year,month,day)
    file = path + file
    RR = np.load(file)/100.0
    RR[RR < 0]=np.nan
    return RR

In [59]:
max_count = 300*300
count = 0

df = pd.DataFrame(columns=['year', 'month', 'day', 'i', 'j', 'start_time_relative', 'start_time_absolute', 'duration', 'max_intensity', 'mean_intensity', 'variance', 'percentage_null'])
rows = []

for date in dates[:31]: # calcule les features de tous les événements pour le mois de janvier
    # loading bar
    year, month, day = date
    events, RR = event_length(year, month, day)
    for i in range(300):
        for j in range(300):
            print(f'{count}/{max_count}')
            for event in events[i, j]:
                start_time = event[0]
                duration = event[1]
                stats = event_stats(RR, year, month, day, i, j, start_time, duration)
                rows.append({'year': year, 'month': month, 'day': day, 'i': i, 'j': j, 'start_time_relative': start_time, 'start_time_absolute': start_time + 288 * (day - 1), 'duration': duration, 'max_intensity': stats[1], 'mean_intensity': stats[2], 'variance': stats[3], 'percentage_null': stats[4]})
            count += 1

KeyboardInterrupt: 

In [50]:
df = pd.DataFrame(rows)
df.to_csv('.data/features/features_012018.csv')

In [51]:
df

Unnamed: 0,year,month,day,i,j,start_time_relative,start_time_absolute,duration,max_intensity,mean_intensity,variance,percentage_null
0,2018,1,1,0,0,0,0,2,2.52,1.320000,1.440000,0.000000
1,2018,1,1,0,0,4,4,7,1.44,0.411429,0.225698,0.142857
2,2018,1,1,0,0,25,25,6,3.84,1.718333,1.438147,0.000000
3,2018,1,1,0,0,39,39,1,0.12,0.120000,0.000000,0.000000
4,2018,1,1,0,0,57,57,8,1.68,0.585000,0.257175,0.125000
...,...,...,...,...,...,...,...,...,...,...,...,...
830252,2018,1,1,299,299,191,191,10,0.72,0.384000,0.059904,0.100000
830253,2018,1,1,299,299,203,203,6,1.68,0.600000,0.336000,0.166667
830254,2018,1,1,299,299,212,212,3,1.32,0.760000,0.166400,0.000000
830255,2018,1,1,299,299,229,229,2,0.72,0.600000,0.014400,0.000000
