In [1]:
import pandas as pd
import inspect
import os
import numpy as np
# Base directory for the relative data paths below: the directory part of the
# file name that `inspect` reports for the current (innermost) stack frame.
# NOTE(review): inside a notebook kernel that file name is not a real script
# path -- confirm it resolves to the folder that actually contains `data/`.
_current_frame_file = inspect.stack()[0][1]
dirname = os.path.dirname(os.path.abspath(_current_frame_file))

# CSV locations, relative to `dirname`.
TRAIN_DATA_PATH = "data/train.csv"
TEST_DATA_PATH = "data/test.csv"
is_test = True  # not read anywhere in the visible cells


def load_rain_data(data_path=TRAIN_DATA_PATH):
    """Read one CSV file into a DataFrame.

    Parameters
    ----------
    data_path : str
        Path of the CSV file. It is joined onto `dirname` with
        `os.path.join`, so an absolute path is used unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed content of the file, as returned by `pd.read_csv`.
    """
    csv_location = os.path.join(dirname, data_path)
    rain_frame = pd.read_csv(csv_location)
    return rain_frame

### Extract training and test sets from the files

In [2]:
# Training file: load it, then count how many distinct Id groups it holds.
print("Extracting train data")
rain_train = load_rain_data(TRAIN_DATA_PATH)
n_total_samples_train = rain_train.groupby('Id').ngroups
print(n_total_samples_train, " samples")

# Test file: same two steps.
print("Extracting test data")
rain_test = load_rain_data(TEST_DATA_PATH)
n_total_samples_test = rain_test.groupby('Id').ngroups
print(n_total_samples_test, " samples")

Extracting train data
1180945  samples
Extracting test data
717625  samples


### Cleaning the data

#### Calculate the median of the Expected value over the training set

In [3]:
# Median of the `Expected` column taken over every row of the training frame.
median_expected = np.median(rain_train['Expected'].values)
print("Median value training set:", median_expected)

Median value training set: 1.0160005


#### Fill the partially missing physical features within each series, and add an indicator column for each physical measure to record that the value was originally missing

In [4]:
# Name prefixes of the physical measurements whose missing values are flagged.
# `str.startswith` accepts a tuple, so 'Ref' also matches the 'RefComposite*'
# columns and 'Rho' matches the 'RhoHV*' columns.
column_start = ('Ref', 'Kdp', 'Zdr', 'Rho')

# Only the columns present in the test frame are processed. `rain_test.columns`
# is evaluated once, so the *_NA columns added inside the loop are not visited.
for column in rain_test.columns:

    # Create new columns to keep the information that values were missing
    # (must happen before the fill below overwrites the gaps).
    if column.startswith(column_start):
        rain_train[column + "_NA"] = rain_train[column].isnull()
        rain_test[column + "_NA"] = rain_test[column].isnull()

    # Fill the missing values of a series with the mean of the other rows that
    # share the same Id. The result is assigned back to the column: calling
    # fillna(inplace=True) on `frame[column]` acts on an intermediate object
    # and is not guaranteed to update the frame itself.
    print("Column ",column, " Removing the partial N/A")
    rain_train[column] = rain_train[column].fillna(rain_train.groupby(['Id'])[column].transform("mean"))
    rain_test[column] = rain_test[column].fillna(rain_test.groupby(['Id'])[column].transform("mean"))

Column  Id  Removing the partial N/A
Column  minutes_past  Removing the partial N/A
Column  radardist_km  Removing the partial N/A
Column  Ref  Removing the partial N/A
Column  Ref_5x5_10th  Removing the partial N/A
Column  Ref_5x5_50th  Removing the partial N/A
Column  Ref_5x5_90th  Removing the partial N/A
Column  RefComposite  Removing the partial N/A
Column  RefComposite_5x5_10th  Removing the partial N/A
Column  RefComposite_5x5_50th  Removing the partial N/A
Column  RefComposite_5x5_90th  Removing the partial N/A
Column  RhoHV  Removing the partial N/A
Column  RhoHV_5x5_10th  Removing the partial N/A
Column  RhoHV_5x5_50th  Removing the partial N/A
Column  RhoHV_5x5_90th  Removing the partial N/A
Column  Zdr  Removing the partial N/A
Column  Zdr_5x5_10th  Removing the partial N/A
Column  Zdr_5x5_50th  Removing the partial N/A
Column  Zdr_5x5_90th  Removing the partial N/A
Column  Kdp  Removing the partial N/A
Column  Kdp_5x5_10th  Removing the partial N/A
Column  Kdp_5x5_50th  Re

#### Identify the rows with no values at all, or with an expected value larger than a reasonable limit (730 mm per hour)

In [5]:
# Number of null cells that marks a row as having no values at all, and the
# largest Expected value still accepted (the section heading gives 730 mm per
# hour as the limit).
EMPTY_ROW_NULL_COUNT = 20
MAX_REASONABLE_EXPECTED = 730

# Boolean row masks: True for the rows that must be discarded.
wrong_values_train = (rain_train.isnull().sum(axis=1) == EMPTY_ROW_NULL_COUNT) | (rain_train['Expected'] > MAX_REASONABLE_EXPECTED)
wrong_values_test = rain_test.isnull().sum(axis=1) == EMPTY_ROW_NULL_COUNT

# Saving the sample indices (Ids) that are considered useless.
# `.values` replaces `as_matrix()`, which no longer exists in pandas >= 1.0.
wrong_indices_train = rain_train[wrong_values_train].groupby('Id')['Id'].first().values
print("Number of samples removed from training set:", len(wrong_indices_train))
wrong_indices_test = rain_test[wrong_values_test].groupby('Id')['Id'].first().values
print("Number of samples removed from test set:", len(wrong_indices_test))

# Remove the wrong values rows from training and testing set
rain_train.drop(rain_train.index[wrong_values_train], inplace = True)
rain_test.drop(rain_test.index[wrong_values_test], inplace = True)

# Replace the missing values in the remaining samples with the column means
print("Removing the N/A")
rain_train.fillna(rain_train.mean(), inplace=True)
rain_test.fillna(rain_test.mean(), inplace=True)

# Check that every original sample ended up either among the kept Ids or among
# the removed ones; a mismatch is reported instead of passing silently.
if (n_total_samples_train == len(rain_train.groupby('Id')) + len(wrong_indices_train)):
    print("Training set: correct operation")
else:
    print("Training set: sample count mismatch after cleaning")

if (n_total_samples_test == len(rain_test.groupby('Id')) + len(wrong_indices_test)):
    print("Test set: correct operation")
else:
    print("Test set: sample count mismatch after cleaning")

Number of samples removed from training set: 363237
Number of samples removed from test set: 232148
Removing the N/A
Training set: correct operation
Test set: correct operation


In [12]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# columns of rain_train.
rain_train.describe()

Unnamed: 0,Id,minutes_past,radardist_km,Ref,Ref_5x5_10th,Ref_5x5_50th,Ref_5x5_90th,RefComposite,RefComposite_5x5_10th,RefComposite_5x5_50th,...,RhoHV_5x5_90th,Zdr,Zdr_5x5_10th,Zdr_5x5_50th,Zdr_5x5_90th,Kdp,Kdp_5x5_10th,Kdp_5x5_50th,Kdp_5x5_90th,Expected
count,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,...,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0,10147600.0
mean,592540.5,29.53674,10.02512,21.64839,18.72079,21.42921,24.52981,23.19889,20.71512,23.04135,...,1.015786,0.5967037,-0.8102434,0.2171537,2.008086,0.0566148,-3.723776,-0.8694066,3.839131,8.49097
std,340738.3,17.30655,4.013964,9.261549,7.620667,8.930816,10.66492,9.768875,8.233327,9.446,...,0.04890237,1.265262,0.767989,0.7833905,1.446703,3.205379,2.140965,2.036347,3.690846,46.48318
min,2.0,0.0,0.0,-31.0,-32.0,-32.0,-28.5,-32.0,-31.0,-27.5,...,0.2083333,-7.875,-7.875,-7.875,-7.875,-96.04,-80.79,-78.77,-100.2,0.01
25%,296052.0,15.0,7.0,15.9,14.66667,16.0,17.0,16.75,16.0,17.0,...,0.99875,0.09375,-0.875,0.0,1.25,-0.5250015,-3.723776,-0.8694066,2.470001,0.2540001
50%,593165.0,30.0,10.0,21.64839,18.72079,21.42921,24.5,23.19889,20.71512,23.04135,...,1.015786,0.5967037,-0.8102434,0.2171537,2.008086,0.0566148,-3.723776,-0.8694066,3.839131,1.016
75%,890731.0,44.0,13.0,26.75,22.0,26.5,31.5,28.6875,24.5,28.5,...,1.051667,0.8125,-0.5625,0.375,2.303571,0.6999969,-2.820007,0.0,4.580002,3.302002
max,1180945.0,59.0,21.0,71.0,62.5,69.0,72.5,92.5,66.0,71.0,...,1.051667,7.9375,7.9375,7.9375,7.9375,179.75,3.519989,12.8,144.6,993.9025


### Prepare the data by regrouping measurements with the same id

#### Calculating the dimensions of cleaned sets

In [6]:
#We have to make the assumption that the test set has no more steps than the training set
n_steps = rain_train.groupby(['Id']).size().max()
n_samples_train = rain_train.groupby(['Id']).size().shape[0]
n_samples_test = rain_test.groupby(['Id']).size().shape[0]

# Number of features, not counting the Id column
n_inputs = len(rain_test.columns) - 1     
  
print("Max number of steps: ", n_steps)
print("Number of samples", n_samples_train, n_samples_test)
print("Number of inputs: ", n_inputs)

Max number of steps:  19
Number of samples 817708 485477
Number of inputs:  42


### Creating the 3D data sets

In [7]:
# One Id per sample for each set, in the sorted-key order that groupby yields.
# `.values` replaces `as_matrix()`, which no longer exists in pandas >= 1.0.
indices_train = rain_train.groupby(['Id'])['Id'].first().values
indices_test = rain_test.groupby(['Id'])['Id'].first().values

# Target of each training sample: the first `Expected` value found for its Id.
y_train = rain_train.groupby(['Id'])['Expected'].first().values

# The target must not remain among the input features.
rain_train.drop(['Expected'], axis = 1, inplace = True)

In [9]:
# For each sample, create a plane n_steps * n_inputs and padded with 0

# For each sample, create a plane of n_steps * n_inputs values, padded with 0.
# seq_length_train keeps the real (unpadded) number of rows of each sample.
seq_length_train = np.zeros(n_samples_train)
X_train = np.zeros((n_samples_train, n_steps, n_inputs))

# Treating the training set
print("Creating the training set...")

for i, (name, group) in enumerate(rain_train.groupby(['Id'])):

    # Rows are used in the order they already have within the group; they are
    # not re-sorted by minutes_past here.
    # NOTE(review): this relies on the input file being already sorted -- confirm.

    # Remove the Id column. `.values` replaces `as_matrix()`, which no longer
    # exists in pandas >= 1.0.
    A_train = group.drop('Id', axis=1).values

    # Number of rows for this sample: its sequence length
    seq_length_train[i] = A_train.shape[0]

    # Pad with rows of 0 at the end to reach n_steps rows
    X_train[i,:,:] = np.pad(A_train, ((0, n_steps - A_train.shape[0]), (0,0)), 'constant')

    if (i%100000==0):
        print("Treating - step: ",i)

# The frame is no longer needed once the 3D array is built.
del rain_train

Creating the training set...
Treating - step:  0
Treating - step:  100000
Treating - step:  200000
Treating - step:  300000
Treating - step:  400000
Treating - step:  500000
Treating - step:  600000
Treating - step:  700000
Treating - step:  800000


In [10]:
# For each test sample, create a plane of n_steps * n_inputs values, padded
# with 0. seq_length_test keeps the real (unpadded) number of rows per sample.
X_test = np.zeros((n_samples_test, n_steps, n_inputs))
seq_length_test = np.zeros(n_samples_test)

# Treating the test set
print("Creating the test set...")

for i, (name, group) in enumerate(rain_test.groupby(['Id'])):

    # Rows are used in the order they already have within the group; they are
    # not re-sorted by minutes_past here.
    # NOTE(review): this relies on the input file being already sorted -- confirm.

    # Remove the Id column. `.values` replaces `as_matrix()`, which no longer
    # exists in pandas >= 1.0.
    A_test = group.drop('Id', axis=1).values

    # Number of rows for this sample: its sequence length
    seq_length_test[i] = A_test.shape[0]

    # Pad with rows of 0 at the end to reach n_steps rows
    X_test[i,:,:] = np.pad(A_test, ((0, n_steps - A_test.shape[0]), (0,0)), 'constant')

    if (i%100000==0):
        print("Treating - step: ",i)

# The frame is no longer needed once the 3D array is built.
del rain_test

Creating the test set...
Treating - step:  0
Treating - step:  100000
Treating - step:  200000
Treating - step:  300000
Treating - step:  400000


#### Split the training set into train and eval sets and save

In [11]:
# Fraction of the training samples held out for evaluation.
eval_ratio = 0.2

# Fixed seed so that re-running the notebook reproduces the same train/eval
# split. A private RandomState is used so that the global NumPy generator is
# left untouched.
SPLIT_SEED = 42
shuffled_idx = np.random.RandomState(SPLIT_SEED).permutation(n_samples_train)
train_size = int(n_samples_train * (1 - eval_ratio))

# The first `train_size` shuffled positions form the train part, the rest the
# eval part. The same positions are applied to every array so they stay aligned.
train_idx = shuffled_idx[:train_size]
eval_idx = shuffled_idx[train_size:]

np.save("WIR2-X-train-7.npy", X_train[train_idx])
np.save("WIR2-y-train-7.npy", y_train[train_idx])
np.save("WIR2-indices-train-7.npy", indices_train[train_idx])
np.save("WIR2-seq_length-train-7.npy", seq_length_train[train_idx])

np.save("WIR2-X-eval-7.npy", X_train[eval_idx])
np.save("WIR2-y-eval-7.npy", y_train[eval_idx])
np.save("WIR2-indices-eval-7.npy", indices_train[eval_idx])
np.save("WIR2-seq_length-eval-7.npy", seq_length_train[eval_idx])

In [12]:
# Saving the test set

# Write the test-side arrays to disk, one .npy file per array, including the
# Ids that were removed during cleaning (wrong_indices_test).
for out_file, out_array in (
    ("WIR2-X-test-7.npy", X_test),
    ("WIR2-indices-test-7.npy", indices_test),
    ("WIR2-seq_length-test-7.npy", seq_length_test),
    ("WIR2-wrong_indices-test-7.npy", wrong_indices_test),
):
    np.save(out_file, out_array)