In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import ks_2samp

In [2]:
input_file = '../data/training_data/1992-2015_training_data_raw.csv'

# Column -> dtype mapping handed to pandas.read_csv when the raw file is
# loaded. Everything is read as float except the two integer bin columns.
dtypes = dict.fromkeys(
    [
        'air.sfc', 'air.2m', 'apcp', 'crain', 'rhum.2m', 'dpt.2m',
        'pres.sfc', 'uwnd.10m', 'vwnd.10m', 'veg', 'prate', 'vis',
        'lat', 'lon',
    ],
    float,
)
dtypes.update({
    'weather_bin_month': int,
    'weather_bin_year': int,
    'ignition': float,
})

# Columns whose distributions are compared by the KS test when a random
# split is evaluated.
weather_variables = (
    ['weather_bin_month', 'weather_bin_year']
    + ['air.sfc', 'rhum.2m', 'dpt.2m', 'pres.sfc']
    + ['uwnd.10m', 'vwnd.10m', 'veg']
    + ['lat', 'lon', 'ignition']
)

# Distribution plotting variables

# Spacing values passed to fig.subplots_adjust() when the figure is built
left = 0.125   # the left side of the subplots of the figure
right = 0.65   # the right side of the subplots of the figure
bottom = 0.1   # the bottom of the subplots of the figure
top = 0.9      # the top of the subplots of the figure
wspace = 0.2   # the amount of width reserved for blank space between subplots
hspace = 0.3   # the amount of height reserved for white space between subplots

# Subplot grid shape and overall figure size (passed to plt.subplots)
fig_rows = 4
fig_cols = 3
plot_height = 30
plot_width = 30

# Column drawn in each panel. Every list below is parallel to this one:
# entry i of each list describes panel i.
data_types = [
    'weather_bin_month',
    'weather_bin_year',
    'air.sfc',
    'rhum.2m',
    'dpt.2m',
    'pres.sfc',
    'uwnd.10m',
    'vwnd.10m',
    'veg',
    'lat',
    'lon'
]

plot_titles = [
    'Month',
    'Year',
    'Surface air temperature',
    'Relative humidity',
    'Dew point',
    'Surface pressure',
    'U-component of wind',
    'V-component of wind',
    'Vegetation coverage',
    'Latitude',
    'Longitude'
]

weather_variable_labels = [
    'Month',
    'Year',
    'Temp.',
    'Humidity',
    'Dew point',
    'Pressure',
    'Speed',
    'Speed',
    'Percent',
    'Degrees',
    'Degrees'
]

# Panels fill the grid row by row, one (row, col) index per plotted column.
# With 11 columns on a 4x3 grid the last cell, (3, 2), receives no plot.
plot_locations = [
    (row, col) for row in range(fig_rows) for col in range(fig_cols)
][:len(data_types)]

# Every panel currently uses a linear y axis
y_scales = ['linear'] * len(data_types)

xlabels = weather_variable_labels
ylabels = ['Fraction observations'] * len(xlabels)

# The plotting loop indexes all of these lists with the same i, so fail early
# if they ever drift out of sync (or the grid is too small for the columns).
assert (
    len(plot_locations)
    == len(data_types)
    == len(plot_titles)
    == len(xlabels)
    == len(ylabels)
    == len(y_scales)
), "plot configuration lists must all have the same length"

In [6]:
def recursive_sample(df, target_n, samples, p_val_cutoff, variables=None, max_attempts=None):
    """Recursively halve ``df`` until every piece has fewer than ``target_n`` rows.

    Each halving is a random split that is redrawn until the KS p-value of
    every tested column is at least ``p_val_cutoff``. Accepted halves are then
    split again in the same way.

    Args:
        df: Data to split. Must support ``len()`` and be accepted by
            ``random_split``.
        target_n: Pieces with fewer rows than this are kept as final samples;
            anything with ``target_n`` rows or more is split again.
        samples: List that final pieces are appended to. It is mutated in
            place.
        p_val_cutoff: Minimum p-value every tested column must reach for a
            split to be accepted.
        variables: Column names passed to ``ks_test``. Defaults to the
            notebook-level ``weather_variables`` list.
        max_attempts: Optional cap on the number of rejected draws per split.
            ``None`` (the default) retries indefinitely.

    Returns:
        The same ``samples`` list that was passed in.

    Raises:
        RuntimeError: If ``max_attempts`` is set and that many consecutive
            draws are rejected for one split.
    """
    # base condition
    if len(df) < target_n:
        samples.append(df)
        return samples

    # Resolved only when a split is actually needed, so the base case does
    # not depend on the notebook-level list being defined.
    if variables is None:
        variables = weather_variables

    # recursion condition
    sample_size = len(df) // 2
    rejected = 0

    while True:
        left_sample, right_sample = random_split(df, sample_size)
        ks_pvals = ks_test(left_sample, right_sample, variables)

        if all(p_val >= p_val_cutoff for p_val in ks_pvals):
            print("Good split on sample size {}".format(sample_size))
            break

        print("Rejected split on sample size {}".format(sample_size))

        rejected += 1
        if max_attempts is not None and rejected >= max_attempts:
            raise RuntimeError(
                "No split of {} rows reached p >= {} after {} attempts".format(
                    len(df), p_val_cutoff, rejected
                )
            )

    recursive_sample(left_sample, target_n, samples, p_val_cutoff, variables, max_attempts)
    recursive_sample(right_sample, target_n, samples, p_val_cutoff, variables, max_attempts)

    return samples

def random_split(df, sample_size):
    """Randomly partition the rows of ``df`` into two frames.

    A random permutation of all row positions is drawn. The first
    ``sample_size`` positions make up the second returned frame and the
    remaining positions make up the first.

    Returns:
        A ``(left, right)`` tuple where ``right`` has ``sample_size`` rows and
        ``left`` has the rest.
    """
    row_count = len(df)
    shuffled_positions = random.sample(range(row_count), row_count)
    head = shuffled_positions[:sample_size]
    tail = shuffled_positions[sample_size:]

    return df.iloc[tail], df.iloc[head]

def ks_test(left_sample, right_sample, weather_variables):
    """Run a two-sample KS test on each listed column of the two samples.

    Args:
        left_sample: First sample, indexable by column name.
        right_sample: Second sample, indexable by column name.
        weather_variables: Column names to compare.

    Returns:
        A list with one p-value per column, in the order of
        ``weather_variables``.
    """
    return [
        # Element 1 of the ks_2samp result is the p-value
        ks_2samp(np.array(left_sample[name]), np.array(right_sample[name]))[1]
        for name in weather_variables
    ]

def two_sample_density_plot(
    plot_location,
    parent_data,
    sample_data,
    data_type,
    title,
    xlabel,
    ylabel,
    y_scale,
    bins=40,
    axes=None
):
    """Overlay the binned distributions of one column for two data sets.

    Both histograms use the same bin edges, spanning the combined range of
    the two columns, so the per-bin fractions of the two curves are directly
    comparable. Counts are divided by the number of rows in the respective
    data set, and each curve is drawn against the left edge of its bins.

    Args:
        plot_location: Index into the axes grid selecting the panel to draw on.
        parent_data: Data set drawn first, indexable by column name.
        sample_data: Data set drawn second, indexable by column name.
        data_type: Name of the column to plot.
        title: Panel title.
        xlabel: x-axis label.
        ylabel: y-axis label.
        y_scale: Scale name passed to ``set_yscale``.
        bins: Number of histogram bins (default 40).
        axes: Axes grid to draw on. Defaults to the notebook-level ``ax``.

    Returns:
        The axes object that was drawn on.

    Raises:
        ValueError: If either column is empty.
    """
    grid = ax if axes is None else axes
    panel = grid[plot_location]

    parent_column = np.asarray(parent_data[data_type])
    sample_column = np.asarray(sample_data[data_type])
    if parent_column.size == 0 or sample_column.size == 0:
        raise ValueError(
            "Cannot plot '{}': parent or sample data is empty".format(data_type)
        )

    # One common range for both histograms; otherwise each data set would get
    # its own bin widths and the fractions could not be compared bin by bin.
    shared_range = (
        min(parent_column.min(), sample_column.min()),
        max(parent_column.max(), sample_column.max()),
    )
    parent_counts, bin_edges = np.histogram(parent_column, bins=bins, range=shared_range)
    sample_counts, _ = np.histogram(sample_column, bins=bins, range=shared_range)

    panel.plot(bin_edges[:-1], parent_counts / len(parent_data))
    panel.plot(bin_edges[:-1], sample_counts / len(sample_data))
    panel.tick_params(labelsize=12)
    panel.set_title(title, fontsize=18)
    panel.set_xlabel(xlabel, fontsize=14)
    panel.set_ylabel(ylabel, fontsize=15)
    panel.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    panel.set_yscale(y_scale)

    return panel

In [4]:
# Load the raw training data and discard the columns that are not used later.
# 'rhum.2m' is kept on purpose: it appears in both weather_variables (KS test)
# and data_types (density plots), so dropping it would raise a KeyError there.
# drop() returns a new frame, so re-running this cell is safe.
master_df = pd.read_csv(input_file, dtype=dtypes).drop(columns=[
    'air.2m',
    'apcp',
    'crain',
    'prate',
    'vis'
])

In [None]:
# input_file is already defined in the configuration cell and is not
# redefined here, so there is a single place to change the data path.
output_file_base_name = "../data/stratified_training_data/1992-2015_training_data_raw_"

target_n = 100000
p_val_cutoff = 0.3

# The splits are drawn with the `random` module; seed it so that re-running
# the notebook reproduces the same stratified samples.
random_seed = 42
random.seed(random_seed)

samples = recursive_sample(master_df, target_n, [], p_val_cutoff)

In [None]:
# Write every stratified sample to its own CSV. The running number in the file
# name starts at 1 and increases per sample, so no file overwrites another.
for i, sample in enumerate(samples, start=1):
    output_file_name = "{}n{}_ks_pval{}.{}.csv".format(
        output_file_base_name, target_n, p_val_cutoff, i
    )
    sample.to_csv(output_file_name, index=False)

In [None]:
# Make density plots
fig, ax = plt.subplots(fig_rows, fig_cols, figsize=(plot_width, plot_height))
fig.subplots_adjust(left=left, bottom=bottom, right=right, top=top, wspace=wspace, hspace=hspace)

# One panel per variable, comparing the full data set with the second
# stratified sample. The helper draws on the axes in place, so its result is
# not assigned back into the ``ax`` grid.
panel_settings = zip(plot_locations, data_types, plot_titles, xlabels, ylabels, y_scales)

for location, data_type, title, xlabel, ylabel, y_scale in panel_settings:
    two_sample_density_plot(location, master_df, samples[1], data_type, title, xlabel, ylabel, y_scale)

# Hide grid cells that received no plot so the figure shows no empty frames
for row in range(fig_rows):
    for col in range(fig_cols):
        if (row, col) not in plot_locations:
            ax[row, col].set_visible(False)

plt.show()