## Import Statements

In [15]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

import random

# One shared seed for every source of randomness in this notebook: the stdlib
# and NumPy global generators here, and the `random_state` arguments further down.
r = 1337
random.seed(r)
np.random.seed(r)

## Read in Data

In [16]:
# Load the combined dataset, then separate the target column from the features.
train_test = pd.read_json('data/train-test.json')
labels = train_test.loc[:, ['label']]           # one-column DataFrame holding the target
train_test = train_test.drop(columns='label')   # everything else stays as the feature table

### Split Data

In [26]:
# Split first: the statistics below are computed on the training portion only.
# X_test / y_test are produced here but not touched again in this cell.

train_size = 0.75

# Flatten the one-column label frame once; the same 1-D vector is both the
# target passed to the splitter and the key used for stratification.
y_all = labels.values.ravel()

X_train, X_test, y_train, y_test = train_test_split(
    train_test.values,
    y_all,
    train_size=train_size,
    shuffle=True,
    random_state=r,
    stratify=y_all,
)

# The imputer is fitted on the training rows only and then fills their gaps.
imp = IterativeImputer(max_iter=100, random_state=r)
X_train = imp.fit_transform(X_train)

# Rebuild a labelled DataFrame: feature columns keep their original names and
# the class label goes in as the first column.
train_set = pd.DataFrame(X_train, columns=train_test.columns)
train_labels = pd.DataFrame(y_train, columns=['labels'])
# DataFrame.insert documents `value` as a scalar, Series or array-like, so pass
# the 1-D label array itself rather than the one-column DataFrame wrapper.
train_set.insert(loc=0, column='labels', value=y_train)

In [27]:
# Preview the first rows of the training frame (label in the first column).
train_set.head(n=5)

Unnamed: 0,labels,turbidity,air_temp,rel_hum,wind_speed,chlor,phycocyanin,do_raw,do_sat,do_wtemp,...,sin_wind_dir,DAILYPrecip_one_day,DAILYPrecip_three_day,DAILYPrecip_one_week,COUNTPrecip_one_day,COUNTPrecip_three_day,COUNTPrecip_one_week,algalBloomSheen_one_day,algalBloomSheen_three_day,algalBloomSheen_one_week
0,0,76.5,18.194639,85.623812,6.647729,1333.814634,727.461111,8.407875,100.761181,24.462236,...,-0.54235,1.52,1.86,1.86,1.0,2.0,2.0,0.0,0.0,0.0
1,0,44.0,21.869322,73.530964,6.514962,1976.1174,716.08798,7.744528,84.316171,19.481965,...,-0.980666,0.0,0.0,0.01,0.0,0.0,1.0,0.0,0.0,0.0
2,1,57.18,22.036615,72.450965,3.860514,5079.430556,1037.593333,12.263011,143.003298,22.899222,...,0.107657,0.0,0.03,1.6,0.0,1.0,5.0,1.0,3.0,6.0
3,0,117.142857,25.435972,79.110257,4.278528,1131.573611,690.301667,8.287181,100.2425,23.260826,...,-0.553846,0.47,2.11,3.19,1.0,3.0,5.0,1.0,1.0,1.0
4,0,88.48598,12.654402,65.33677,3.610094,6524.554788,597.915157,13.355924,119.668294,9.075564,...,0.007835,0.0,0.89,1.68,0.0,2.0,6.0,0.0,0.0,0.0


## Get Statistics for Each Class

In [35]:
# Partition the training rows by class label so each class can be summarised on its own.
pos = train_set[train_set['labels'].eq(1)]    # DataFrame of positive examples
neg = train_set[train_set['labels'].eq(0)]    # DataFrame of negative examples

In [44]:
# Statistics of interest for each class. The row labels and the reducers that
# produce them live in one mapping so the two cannot drift out of sync.
# NumPy reducers are used on purpose: np.std is the population estimate
# (ddof=0), whereas pandas' own .std() defaults to ddof=1.
stat_funcs = {
    'min': np.min,
    'max': np.max,
    'mean': np.mean,
    'median': np.median,
    'std': np.std,
}
stats = list(stat_funcs)


def class_stats(df):
    """Summarise every column of ``df`` with the reducers in ``stat_funcs``.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to summarise (here: the examples of a single class).

    Returns
    -------
    pd.DataFrame
        Float table with one row per statistic (in ``stats`` order) and one
        column per column of ``df``.
    """
    values = np.array(
        [
            [func(df.iloc[:, i]) for i in range(df.shape[1])]
            for func in stat_funcs.values()
        ],
        dtype=float,
    )
    return pd.DataFrame(values, index=stats, columns=df.columns)


pos_stat_df = class_stats(pos)
neg_stat_df = class_stats(neg)

# Plain-array views, kept under their original names for any cell that indexes them.
pos_stat = pos_stat_df.to_numpy()
neg_stat = neg_stat_df.to_numpy()

# Lift the display limits only inside this block so the wide tables print in full.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print('Positive Class Statistics')
    display(pos_stat_df)
    print('\n\nNegative Class Statistics')
    display(neg_stat_df)
    # Element-wise |positive - negative| for each statistic and column.
    print('\n\nAbsolute Difference Between Class Statistics')
    display((pos_stat_df - neg_stat_df).abs())

Positive Class Statistics


Unnamed: 0,labels,turbidity,air_temp,rel_hum,wind_speed,chlor,phycocyanin,do_raw,do_sat,do_wtemp,pco2_ppm,par,par_below,DAILYMaximumDryBulbTemp,DAILYMinimumDryBulbTemp,DAILYDeptFromNormalAverageTemp,DAILYAverageStationPressure,cos_month,sin_month,cos_wind_dir,sin_wind_dir,DAILYPrecip_one_day,DAILYPrecip_three_day,DAILYPrecip_one_week,COUNTPrecip_one_day,COUNTPrecip_three_day,COUNTPrecip_one_week,algalBloomSheen_one_day,algalBloomSheen_three_day,algalBloomSheen_one_week
min,1.0,33.0,8.471084,46.308348,1.019493,637.108264,380.548056,6.666889,78.837326,6.360753,55.563238,84.368958,-58.112548,51.0,32.0,-12.5,28.46,-1.0,-1.0,-0.999929,-0.999621,0.0,-8.465451e-16,-9.35016e-16,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,120.0,27.162569,94.190174,7.45241,12023.658333,2884.347601,19.885097,196.1425,27.455063,492.842639,1066.721304,453.326667,95.0,75.0,20.1,29.34,-1.83697e-16,0.8660254,0.999487,0.999992,3.78,3.92,6.8,1.0,3.0,7.0,1.0,3.0,7.0
mean,1.0,90.178346,21.049978,72.702368,3.263397,2421.319896,905.942628,10.819052,124.401148,21.735282,191.821763,490.155268,105.85214,78.284163,57.346154,1.780502,29.047923,-0.7597336,-0.262495,0.036285,0.034344,0.17,0.4683077,1.142769,0.430769,1.330769,3.2,0.615385,1.776923,4.053846
median,1.0,92.251627,21.777986,72.220264,3.173291,1442.252247,789.615451,10.78105,124.864028,22.450467,170.675935,493.982014,90.983359,79.0,59.0,0.75,29.05,-0.8660254,1.224647e-16,0.022242,0.05322,0.0,0.175,0.77,0.0,1.0,3.0,1.0,2.0,4.5
std,0.0,20.112602,3.677636,9.589446,1.30377,2306.040867,408.912489,2.01114,21.428531,3.952808,101.037077,161.32672,83.575997,8.392535,8.772081,6.628948,0.130924,0.3218292,0.500327,0.627714,0.677714,0.457539,0.6964185,1.207039,0.495184,1.018207,1.642699,0.486504,1.076071,2.072945




Negative Class Statistics


Unnamed: 0,labels,turbidity,air_temp,rel_hum,wind_speed,chlor,phycocyanin,do_raw,do_sat,do_wtemp,pco2_ppm,par,par_below,DAILYMaximumDryBulbTemp,DAILYMinimumDryBulbTemp,DAILYDeptFromNormalAverageTemp,DAILYAverageStationPressure,cos_month,sin_month,cos_wind_dir,sin_wind_dir,DAILYPrecip_one_day,DAILYPrecip_three_day,DAILYPrecip_one_week,COUNTPrecip_one_day,COUNTPrecip_three_day,COUNTPrecip_one_week,algalBloomSheen_one_day,algalBloomSheen_three_day,algalBloomSheen_one_week
min,0.0,35.0,5.294792,36.113974,1.038654,524.991208,349.353958,0.0,0.0,5.338458,49.554194,4.625418,-5.569237,44.0,26.0,-15.7,28.62,-1.0,-1.0,-0.999993,-0.999851,0.0,-8.881784e-16,-2.220446e-16,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,120.584917,27.478125,95.884583,7.908236,16211.541667,1264.133333,16.401681,173.155278,27.535035,1249.098025,925.902139,404.170053,92.0,72.0,22.7,29.47,0.5,0.866025,0.999977,1.0,2.74,3.22,4.79,1.0,3.0,7.0,1.0,3.0,7.0
mean,0.0,90.605846,18.794235,71.778401,3.882032,2854.615292,679.935923,10.027511,108.942047,19.792162,261.837152,443.738111,104.244823,74.756972,54.223108,1.626295,29.07251,-0.540821,-0.315941,-0.031738,0.027074,0.147211,0.4745817,1.130279,0.50996,1.482072,3.36255,0.163347,0.573705,1.442231
median,0.0,89.331716,20.189792,71.225833,3.686833,1811.426389,672.31505,9.827264,108.793681,22.440325,244.324931,445.494028,88.860625,77.0,55.0,1.3,29.07,-0.5,-0.5,-0.021344,0.034169,0.01,0.1,1.04,1.0,1.0,3.0,0.0,0.0,1.0
std,0.0,20.387647,5.235588,9.925587,1.447922,2691.190468,165.831144,2.148292,18.830975,5.936913,158.332953,152.715013,82.489179,10.241346,10.358803,6.6513,0.147644,0.389532,0.675247,0.659912,0.692559,0.361574,0.7048535,1.031444,0.499901,1.01908,1.604287,0.369682,0.868636,1.885056


Unnamed: 0,labels,turbidity,air_temp,rel_hum,wind_speed,chlor,phycocyanin,do_raw,do_sat,do_wtemp,pco2_ppm,par,par_below,DAILYMaximumDryBulbTemp,DAILYMinimumDryBulbTemp,DAILYDeptFromNormalAverageTemp,DAILYAverageStationPressure,cos_month,sin_month,cos_wind_dir,sin_wind_dir,DAILYPrecip_one_day,DAILYPrecip_three_day,DAILYPrecip_one_week,COUNTPrecip_one_day,COUNTPrecip_three_day,COUNTPrecip_one_week,algalBloomSheen_one_day,algalBloomSheen_three_day,algalBloomSheen_one_week
min,1.0,2.0,3.176293,10.194373,0.019161,112.117056,31.194097,6.666889,78.837326,1.022295,6.009045,79.74354,52.543311,7.0,6.0,3.2,0.16,0.0,0.0,6.4e-05,0.00023,0.0,4.1633360000000003e-17,7.129713e-16,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.584917,0.315556,1.69441,0.455826,4187.883333,1620.214268,3.483417,22.987222,0.079972,756.255386,140.819165,49.156613,3.0,3.0,2.6,0.13,0.5,0.0,0.00049,7e-06,1.04,0.7,2.01,0.0,0.0,0.0,0.0,0.0,0.0
mean,1.0,0.4275,2.255743,0.923968,0.618635,433.295397,226.006705,0.791542,15.4591,1.94312,70.015389,46.417156,1.607317,3.527191,3.123046,0.154207,0.024587,0.218912,0.053446,0.068023,0.00727,0.022789,0.006273981,0.01249035,0.079191,0.151302,0.16255,0.452038,1.203218,2.611615
median,1.0,2.91991,1.588194,0.994431,0.513543,369.174141,117.300401,0.953786,16.070347,0.010142,73.648995,48.487986,2.122734,2.0,4.0,0.55,0.02,0.366025,0.5,0.043586,0.019052,0.01,0.075,0.27,1.0,0.0,0.0,1.0,2.0,3.5
std,0.0,0.275045,1.557951,0.336142,0.144152,385.149601,243.081344,0.137152,2.597556,1.984105,57.295876,8.611707,1.086817,1.848811,1.586722,0.022351,0.016721,0.067703,0.17492,0.032198,0.014845,0.095965,0.008435034,0.1755946,0.004717,0.000873,0.038412,0.116823,0.207436,0.187889
