In [29]:
"""
The purpose of this file is to take in the entire data for Ann Arbor Airport, resample and scale it to 0->1 range
and output it as a .csv file for the other notebooks to process further. 

The feature analysis notebook will extract the most meaningful features and remove multicollinearity

The weather data was collected from NOAA's Climate Data Online (CDO). The power outage labels were entered manually.
"""
import pandas as pd
import pandas_ml as pdml
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA


In [30]:
# Read the raw export and substitute 0 for every missing value in one pass.
df = pd.read_csv('data.csv').fillna(0)
# Parse the DATE column into proper datetime values (column position is kept).
df = df.assign(DATE=pd.to_datetime(df['DATE']))

In [31]:
# Derive a squared version of each wind-speed measurement.
# Maps: new squared column -> source column it is computed from.
squared_from = {
    'Avg Wind Speed Squared': 'Average Wind Speed',
    '5 second wind speed squared': 'Fastest 5 second wind speed',
    '2 min wind speed squared': 'Fastest 2 minute wind speed',
}
for squared_col, source_col in squared_from.items():
    df[squared_col] = np.square(df[source_col])

# Keep only the modelling columns, target first, each squared feature placed
# right after its source column. Columns not listed here (e.g. DATE) are dropped.
ordered_columns = [
    'Power Outage',
    'PRCP',
    'TMAX',
    'TMIN',
    'Fastest 5 second wind speed',
    '5 second wind speed squared',
    'Fastest 2 minute wind speed',
    '2 min wind speed squared',
    'Average Wind Speed',
    'Avg Wind Speed Squared',
    'Fog/Ice',
    'Heavy/Freezing Fog',
    'Thunder',
    'Smoke/Haze',
]
df = df[ordered_columns]


In [32]:
# Under-sample the majority class with Cluster Centroids so that both
# 'Power Outage' classes end up with the same number of rows.
#
# The sampler is seeded explicitly: Cluster Centroids relies on a randomly
# initialised clustering step, so without a fixed random_state the balanced
# dataset (and everything derived from it downstream) would differ on every
# Restart & Run All.
RANDOM_STATE = 42

# Wrap the frame so pandas_ml knows which column is the target label.
df_ML = pdml.ModelFrame(df, target = 'Power Outage')

print("Initial Counts")
print(df_ML.target.value_counts())

sampler = df_ML.imbalance.under_sampling.ClusterCentroids(random_state=RANDOM_STATE)
sampled = df_ML.fit_sample(sampler)

print("After sampling Counts:")
print(sampled.target.value_counts())

# Convert the ModelFrame back to a plain DataFrame for the following cells.
finalData = pd.DataFrame(sampled)
finalData.head()

Initial Counts
0.0    2751
1.0     159
Name: Power Outage, dtype: int64
After sampling Counts:
1.0    159
0.0    159
Name: Power Outage, dtype: int64


Unnamed: 0,Power Outage,PRCP,TMAX,TMIN,Fastest 5 second wind speed,5 second wind speed squared,Fastest 2 minute wind speed,2 min wind speed squared,Average Wind Speed,Avg Wind Speed Squared,Fog/Ice,Heavy/Freezing Fog,Thunder,Smoke/Haze
0,0.0,0.0,85.0,53.0,83.0,6889.0,32.0,1024.0,2.91,8.4681,1.0,1.0,0.0,1.0
1,0.0,0.087143,45.028571,26.228571,29.357143,862.007143,21.72,471.888,10.692857,115.096254,0.342857,0.028571,0.028571,0.228571
2,0.0,0.0,52.0,35.0,172.9,29894.41,36.9,1361.61,15.88,252.1744,0.0,0.0,0.0,0.0
3,0.0,0.46,62.0,38.0,136.0,18496.0,21.9,479.61,9.4,88.36,1.0,0.0,0.0,0.0
4,0.0,0.054737,75.263158,49.084211,21.483158,461.727474,15.868421,252.254211,5.952632,37.603141,0.326316,0.157895,0.052632,0.105263


In [33]:
# This was our initial approach (random under-sampling of the no-outage days), but it gave substantially lower accuracy.
# Random Split
#Split the data in two parts.
# powerOutages = df.loc[df['Power Outage'] == 1];
# noPowerOutages = df.loc[df['Power Outage'] == 0];

# zeroDays = noPowerOutages.sample(159);
# finalData = pd.concat([powerOutages,zeroDays]);
# finalData = finalData.sample(318);


In [34]:
# Rescale every column (target included) to the 0->1 range with min-max scaling,
# then rebuild the DataFrame so the original index and column labels are kept.
min_max_scaler = preprocessing.MinMaxScaler()
finalData = pd.DataFrame(
    min_max_scaler.fit_transform(finalData.values),
    index=finalData.index,
    columns=finalData.columns,
)

In [35]:
# Write the balanced, scaled table to disk; the index is not written.
# NOTE(review): the file name has no .csv extension. It is kept as-is because
# other notebooks presumably load it by this exact name (confirm).
finalData.to_csv(path_or_buf='Clustered Data', index=False)