In [1]:
%matplotlib widget
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import datetime
import re
from mlxtend.preprocessing import minmax_scaling
import pickle
import math
from data_functions import *
from sklearn import preprocessing

In [2]:
solar_data_filename = '/Users/brad/Desktop/CS6620/Project/Data/Sample.csv' 
weather_data_filename = '/Users/brad/Desktop/CS6620/Project/Data/correct_weather.csv' 

In [3]:
solarData = pd.read_csv(solar_data_filename, sep=',')

In [4]:
solarData.columns

Index(['Unnamed: 0', 'TIME', 'Solar_power', 'Meter_value'], dtype='object')

In [5]:
solarData.head()

Unnamed: 0.1,Unnamed: 0,TIME,Solar_power,Meter_value
0,0,2019-07-11 13:11:55,-536674100.0,5.342115
1,1,2019-07-11 13:13:30,-536580800.0,5.352649
2,2,2019-07-11 13:13:38,-536866700.0,5.352649
3,3,2019-07-11 13:13:45,-536960300.0,5.352649
4,4,2019-07-11 13:13:53,-537152600.0,5.352649


In [6]:
# times = datetime.datetime.strptime(solarData['TIME'][0], '%Y-%m-%d %H:%M:%S').time()

In [7]:
# Seperate date and time using datetime from pandas 
solarData['Date']= pd.to_datetime(solarData['TIME']).dt.date
solarData['Time']= pd.to_datetime(solarData['TIME']).dt.time

In [8]:
# We Don't need the TIME or meter_value anymore 
solarData = solarData.drop(['TIME', 'Meter_value', 'Unnamed: 0'], axis=1)
solarData.head()

Unnamed: 0,Solar_power,Date,Time
0,-536674100.0,2019-07-11,13:11:55
1,-536580800.0,2019-07-11,13:13:30
2,-536866700.0,2019-07-11,13:13:38
3,-536960300.0,2019-07-11,13:13:45
4,-537152600.0,2019-07-11,13:13:53


In [9]:
# According to Shaju, from ASPIRE, negative values are corrupted
solarData['Solar_power'] = solarData['Solar_power'].apply(lambda val : val if val > 0 else math.nan)

In [10]:
solarData['Solar_power'] = solarData['Solar_power'].apply(lambda val : val if val < 60000 else math.nan)

In [11]:
plt.figure()
plt.hist(solarData['Solar_power'], bins=30)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [12]:
solarData.head()

Unnamed: 0,Solar_power,Date,Time
0,,2019-07-11,13:11:55
1,,2019-07-11,13:13:30
2,,2019-07-11,13:13:38
3,,2019-07-11,13:13:45
4,,2019-07-11,13:13:53


In [13]:
print(solarData['Solar_power'].min())
print(solarData['Solar_power'].max())
print(solarData['Solar_power'].median())
print()
print(solarData['Solar_power'].mean())
print(solarData['Solar_power'].var())
print(solarData['Solar_power'].std())

740.6687727956357
59999.84153846734
17083.460421130927

20999.818438253504
222476006.26398286
14915.629596633957


In [14]:
len( solarData['Solar_power'] )

1375826

In [15]:
len( solarData['Date'] )

1375826

In [16]:
# Plot a histogram of individual generation to get a feel for the distribution
x = np.arange(1375826)
plt.figure()
plt.hist(solarData['Solar_power'], bins=30, edgecolor='black', linewidth=1)
plt.xlabel("power output (watts")
plt.ylabel("Number of occurences")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [17]:
# Now need to accumulate power per hour 
hour = -1
# This will contain a list of tuples where 0th index is date, 1st is hour of day (24 - hour based), 2nd is total generation per hour
date_hour_generation = [] 

# This is how many hours had at least 1 corrupted/garbage value 
numAdjusted = 0

# loop through all the data, 
# NaN values in an hour will be replaced with the average for the non-NaN values in the hour 
i = 0
while i < len(solarData['Solar_power']):
    hourTotal = 0
    numNaN = 0
    numTotal = 0
    hour = solarData['Time'][i].hour
    # get statistics for this single hour 
    while i < len(solarData['Solar_power']) and hour == solarData['Time'][i].hour:
        numTotal += 1 
        if math.isnan(solarData['Solar_power'][i]):
            numNaN += 1 
        else: 
            hourTotal += solarData['Solar_power'][i]
        i += 1
    # end while inner 
    if numNaN == 0:
        date_hour_generation.append((solarData['Date'][i -1], hour, hourTotal))
    elif numNaN == numTotal: # all values of an hour are garbage, skip 
        continue
    else:
        numAdjusted += 1 
        notNaN = numTotal - numNaN
        avg = hourTotal / notNaN
        hourTotal = avg * numTotal 
        date_hour_generation.append((solarData['Date'][i-1], hour, hourTotal))

# end while outer 



In [18]:
print(f'Number of hours adjusted: {numAdjusted}')
print(f'Total number of hours computed: {len(date_hour_generation)}')
print(f'Percentage of hours with some degree of adjustment: {( (numAdjusted / len(date_hour_generation) )* 100 ) :.2f}%')

Number of hours adjusted: 2513
Total number of hours computed: 5601
Percentage of hours with some degree of adjustment: 44.87%


In [19]:
# sData is the cumulated generation data frame 
sData = pd.DataFrame(date_hour_generation, columns=['Date','Hour', 'Generated'])
sData.tail()

Unnamed: 0,Date,Hour,Generated
5596,2021-03-08,17,883388.264795
5597,2021-03-08,18,535842.29323
5598,2021-03-09,8,528051.160382
5599,2021-03-09,9,557653.033407
5600,2021-03-09,10,407960.580239


In [20]:
sData.head()

Unnamed: 0,Date,Hour,Generated
0,2019-07-11,13,19005150.0
1,2019-07-11,14,25139280.0
2,2019-07-11,16,20209070.0
3,2019-07-11,17,15964920.0
4,2019-07-11,18,10714620.0


In [21]:
x = np.arange(5601)
plt.figure()
plt.hist(sData['Generated'], bins=30, edgecolor='black', linewidth=1)
plt.xlabel("power output")
plt.ylabel("Number of occurences")
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [22]:
print(sData['Generated'].min())
print(sData['Generated'].max())
print(sData['Generated'].median())
print()
print(sData['Generated'].mean())
print(sData['Generated'].var())
print(sData['Generated'].std())

6801.6095491037695
25778214.886890035
2844872.2990282103

4459052.625758316
20456529549647.016
4522889.51331414


In [23]:
weatherpf = pd.read_csv(weather_data_filename, sep=',')
weatherpf.columns

Index(['date_time', 'station_id', 'rh', 'eto', 'etr', 'rso', 'ea_avg',
       'precip', 'rh_max', 'rh_min', 'rh_tmn', 'rh_tmx', 'td_avg', 'td_max',
       'td_min', 'td_tmn', 'td_tmx', 'co2_avg', 'ppf_avg', 'solarmj',
       'wet_pct', 'airt_avg', 'airt_max', 'airt_min', 'airt_tmn', 'airt_tmx',
       'lwdn_avg', 'lwup_avg', 'pressure', 'swdn_avg', 'swup_avg', 'windd_sd',
       'windd_avg', 'winds_avg', 'winds_max', 'winds_tmx', 'netrad_avg',
       'fan_rmp_avg', 'visibilitymiles', 'batt_volt_min', 'surfacet1_avg',
       'surfacet2_avg', 'visibilitymiles_avg', 'pressurekpasealevel'],
      dtype='object')

In [24]:
weatherpf.head()

Unnamed: 0,date_time,station_id,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,...,winds_max,winds_tmx,netrad_avg,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel
0,2019-07-11 00:59:59,1279257,30,0.002,0.003,0.0,0.741,0.0,31,29,...,18.119,11JUL2019:00:47:39,-63.37,4252,19.884,11.79,60.8,60.3,19.822,101.6
1,2019-07-11 01:59:59,1279257,31,0.002,0.003,0.0,0.732,0.0,32,30,...,13.198,11JUL2019:01:56:17,-61.05,4263,19.884,11.79,58.5,58.3,19.822,101.6
2,2019-07-11 02:59:59,1279257,32,0.001,0.002,0.0,0.716,0.0,33,30,...,17.448,11JUL2019:02:41:18,-61.61,4275,19.884,11.82,58.3,58.1,19.822,101.6
3,2019-07-11 03:59:59,1279257,32,0.002,0.002,0.0,0.703,0.0,33,30,...,16.553,11JUL2019:03:02:49,-60.99,4241,19.635,11.78,57.6,57.4,19.822,101.6
4,2019-07-11 04:59:59,1279257,32,0.001,0.002,0.0,0.686,0.0,33,30,...,14.987,11JUL2019:04:52:07,-60.79,4231,19.884,11.81,56.8,56.8,19.884,101.7


In [25]:
# we can't feed the date strings into our models, lets just get rid of them
# the data is all from the same weather station. We don't need that either 
toDrop = ['station_id', 'rh_tmn', 'rh_tmx', 'td_tmn', 'td_tmx', 'airt_tmn', 'airt_tmx', 'winds_tmx']
weatherpf = weatherpf.drop(toDrop, axis=1)
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,winds_avg,winds_max,netrad_avg,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel
0,2019-07-11 00:59:59,30,0.002,0.003,0.0,0.741,0.0,31,29,36.9,...,5.369,18.119,-63.37,4252,19.884,11.79,60.8,60.3,19.822,101.6
1,2019-07-11 01:59:59,31,0.002,0.003,0.0,0.732,0.0,32,30,36.5,...,3.803,13.198,-61.05,4263,19.884,11.79,58.5,58.3,19.822,101.6
2,2019-07-11 02:59:59,32,0.001,0.002,0.0,0.716,0.0,33,30,36.0,...,4.698,17.448,-61.61,4275,19.884,11.82,58.3,58.1,19.822,101.6
3,2019-07-11 03:59:59,32,0.002,0.002,0.0,0.703,0.0,33,30,35.4,...,4.25,16.553,-60.99,4241,19.635,11.78,57.6,57.4,19.822,101.6
4,2019-07-11 04:59:59,32,0.001,0.002,0.0,0.686,0.0,33,30,34.9,...,4.026,14.987,-60.79,4231,19.884,11.81,56.8,56.8,19.884,101.7


In [26]:
# Seperate date and time using datetime from pandas 
weatherpf['Date']= pd.to_datetime(weatherpf['date_time']).dt.date
weatherpf['Hour']= pd.to_datetime(weatherpf['date_time'][:]).dt.hour

In [27]:
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,netrad_avg,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel,Date,Hour
0,2019-07-11 00:59:59,30,0.002,0.003,0.0,0.741,0.0,31,29,36.9,...,-63.37,4252,19.884,11.79,60.8,60.3,19.822,101.6,2019-07-11,0
1,2019-07-11 01:59:59,31,0.002,0.003,0.0,0.732,0.0,32,30,36.5,...,-61.05,4263,19.884,11.79,58.5,58.3,19.822,101.6,2019-07-11,1
2,2019-07-11 02:59:59,32,0.001,0.002,0.0,0.716,0.0,33,30,36.0,...,-61.61,4275,19.884,11.82,58.3,58.1,19.822,101.6,2019-07-11,2
3,2019-07-11 03:59:59,32,0.002,0.002,0.0,0.703,0.0,33,30,35.4,...,-60.99,4241,19.635,11.78,57.6,57.4,19.822,101.6,2019-07-11,3
4,2019-07-11 04:59:59,32,0.001,0.002,0.0,0.686,0.0,33,30,34.9,...,-60.79,4231,19.884,11.81,56.8,56.8,19.884,101.7,2019-07-11,4


In [28]:
type(weatherpf['Hour'][0])

numpy.int64

In [29]:
# The weather sample is taken at the last minute of the hour, so the hour is one off 
weatherpf['Hour'] = (weatherpf['Hour'] + 1 )% 24

we want to combine the generation data to the weather data at the same time
we are going to predict the previous hours' weather to predict the next hour 
i.e. weather[hour_i] will try to predict generation[hour_i]
A BETTER WAY TO DO THIS:
set the date_hour as the index and then just join the two df's

In [30]:
Generated = np.full((len(weatherpf), 1), math.nan)

In [31]:
print(len(sData))
print(len(Generated))


5601
14171


In [32]:

wi = 0 # weather index
notFound = 0
nf = []
genInd = 0 # generation index
# loop through all the generation

lastWI = 0
while genInd < len(sData):
    added = False
    wi = lastWI
    while wi < len(weatherpf) and not added:
        # hour / date matches! 
        if weatherpf['Date'][wi] == sData['Date'][genInd] and weatherpf['Hour'][wi] == sData['Hour'][genInd]:
            Generated[wi] = sData['Generated'][genInd]
            genInd += 1
            lastWI = wi
            added = True
        else:
            wi += 1   
    if not added:
        notFound += 1
        nf.append((sData['Date'][genInd], sData['Hour'][genInd]))
        # print((sData['Date'][genInd], sData['Hour'][genInd]))
        genInd += 1
      



In [33]:
print(len(nf))

155


In [34]:
weatherpf['Generated'] = Generated
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel,Date,Hour,Generated
0,2019-07-11 00:59:59,30,0.002,0.003,0.0,0.741,0.0,31,29,36.9,...,4252,19.884,11.79,60.8,60.3,19.822,101.6,2019-07-11,1,
1,2019-07-11 01:59:59,31,0.002,0.003,0.0,0.732,0.0,32,30,36.5,...,4263,19.884,11.79,58.5,58.3,19.822,101.6,2019-07-11,2,
2,2019-07-11 02:59:59,32,0.001,0.002,0.0,0.716,0.0,33,30,36.0,...,4275,19.884,11.82,58.3,58.1,19.822,101.6,2019-07-11,3,
3,2019-07-11 03:59:59,32,0.002,0.002,0.0,0.703,0.0,33,30,35.4,...,4241,19.635,11.78,57.6,57.4,19.822,101.6,2019-07-11,4,
4,2019-07-11 04:59:59,32,0.001,0.002,0.0,0.686,0.0,33,30,34.9,...,4231,19.884,11.81,56.8,56.8,19.884,101.7,2019-07-11,5,


In [35]:
# remove rows that don't have a generation value 
weatherpf.dropna(axis=0, inplace=True)
weatherpf.head()

Unnamed: 0,date_time,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,...,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel,Date,Hour,Generated
12,2019-07-11 12:59:59,20,0.027,0.031,3.518,0.968,0.0,25,15,43.5,...,4511,17.771,12.55,92.1,95.0,18.765,101.6,2019-07-11,13,19005150.0
13,2019-07-11 13:59:59,18,0.029,0.033,3.472,0.848,0.0,20,15,40.3,...,4490,12.676,12.55,91.0,94.8,18.765,101.5,2019-07-11,14,25139280.0
15,2019-07-11 15:59:59,16,0.027,0.031,2.782,0.842,0.0,18,13,40.1,...,4470,19.884,12.55,86.7,93.9,19.014,101.4,2019-07-11,16,20209070.0
16,2019-07-11 16:59:59,16,0.022,0.026,2.202,0.782,0.0,19,13,38.1,...,4469,19.884,12.53,85.1,88.9,19.573,101.3,2019-07-11,17,15964920.0
17,2019-07-11 17:59:59,23,0.017,0.02,1.534,0.826,0.0,32,14,39.6,...,4399,19.884,12.23,83.5,78.1,19.014,101.3,2019-07-11,18,10714620.0


## Add astral data to input


In [36]:
datetime.strptime(weatherpf['date_time'][12], "%Y-%m-%d %H:%M:%S")

datetime.datetime(2019, 7, 11, 12, 59, 59)

In [37]:
# use the class I wrote 
loganAstral = LoganAstral()
# list for elevation, azimuth, zenith 
ElAzZe = []

for dtstr in weatherpf['date_time']:
    dt = datetime.strptime(dtstr, "%Y-%m-%d %H:%M:%S")
    ElAzZe.append(loganAstral.computeElAzZe(dt))


In [38]:
elevationAzimuthZenith = np.array(ElAzZe)

In [39]:
weatherpf['Elevation'] = elevationAzimuthZenith[:, 0]
weatherpf['Azimuth'] = elevationAzimuthZenith[:, 1]
weatherpf['Zenith'] = elevationAzimuthZenith[:, 2] 

In [40]:
# we will drop the date, but lets keep the month, hour of day, and day of month
weatherpf['Month'] = pd.to_datetime(weatherpf['date_time']).dt.month
weatherpf['Day'] = pd.to_datetime(weatherpf['date_time']).dt.day

In [41]:
weatherpf.columns

Index(['date_time', 'rh', 'eto', 'etr', 'rso', 'ea_avg', 'precip', 'rh_max',
       'rh_min', 'td_avg', 'td_max', 'td_min', 'co2_avg', 'ppf_avg', 'solarmj',
       'wet_pct', 'airt_avg', 'airt_max', 'airt_min', 'lwdn_avg', 'lwup_avg',
       'pressure', 'swdn_avg', 'swup_avg', 'windd_sd', 'windd_avg',
       'winds_avg', 'winds_max', 'netrad_avg', 'fan_rmp_avg',
       'visibilitymiles', 'batt_volt_min', 'surfacet1_avg', 'surfacet2_avg',
       'visibilitymiles_avg', 'pressurekpasealevel', 'Date', 'Hour',
       'Generated', 'Elevation', 'Azimuth', 'Zenith', 'Month', 'Day'],
      dtype='object')

In [45]:
newColumnOrder = ['Month', 'Day', 'Hour', 'Elevation', 'Azimuth', 'Zenith', 'rh', 'eto', 'etr', 'rso', 'ea_avg', 'precip', 'rh_max',
       'rh_min', 'td_avg', 'td_max', 'td_min', 'co2_avg', 'ppf_avg', 'solarmj',
       'wet_pct', 'airt_avg', 'airt_max', 'airt_min', 'lwdn_avg', 'lwup_avg',
       'pressure', 'swdn_avg', 'swup_avg', 'windd_sd', 'windd_avg',
       'winds_avg', 'winds_max', 'netrad_avg', 'fan_rmp_avg',
       'visibilitymiles', 'batt_volt_min', 'surfacet1_avg', 'surfacet2_avg',
       'visibilitymiles_avg', 'pressurekpasealevel', 'Generated']
MainDf = weatherpf[newColumnOrder]

In [92]:
# mainDF = weatherpf.drop(['date_time', 'Date', 'Hour'],axis=1)

In [48]:
# seperate and Save data without scaling
truth = MainDf['Generated']
MainDf.drop('Generated', axis=1, inplace=True)
MainDf.dropna(axis=1, inplace=True)
dataNonScaled = MainDf.to_numpy()
truthNonScaled = truth.to_numpy()

data_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonScaledDataV2.npy' 
truth_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonScaledTruthV2.npy' 
np.save(data_filename, dataNonScaled)
np.save(truth_filename, truthNonScaled)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MainDf.dropna(axis=1, inplace=True)


In [95]:
# seperate and Save data without scaling
truth = mainDF['Generated']
mainDF.drop('Generated', axis=1, inplace=True)
mainDF.dropna(axis=1, inplace=True)
dataNonScaled = mainDF.to_numpy()
truthNonScaled = truth.to_numpy()

data_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonScaledData.npy' 
truth_filename = '/Users/brad/Desktop/CS6620/Project/Data/nonScaledTruth.npy' 
np.save(data_filename, dataNonScaled)
np.save(truth_filename, truthNonScaled)


In [61]:
# scale by largest value in each column 
for c in mainDF.columns:
    cMax = mainDF[c].max()
    print(f'Max for {c} is {cMax}')
    mainDF[c] = mainDF[c]/cMax

Max for rh is 97
Max for eto is 0.029
Max for etr is 0.034
Max for rso is 3.653
Max for ea_avg is 1.978
Max for precip is 0.224
Max for rh_max is 98
Max for rh_min is 97
Max for td_avg is 63.1
Max for td_max is 65.3
Max for td_min is 61.5
Max for co2_avg is 586.1
Max for ppf_avg is 2154.0
Max for solarmj is 3.66
Max for wet_pct is 0
Max for airt_avg is 95.9
Max for airt_max is 97.7
Max for airt_min is 93.7
Max for lwdn_avg is 519.9
Max for lwup_avg is 458.4
Max for pressure is 87.3
Max for swdn_avg is 488.9
Max for swup_avg is 1086.0
Max for windd_sd is 91.0
Max for windd_avg is 360.0
Max for winds_avg is 7.382
Max for winds_max is 28.409
Max for netrad_avg is 754.1
Max for fan_rmp_avg is 5081
Max for visibilitymiles is 19.884
Max for batt_volt_min is 14.82
Max for surfacet1_avg is 100.2
Max for surfacet2_avg is 186.8
Max for visibilitymiles_avg is 19.884
Max for pressurekpasealevel is 103.1
Max for Generated is 25699219.94879516


In [62]:
mainDF.head()

Unnamed: 0,rh,eto,etr,rso,ea_avg,precip,rh_max,rh_min,td_avg,td_max,...,winds_max,netrad_avg,fan_rmp_avg,visibilitymiles,batt_volt_min,surfacet1_avg,surfacet2_avg,visibilitymiles_avg,pressurekpasealevel,Generated
13,0.185567,1.0,0.970588,0.950452,0.428716,0.0,0.204082,0.154639,0.638669,0.710567,...,0.377944,0.901472,0.883684,0.637497,0.846829,0.908184,0.507495,0.943724,0.984481,0.739523
14,0.175258,1.0,1.0,0.882289,0.427199,0.0,0.204082,0.14433,0.635499,0.696784,...,0.496075,0.812624,0.881323,1.0,0.84413,0.88523,0.507495,0.9656,0.984481,0.978212
16,0.164948,0.758621,0.764706,0.602792,0.395349,0.0,0.193878,0.134021,0.603803,0.679939,...,0.354324,0.490651,0.879551,1.0,0.845479,0.849301,0.47591,0.984359,0.982541,0.786369
17,0.237113,0.586207,0.588235,0.419929,0.417594,0.0,0.326531,0.14433,0.627575,0.854518,...,0.32282,0.310436,0.865774,1.0,0.825236,0.833333,0.418094,0.956246,0.982541,0.621222
18,0.185567,0.37931,0.382353,0.226663,0.433266,0.0,0.22449,0.14433,0.641838,0.721286,...,0.1496,0.245989,0.854163,0.987477,0.81444,0.773453,0.376338,0.9656,0.982541,0.416924


In [93]:
truth = mainDF['Generated']
mainDF.drop('Generated', axis=1, inplace=True)
mainDF.dropna(axis=1, inplace=True)

In [66]:
dataVersion1 = mainDF.to_numpy()
truthVersion1 = truth.to_numpy()

In [67]:

data_filename = '/Users/brad/Desktop/CS6620/Project/Data/dataVersion1.npy' 
truth_filename = '/Users/brad/Desktop/CS6620/Project/Data/truthVersion1.npy' 
np.save(data_filename, dataVersion1)
np.save(truth_filename, truthVersion1)