## NA Exploration for Features
### - see na_explore.Rmd for first steps of summarizing NA feature data

In [1]:
import numpy as np
import pandas as pd

import os

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# from scipy.interpolate import interp1d

#import missingno as msno # doesn't work on Taylor

In [2]:
# retrieve the current working directory; as the last expression in the cell,
# its value is shown as the cell output
os.getcwd()

'/Users/scognac/CropMosaiks/Modeling/na_summary'

In [3]:
# Size of the point set, in thousands, that selects which feature file to load.
# Available choices: "4", "15", or "24".
points = "15"

# Base name shared by the annual feature files for the chosen point set.
feature_file_name = f'sentinel-2_ZMB_{points}k-points_1000-features'

### Append all 15k points feature data from 2016-2021 into a pandas dataframe

In [4]:
# Load each annual feature file, reshape it to one row per point and year, and
# stack all years into a single dataframe.
#
# The per-year frames are collected in a list and concatenated once after the
# loop. Growing `features` with pd.concat inside the loop re-copies every
# previously loaded year on each iteration, which is expensive for frames this
# wide.
yearly_features = []
rows_so_far = 0

# open annual point feather files in the features folder
for yr in range(2016, 2022):
    print("Opening:", yr)
    features_x = pd.read_feather(f"/capstone/cropmosaiks/data/features/{feature_file_name}_{yr}.feather")
    # move `month` (the last index level) from rows into columns, so each
    # (lon, lat, year) row carries one column per feature/month pair
    features_x = features_x.set_index(['lon','lat', 'year', 'month']).unstack()
    # flatten the (feature, month) column pairs into names such as "0_01"
    features_x.columns = features_x.columns.map(lambda x: '{}_{}'.format(*x))
    features_x = features_x.reset_index()
    # drop latitude and longitude columns (axis = 1 means columns)
    features_x = features_x.drop(['lon', 'lat'], axis = 1)

    yearly_features.append(features_x)
    rows_so_far += len(features_x)
    # running shape of the combined data; the column count reported is that of
    # the current file, which assumes every annual file has the same columns
    print("feature.shape", (rows_so_far, features_x.shape[1]))
    print("Appending:", yr)
    print("")

# concatenate the annual frames together, axis = 0 specifies to stack rows
# (rather than adding columns); each year keeps its own 0-based row labels
features = pd.concat(yearly_features, axis=0)
features

Opening: 2016
feature.shape (15058, 12001)
Appending: 2016

Opening: 2017
feature.shape (30116, 12001)
Appending: 2017

Opening: 2018
feature.shape (45174, 12001)
Appending: 2018

Opening: 2019
feature.shape (60232, 12001)
Appending: 2019

Opening: 2020
feature.shape (75290, 12001)
Appending: 2020

Opening: 2021
feature.shape (90348, 12001)
Appending: 2021



Unnamed: 0,year,0_01,0_02,0_03,0_04,0_05,0_06,0_07,0_08,0_09,...,999_03,999_04,999_05,999_06,999_07,999_08,999_09,999_10,999_11,999_12
0,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.104104,0.074053,0.014657,0.012761,0.008725,,0.065386
1,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.001978,1.000000,0.000509,0.000323,0.000085,0.008823,
2,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.012835,0.013470,0.010998,0.011397,0.009623,,0.008626
3,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.010825,1.000000,0.009010,0.008285,0.008821,0.009061,
4,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.012540,0.011034,0.010122,0.010264,0.008985,,0.008561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15053,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.745588,,0.720009,0.676316,0.567757,0.207773,0.157042,,0.137609,0.341566
15054,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.702853,,0.618746,0.535060,0.393334,0.055867,0.031003,,0.229620,0.173497
15055,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.697032,,0.586658,0.529629,0.415733,0.109411,0.079943,,0.064214,0.658739
15056,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.710249,,0.635214,0.569514,0.424395,0.091725,0.093681,,0.062830,0.162939


### The annual files each have the same number of columns, because each file contains the same number of features over the same number of time periods. As each file is appended, the number of rows increases by 15058.

In [5]:
# Replace the per-year row labels (which restart at 0 for every year) with one
# continuous 0..N-1 index. `year` is already the leading column, so dropping the
# old index gives the same frame as a set_index(['year']).reset_index() round
# trip, without building an intermediate year-indexed frame.
# A default index is also what to_feather requires when the frame is exported.
features_index = features.reset_index(drop=True)
features_index

Unnamed: 0,year,0_01,0_02,0_03,0_04,0_05,0_06,0_07,0_08,0_09,...,999_03,999_04,999_05,999_06,999_07,999_08,999_09,999_10,999_11,999_12
0,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.104104,0.074053,0.014657,0.012761,0.008725,,0.065386
1,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.001978,1.000000,0.000509,0.000323,0.000085,0.008823,
2,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.012835,0.013470,0.010998,0.011397,0.009623,,0.008626
3,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.010825,1.000000,0.009010,0.008285,0.008821,0.009061,
4,2016,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,,1.0,1.000000,0.012540,0.011034,0.010122,0.010264,0.008985,,0.008561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90343,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.745588,,0.720009,0.676316,0.567757,0.207773,0.157042,,0.137609,0.341566
90344,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.702853,,0.618746,0.535060,0.393334,0.055867,0.031003,,0.229620,0.173497
90345,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.697032,,0.586658,0.529629,0.415733,0.109411,0.079943,,0.064214,0.658739
90346,2021,,,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.710249,,0.635214,0.569514,0.424395,0.091725,0.093681,,0.062830,0.162939


In [None]:
# Write the combined dataframe to a csv and a feather file.
# The file name is derived from `points` so that a run with a different point
# set (4, 15, or 24) does not write its data under the "15k" name; with
# points = "15" the paths are exactly the ones used before.
# Note: to_csv writes the row index as an unnamed first column by default.
features_index.to_csv(f'/capstone/cropmosaiks/{points}k_data.csv')
features_index.to_feather(f'/capstone/cropmosaiks/{points}k_data.feather')

## Switch to R script