# MLFLOW setup in Jupyter Notebook

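This notebook contains a small data science project in which MLflow is used to log all feature engineering and modeling parameters as well as metrics. The cells below load the fox GPS tracks, annotate them with values sampled from the raster layers, handle missing values, and export the cleaned data as shapefiles.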



## Loading Data

In [1]:
import datetime as dt
import sys
import warnings

# make the parent folder importable (only for this session, system variables are not changed)
sys.path.append("..")

import geopandas as geopd
import matplotlib.pyplot as plt
import missingno as msno
import mlflow
import numpy as np
import pandas as pd
import pyreadr
import rasterio
import rasterio.rio
import seaborn as sns
from rasterio.plot import show



In [2]:
# Read the two fox-track exports written from R (full tracks and resampled tracks).
result, result_resamp = (
    pyreadr.read_r(rds_path)
    for rds_path in (
        '../data/r_files/track_all.RDS',
        '../data/r_files/track_resamp.RDS',
    )
)
# The covariates export is currently not loaded:
#result_covar = pyreadr.read_r('../data/r_files/covariates.RDS')

In [3]:
# Each pyreadr result is indexed with the key None to get the stored data frame.
foxy_df, foxy_df_resamp = result[None], result_resamp[None]
# Tab-separated table with additional information per fox.
fox_metadata = pd.read_csv("../data/additional_info.csv", delimiter="\t")

In [4]:
# Preview the first five rows of the fox metadata.
fox_metadata.head(n=5)

Unnamed: 0,individual-local-identifier,tag-local-identifier,den,sex,litter,active from,active till,GPS applied date,upload interval August,upload interval September
0,EEB1231F,2018-FSBD608-001,Girjas,F,,2018.07.10 01:56,2018.10.20 14:48,?,5h,5h
1,0BEC311C,2018-FSBD641_b-r/gr-b,Sadde,F,yes,2018.07.28 02:16,2018.09.28 14:47,2018.07.28,15 min,15 min
2,89B1231F,2018-FSBD615_v-gr/r-gr,Tj�gn�ris,F,yes,2018.07.26 06:43,2018.10.28 19:43,2019.07.26,15 min,15 min
3,B0B1231F,2018-FSBD619_r-gr/r-y,Dadtjatj�kk�,F,yes,2018.07.24 05:17,2018.11.07 17:10,2019.07.24,15 min,15 min
4,BBB0231F,2019-FSBD609-002,Smuole,F,yes,2019.07.22 02:45,2019.09.30 19:14,2019-07-22,15 min,15min -> 1h -> 4h


In [5]:
# Display the resampled track frame (id, sex, x_/y_ coordinates, timestamp t_, burst_).
foxy_df_resamp

Unnamed: 0,id,sex,x_,y_,t_,burst_
0,2018-FSBD608-001,F,548522.717454,7.353167e+06,2018-07-10 01:56:30,1.0
1,2018-FSBD608-001,F,548525.680513,7.353168e+06,2018-07-10 06:56:32,2.0
2,2018-FSBD608-001,F,548539.963810,7.353164e+06,2018-07-10 11:56:32,3.0
3,2018-FSBD608-001,F,548195.507163,7.352493e+06,2018-07-10 22:31:11,4.0
4,2018-FSBD608-001,F,548558.891944,7.352589e+06,2018-07-11 13:31:01,5.0
...,...,...,...,...,...,...
15078,2019-FSBD641_y-gr/r-b,M,543632.466003,7.379183e+06,2019-12-14 08:10:19,49.0
15079,2019-FSBD641_y-gr/r-b,M,544607.109028,7.378558e+06,2019-12-14 12:10:15,50.0
15080,2019-FSBD641_y-gr/r-b,M,544147.656802,7.379631e+06,2019-12-14 16:10:13,51.0
15081,2019-FSBD641_y-gr/r-b,M,539474.780875,7.379241e+06,2019-12-15 00:10:14,52.0


### Loading in the raster layers for the features

In [6]:
# Open every feature raster as a rasterio dataset; they are sampled at point locations further down.
elev, aspect, NDVI_NDMI, slope, veg, soil = [
    rasterio.open(raster_path)
    for raster_path in (
        "../data/Rasters_for_R/DEM_30.tif",
        "../data/Rasters_for_R/aspect_30.tif",
        "../data/Rasters_for_R/NDVI_arj_vind.tif",
        "../data/Rasters_for_R/slope_arj_vind.tif",
        "../data/Rasters_for_R/veg_nofor_morecats.tif",
        "../data/Rasters_for_R/soil_av_clip.tif",
    )
]
# Band handles into the combined NDVI/NDMI raster: band 2 -> NDMI, band 3 -> NDVI.
NDMI = rasterio.band(NDVI_NDMI, 2)
NDVI = rasterio.band(NDVI_NDMI, 3)


## Building and Annotating the data frames
### DF containing all points

In [7]:
# Wrap the full track table in a GeoDataFrame with one point geometry per row, built from x_/y_.
# NOTE(review): no CRS is assigned here - confirm the coordinates are in the rasters' CRS.
gdf_all = geopd.GeoDataFrame(
    data=foxy_df,
    geometry=geopd.points_from_xy(x=foxy_df["x_"], y=foxy_df["y_"]),
)

In [8]:
# (x, y) tuples in row order; this is the coordinate list handed to the raster sampling calls.
coord_list_all = list(zip(gdf_all["geometry"].x, gdf_all["geometry"].y))

In [9]:
# Annotate every fox point with the raster values at its location.
#
# The combined NDVI/NDMI raster is sampled once and the two bands are picked positionally
# from the sampled arrays. This avoids parking the arrays in a temporary object column and
# reading them back row by row through label-based Series lookups (slow, and only correct
# while the frame has a default 0..n-1 index).
ndvi_ndmi_all = list(NDVI_NDMI.sample(coord_list_all))
gdf_all["NDVI"] = [bands[2] for bands in ndvi_ndmi_all]  # third value of each sample
gdf_all["NDMI"] = [bands[1] for bands in ndvi_ndmi_all]  # second value of each sample
del ndvi_ndmi_all  # only needed to split the bands

# For the remaining rasters the first value of each sample is kept.
gdf_all["soil"] = [values[0] for values in soil.sample(coord_list_all)]
gdf_all["veg"] = [values[0] for values in veg.sample(coord_list_all)]
gdf_all["slope"] = [values[0] for values in slope.sample(coord_list_all)]
gdf_all["aspect"] = [values[0] for values in aspect.sample(coord_list_all)]
gdf_all["elev"] = [values[0] for values in elev.sample(coord_list_all)]


### DF containing resampled points
resampled over 2 hours

In [10]:
# Wrap the resampled track table in a GeoDataFrame with one point geometry per row (from x_/y_).
# NOTE(review): no CRS is assigned here - confirm the coordinates are in the rasters' CRS.
gdf_resamp = geopd.GeoDataFrame(
    data=foxy_df_resamp,
    geometry=geopd.points_from_xy(x=foxy_df_resamp["x_"], y=foxy_df_resamp["y_"]),
)

In [11]:
# (x, y) tuples in row order; this is the coordinate list handed to the raster sampling calls.
coord_list_resamp = list(zip(gdf_resamp["geometry"].x, gdf_resamp["geometry"].y))

In [12]:
# Annotate every resampled fox point with the raster values at its location.
#
# The combined NDVI/NDMI raster is sampled once and the two bands are picked positionally
# from the sampled arrays. This avoids parking the arrays in a temporary object column and
# reading them back row by row through label-based Series lookups (slow, and only correct
# while the frame has a default 0..n-1 index).
ndvi_ndmi_resamp = list(NDVI_NDMI.sample(coord_list_resamp))
gdf_resamp["NDVI"] = [bands[2] for bands in ndvi_ndmi_resamp]  # third value of each sample
gdf_resamp["NDMI"] = [bands[1] for bands in ndvi_ndmi_resamp]  # second value of each sample
del ndvi_ndmi_resamp  # only needed to split the bands

# For the remaining rasters the first value of each sample is kept.
gdf_resamp["soil"] = [values[0] for values in soil.sample(coord_list_resamp)]
gdf_resamp["veg"] = [values[0] for values in veg.sample(coord_list_resamp)]
gdf_resamp["slope"] = [values[0] for values in slope.sample(coord_list_resamp)]
gdf_resamp["aspect"] = [values[0] for values in aspect.sample(coord_list_resamp)]
gdf_resamp["elev"] = [values[0] for values in elev.sample(coord_list_resamp)]


### DF Sample point raster
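First, a regular grid of points (one point every 70 coordinate units in x and y) is built over the whole area covered by the fox tracks.
Then each grid point is annotated with the values of the raster layers.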

In [13]:
# Regular grid over the bounding box of all fox points, one node every 70 coordinate units
# in x and y, flattened into a two-column (x, y) table.
xy = pd.DataFrame(
    np.mgrid[
        slice(gdf_all["x_"].min(), gdf_all["x_"].max(), 70),
        slice(gdf_all["y_"].min(), gdf_all["y_"].max(), 70),
    ].reshape(2, -1).T,
    columns=["x", "y"],
)

In [14]:
# Number of grid points that will be annotated.
xy.shape[0]

856829

In [15]:
# Turn the regular grid into point geometries and annotate each point with the raster values.
# NOTE(review): no CRS is assigned here - confirm the grid coordinates are in the rasters' CRS.
sample_points = geopd.GeoDataFrame(
    xy, geometry=geopd.points_from_xy(xy["x"], xy["y"]))
coord_list_sample = list(zip(sample_points["geometry"].x, sample_points["geometry"].y))

# The combined NDVI/NDMI raster is sampled once and the two bands are picked positionally
# from the sampled arrays. With this many grid points, parking the arrays in a temporary
# object column and reading them back row by row through label-based Series lookups is
# needlessly slow (and only correct while the frame has a default 0..n-1 index).
ndvi_ndmi_sample = list(NDVI_NDMI.sample(coord_list_sample))
sample_points["NDVI"] = [bands[2] for bands in ndvi_ndmi_sample]  # third value of each sample
sample_points["NDMI"] = [bands[1] for bands in ndvi_ndmi_sample]  # second value of each sample
del ndvi_ndmi_sample  # only needed to split the bands

# For the remaining rasters the first value of each sample is kept.
sample_points["soil"] = [values[0] for values in soil.sample(coord_list_sample)]
sample_points["veg"] = [values[0] for values in veg.sample(coord_list_sample)]
sample_points["slope"] = [values[0] for values in slope.sample(coord_list_sample)]
sample_points["aspect"] = [values[0] for values in aspect.sample(coord_list_sample)]
sample_points["elev"] = [values[0] for values in elev.sample(coord_list_sample)]


KeyboardInterrupt: 

# Dealing with NaNs

### Rename categorical variables

In [16]:
# renaming in sample points data frame
sample_points.soil = sample_points.soil.replace(1,"Moraine").replace(2,"Peat(Turf)").replace(3,"Roesberg").replace(4,"Rest").replace(5,"Stone").replace(6,"Water").replace(0, np.nan)
sample_points.veg = sample_points.veg.replace(1,"Water").replace(2,"Snow").replace(3,"Stone").replace(4,"Dry Shrub").replace(5,"Moist Shrub").replace(6,"Grassland").replace(7,"Bush").replace(8,"Bog").replace(0, np.nan)

# renaming in geo data frame with all fox points
gdf_all.soil = gdf_all.soil.replace(1,"Moraine").replace(2,"Peat(Turf)").replace(3,"Roesberg").replace(4,"Rest").replace(5,"Stone").replace(6,"Water").replace(0, np.nan)
gdf_all.veg = gdf_all.veg.replace(1,"Water").replace(2,"Snow").replace(3,"Stone").replace(4,"Dry Shrub").replace(5,"Moist Shrub").replace(6,"Grassland").replace(7,"Bush").replace(8,"Bog").replace(0, np.nan)

# renaming in geo data frame with resampled fox points
gdf_resamp.soil = gdf_resamp.soil.replace(1,"Moraine").replace(2,"Peat(Turf)").replace(3,"Roesberg").replace(4,"Rest").replace(5,"Stone").replace(6,"Water").replace(0, np.nan)
gdf_resamp.veg = gdf_resamp.veg.replace(1,"Water").replace(2,"Snow").replace(3,"Stone").replace(4,"Dry Shrub").replace(5,"Moist Shrub").replace(6,"Grassland").replace(7,"Bush").replace(8,"Bog").replace(0, np.nan)

### Set NaN values
Replace the numbers that represent NaN values in the columns


In [17]:
# replacing in sample points data frame
sample_points.aspect = sample_points.aspect.replace(-9999, np.nan)
sample_points.slope = sample_points.slope.replace(-9999, np.nan)
sample_points.elev = sample_points.elev.apply(lambda x : np.nan if x < -3.4e+38 else x)

# replacing in gdf all data frame
gdf_all.aspect = gdf_all.aspect.replace(-9999, np.nan)
gdf_all.slope = gdf_all.slope.replace(-9999, np.nan)
gdf_all.elev = gdf_all.elev.apply(lambda x : np.nan if x < -3.4e+38 else x)

# replacing in gdf resamp data frame
gdf_resamp.aspect = gdf_resamp.aspect.replace(-9999, np.nan)
gdf_resamp.slope = gdf_resamp.slope.replace(-9999, np.nan)
gdf_resamp.elev = gdf_resamp.elev.apply(lambda x : np.nan if x < -3.4e+38 else x)



### Vegetation
Missing values in vegetation are either forested regions, which are uninteresting for foxes, or parts of the map that are missing. These will therefore be dropped completely.

In [38]:
# Sample points
sample_points_clean = sample_points.dropna(subset = ["veg"]).reset_index(drop = True)

# gdf all
foxes_all_clean = gdf_all.dropna(subset = ["veg"]).reset_index(drop = True)

# gdf resampled
foxes_resamp_clean = gdf_resamp.dropna(subset = ["veg"]).reset_index(drop = True)


In [29]:
# Visualise which values are still missing in the cleaned sample grid.
# missingno (msno) is imported together with the other dependencies at the top of the notebook;
# the trailing semicolon hides the Axes repr so only the figure is shown.
msno.matrix(sample_points_clean);

NameError: name 'sample_points_clean' is not defined

### **Filling NAs for the other columns**

NDVI
* NA values are snowy peaks. Fill with a value between 0.0 and 0.1? [Resource](https://www.usgs.gov/special-topics/remote-sensing-phenology/science/ndvi-foundation-remote-sensing-phenology#overview)

NDMI
* NA Values at the same points as in the NDVI. Interpolate from the closest value. How does this work for geometries? 

Aspect
* NaN values that are left after dropping the vegetation NaNs seem to be plateaus
* fill with -1 as new value

In [None]:
# Fill the remaining gaps (NDMI -> -1, NDVI -> 0, aspect -> -1), then drop every row that
# still contains a NaN.
# A column-level `fillna(..., inplace=True)` is chained in-place assignment: it works on the
# Series returned by the attribute access and does not update the frame under pandas
# Copy-on-Write. Building a new frame and rebinding the name is reliable and re-runnable.
sample_points_clean = (
    sample_points_clean
    .fillna({"NDMI": -1, "NDVI": 0, "aspect": -1})
    .dropna()
)

In [None]:
# Remaining missing values per column (expected to be all zero after the fill / drop step).
sample_points_clean.isnull().sum()


x           0
y           0
geometry    0
NDVI        0
NDMI        0
soil        0
veg         0
slope       0
aspect      0
elev        0
dtype: int64

In [39]:
# Fill the remaining gaps (NDMI -> -1, NDVI -> 0, aspect -> -1), then drop every row that
# still contains a NaN.
# A column-level `fillna(..., inplace=True)` is chained in-place assignment: it works on the
# Series returned by the attribute access and does not update the frame under pandas
# Copy-on-Write. Building a new frame and rebinding the name is reliable and re-runnable.
foxes_all_clean = (
    foxes_all_clean
    .fillna({"NDMI": -1, "NDVI": 0, "aspect": -1})
    .dropna()
)

In [40]:
# Remaining missing values per column (expected to be all zero after the fill / drop step).
foxes_all_clean.isnull().sum()

x_          0
y_          0
t_          0
id          0
sex         0
geometry    0
NDVI        0
NDMI        0
soil        0
veg         0
slope       0
aspect      0
elev        0
dtype: int64

In [32]:
# Fill the remaining gaps (NDMI -> -1, NDVI -> 0, aspect -> -1), then drop every row that
# still contains a NaN.
# A column-level `fillna(..., inplace=True)` is chained in-place assignment: it works on the
# Series returned by the attribute access and does not update the frame under pandas
# Copy-on-Write. Building a new frame and rebinding the name is reliable and re-runnable.
foxes_resamp_clean = (
    foxes_resamp_clean
    .fillna({"NDMI": -1, "NDVI": 0, "aspect": -1})
    .dropna()
)

In [33]:
# Remaining missing values per column (expected to be all zero after the fill / drop step).
foxes_resamp_clean.isnull().sum()

id          0
sex         0
x_          0
y_          0
t_          0
burst_      0
geometry    0
NDVI        0
NDMI        0
soil        0
veg         0
slope       0
aspect      0
elev        0
dtype: int64

In [41]:
# Preview the first five cleaned fox points.
foxes_all_clean.head(n=5)

Unnamed: 0,x_,y_,t_,id,sex,geometry,NDVI,NDMI,soil,veg,slope,aspect,elev
0,548522.717454,7353167.0,2018-07-10 01:56:30,2018-FSBD608-001,F,POINT (548522.717 7353166.639),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
1,548525.680513,7353168.0,2018-07-10 06:56:32,2018-FSBD608-001,F,POINT (548525.681 7353168.363),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
2,548539.96381,7353164.0,2018-07-10 11:56:32,2018-FSBD608-001,F,POINT (548539.964 7353164.150),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
3,548195.507163,7352493.0,2018-07-10 22:31:11,2018-FSBD608-001,F,POINT (548195.507 7352493.197),0.443307,-0.138425,Stone,Grassland,10.065748,339.629059,1072.429321
4,548558.891944,7352589.0,2018-07-11 13:31:01,2018-FSBD608-001,F,POINT (548558.892 7352588.648),0.50428,-0.113833,Stone,Grassland,12.232175,6.898972,1056.9552


In [42]:
# Export copy of the cleaned fox points: the timestamp column t_ is replaced by its text form
# before the frame is written as a shapefile.
# NOTE(review): presumably needed because the shapefile writer cannot store full datetimes - confirm.
foxes_all_final = foxes_all_clean.assign(
    t_=foxes_all_clean["t_"].dt.strftime("%Y-%m-%d-%H:%M:%S")
)
foxes_all_final.to_file(filename="../data/cleaned_shapefiles/foxes_all.shp")

In [27]:
# Preview the first five rows of the annotated (not yet NaN-filtered) fox points.
gdf_all.head(n=5)

Unnamed: 0,x_,y_,t_,id,sex,geometry,NDVI,NDMI,soil,veg,slope,aspect,elev
0,548522.717454,7353167.0,2018-07-10 01:56:30,2018-FSBD608-001,F,POINT (548522.717 7353166.639),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
1,548525.680513,7353168.0,2018-07-10 06:56:32,2018-FSBD608-001,F,POINT (548525.681 7353168.363),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
2,548539.96381,7353164.0,2018-07-10 11:56:32,2018-FSBD608-001,F,POINT (548539.964 7353164.150),0.662722,0.125608,Moraine,Dry Shrub,10.135784,205.492111,1033.016602
3,548195.507163,7352493.0,2018-07-10 22:31:11,2018-FSBD608-001,F,POINT (548195.507 7352493.197),0.443307,-0.138425,Stone,Grassland,10.065748,339.629059,1072.429321
4,548558.891944,7352589.0,2018-07-11 13:31:01,2018-FSBD608-001,F,POINT (548558.892 7352588.648),0.50428,-0.113833,Stone,Grassland,12.232175,6.898972,1056.9552


In [36]:
# Export copy of the cleaned resampled fox points: the timestamp column t_ is replaced by its
# text form before the frame is written as a shapefile.
# NOTE(review): presumably needed because the shapefile writer cannot store full datetimes - confirm.
foxes_resamp_final = foxes_resamp_clean.assign(
    t_=foxes_resamp_clean["t_"].dt.strftime("%Y-%m-%d-%H:%M:%S")
)
foxes_resamp_final.to_file(filename="../data/cleaned_shapefiles/foxes_resamp.shp")

In [90]:
# Write the cleaned sample grid next to the fox shapefiles.
sample_points_clean.to_file(filename="../data/cleaned_shapefiles/sample_points.shp")