In [1]:
import pandas as pd
import numpy as np

import geopandas as gpd

import seaborn as sns
import matplotlib.pyplot as plt

import os,glob

In [2]:
# Load the data dictionary (one row per variable with its description) and show it.
variable_names = pd.read_csv(filepath_or_buffer='../Data/variable_definitions.csv')
variable_names

Unnamed: 0,Variable,Description
0,ID,The IDs take the form of [area ID]_yyyy-mm-dd....
1,area,Area ID
2,date,The date that the data is aggregated over
3,lat,Latitude of the center of the area
4,lon,Longitude of the center of the area
5,burn_area,Percentage of the area burnt
6,climate_aet,"Actual evapotranspiration, derived using a one..."
7,climate_def,"Climate water deficit, derived using a one-dim..."
8,climate_pdsi,Palmer Drought Severity Index
9,climate_pet,Reference evapotranspiration (ASCE Penman-Mont...


In [3]:
# Read the training table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../Data/Train.csv')
df.head(n=5)

Unnamed: 0,ID,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,...,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,precipitation
0,0_2001-01-01,-15.858835,29.237029,0.0,1195,0,263,1195,206,10,...,0.0,0.0,0.018654,0.0,0.714446,0.012174,0.24489,0.009836,0.0,0.256932
1,1_2001-01-01,-15.858835,29.487029,0.0,1196,0,232,1196,201,10,...,0.0,0.0,0.0,0.0,0.654783,9.5e-05,0.345121,0.0,0.0,0.273093
2,2_2001-01-01,-15.858835,29.737029,0.0,1190,0,314,1190,192,10,...,0.0,0.0,0.0,0.0,0.516421,0.0,0.483579,0.0,0.0,0.285109
3,3_2001-01-01,-15.858835,29.987029,0.0,1144,0,321,1144,186,66,...,0.0,0.0,0.0,0.0,0.299,0.163902,0.537098,0.0,0.0,0.298418
4,4_2001-01-01,-15.858835,30.237029,0.0,1187,0,413,1187,186,9,...,0.0,0.0,0.0,0.0,0.277392,0.067742,0.654866,0.0,0.0,0.315621


In [4]:
# List the column labels of the training frame as loaded from Train.csv.
df.columns

Index(['ID', 'lat', 'lon', 'burn_area', 'climate_aet', 'climate_def',
       'climate_pdsi', 'climate_pet', 'climate_pr', 'climate_ro',
       'climate_soil', 'climate_srad', 'climate_swe', 'climate_tmmn',
       'climate_tmmx', 'climate_vap', 'climate_vpd', 'climate_vs', 'elevation',
       'landcover_0', 'landcover_1', 'landcover_2', 'landcover_3',
       'landcover_4', 'landcover_5', 'landcover_6', 'landcover_7',
       'landcover_8', 'precipitation'],
      dtype='object')

In [5]:
# (rows, columns) of the training frame.
df.shape

(83148, 29)

In [6]:
# Count how many rows each distinct (lat, lon) pair contributes to the training data.
unique_locations = df.groupby(['lat', 'lon']).size().reset_index(name='count')

# The summary sentence reports ONE per-area count, so verify that every area
# really has the same number of rows before quoting the first one.
counts_per_area = unique_locations['count']
assert counts_per_area.nunique() == 1, (
    'areas have differing numbers of rows; a single per-area count would be misleading'
)

# Outer double quotes: reusing the same quote character inside an f-string
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"There are {counts_per_area.iloc[0]} measurments for each of the {unique_locations.shape[0]} areas in Zimbabwe")

There are 156 measurments for each of the 533 areas in Zimbabwe


In [7]:
# Build one point geometry per unique location (x = longitude, y = latitude),
# then attach it to the per-location counts. crs=4326 is passed through as given.
location_points = gpd.points_from_xy(x=unique_locations['lon'], y=unique_locations['lat'])
gdf = gpd.GeoDataFrame(unique_locations, geometry=location_points, crs=4326)
gdf.head()

Unnamed: 0,lat,lon,count,geometry
0,-22.358835,31.237029,156,POINT (31.23703 -22.35883)
1,-22.108835,29.487029,156,POINT (29.48703 -22.10883)
2,-22.108835,29.737029,156,POINT (29.73703 -22.10883)
3,-22.108835,29.987029,156,POINT (29.98703 -22.10883)
4,-22.108835,30.237029,156,POINT (30.23703 -22.10883)


In [8]:
# Export the per-location points as GeoJSON.
# This is the first write into ../Data/results, so create the directory if it
# is missing; otherwise the export fails on a fresh checkout.
os.makedirs('../Data/results', exist_ok=True)
gdf.to_file('../Data/results/selected_location_for_train.geojson', driver='GeoJSON')

In [9]:
# IDs have the form "<area ID>_yyyy-mm-dd". Split once from the right so the
# date is always the last piece (even if an area ID were to contain "_"), and
# avoid materialising a full expanded DataFrame just to pick one column.
# The explicit format makes parsing strict rather than inferred.
df['date'] = pd.to_datetime(df['ID'].str.rsplit('_', n=1).str[-1], format='%Y-%m-%d')

In [10]:
# Preview the three right-most columns for the first rows to confirm the new 'date' column.
df.head().iloc[:, -3:]

Unnamed: 0,landcover_8,precipitation,date
0,0.0,0.256932,2001-01-01
1,0.0,0.273093,2001-01-01
2,0.0,0.285109,2001-01-01
3,0.0,0.298418,2001-01-01
4,0.0,0.315621,2001-01-01


In [11]:
# Remove the raw 'ID' column. Rebinding the result (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# is a no-op rather than a KeyError on the already-missing column.
df = df.drop(columns=['ID'], errors='ignore')

In [12]:
# Re-check the column labels after the edits above ('date' added, 'ID' dropped).
df.columns

Index(['lat', 'lon', 'burn_area', 'climate_aet', 'climate_def', 'climate_pdsi',
       'climate_pet', 'climate_pr', 'climate_ro', 'climate_soil',
       'climate_srad', 'climate_swe', 'climate_tmmn', 'climate_tmmx',
       'climate_vap', 'climate_vpd', 'climate_vs', 'elevation', 'landcover_0',
       'landcover_1', 'landcover_2', 'landcover_3', 'landcover_4',
       'landcover_5', 'landcover_6', 'landcover_7', 'landcover_8',
       'precipitation', 'date'],
      dtype='object')

In [14]:
# Write the updated training table to disk; the row index is not written.
train_updated_csv = '../Data/results/Train[updated].csv'
df.to_csv(path_or_buf=train_updated_csv, index=False)