## PACKAGES

In [1]:
#pip install geopandas

In [2]:
import geopandas as gp

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Read the whole shapefile

In [3]:
pwd

'/Users/amelietatin/code/amelietatin/Predicting_land_cover/notebooks'

In [4]:
all_protected_areas = gp.read_file("../raw_data/all_protected_areas_shp/Natura2000_end2022_epsg3035.shp")

In [5]:
all_protected_areas.head(2)

Unnamed: 0,SITECODE,SITENAME,RELEASE_DA,MS,SITETYPE,INSPIRE_ID,area,geometry
0,BG0002104,Tsibarsko blato,2021-12-15,BG,A,,9097408.781,"POLYGON ((5400559.283 2397105.657, 5400557.363..."
1,CY3000006,THALASSIA PERIOCHI NISIA,2022-10-18,CY,B,,1917559.082,"POLYGON ((6501132.084 1672770.796, 6501815.998..."


In [6]:
all_protected_areas.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 27193 entries, 0 to 27192
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   SITECODE    27193 non-null  object  
 1   SITENAME    27193 non-null  object  
 2   RELEASE_DA  27193 non-null  object  
 3   MS          27193 non-null  object  
 4   SITETYPE    27193 non-null  object  
 5   INSPIRE_ID  3486 non-null   object  
 6   area        27193 non-null  float64 
 7   geometry    27193 non-null  geometry
dtypes: float64(1), geometry(1), object(6)
memory usage: 1.7+ MB


In [7]:
type(all_protected_areas)

geopandas.geodataframe.GeoDataFrame

In [8]:
all_protected_areas.SITECODE.nunique() # NUMBER OF UNIQUE PROTECTED AREAS IDs

27193

In [9]:
all_protected_areas.SITETYPE.unique() # DIFFERENT SITE TYPES

array(['A', 'B', 'C'], dtype=object)

# Bounding box (for CDS API) of all the protected areas

In [10]:
all_protected_areas["centroid_lon_lat"]=all_protected_areas["geometry"].centroid.to_crs(epsg=4326)
all_protected_areas["lon"] = all_protected_areas.centroid_lon_lat.x
all_protected_areas["lat"] = all_protected_areas.centroid_lon_lat.y

In [11]:
all_protected_areas.head(2)

Unnamed: 0,SITECODE,SITENAME,RELEASE_DA,MS,SITETYPE,INSPIRE_ID,area,geometry,centroid_lon_lat,lon,lat
0,BG0002104,Tsibarsko blato,2021-12-15,BG,A,,9097408.781,"POLYGON ((5400559.283 2397105.657, 5400557.363...",POINT (23.47291 43.82178),23.472909,43.821775
1,CY3000006,THALASSIA PERIOCHI NISIA,2022-10-18,CY,B,,1917559.082,"POLYGON ((6501132.084 1672770.796, 6501815.998...",POINT (34.07156 35.00162),34.071555,35.001619


In [12]:
all_protected_areas["lon"].min(), all_protected_areas["lon"].max(), all_protected_areas["lat"].min(), all_protected_areas["lat"].max()


(-32.30906837949666, 34.071555485388096, 27.658764598717514, 70.01981302035634)

# Select only the polygons

In [13]:
polygons_pa = all_protected_areas[all_protected_areas.geometry.type == 'Polygon']

In [14]:
type(polygons_pa)

geopandas.geodataframe.GeoDataFrame

In [15]:
polygons_pa.SITECODE.nunique()

19238

In [16]:
polygons_pa.SITETYPE.unique()

array(['A', 'B', 'C'], dtype=object)

# Select only type C

In [17]:
polygons_type_C_pa = polygons_pa[polygons_pa.SITETYPE == 'C']

In [18]:
type(polygons_type_C_pa)

geopandas.geodataframe.GeoDataFrame

In [19]:
polygons_type_C_pa.SITECODE.nunique()

1349

# Select on area size

In [20]:
all_protected_areas.area.describe()

count    2.719300e+04
mean     5.834623e+07
std      6.431153e+08
min      1.017281e+00
25%      4.346933e+05
50%      2.825788e+06
75%      1.822086e+07
max      7.196047e+10
dtype: float64

In [21]:
polygons_type_C_pa.area.describe()

count    1.349000e+03
mean     1.042316e+08
std      4.561199e+08
min      7.502846e+03
25%      2.448960e+06
50%      1.064924e+07
75%      4.639470e+07
max      1.051245e+10
dtype: float64

## Filter between 25% and 75%

In [22]:
filtered_area_pa = polygons_type_C_pa[(polygons_type_C_pa['area'] > 2.448960e+06) & (polygons_type_C_pa['area'] < 4.639470e+07)]

In [23]:
len(filtered_area_pa)

674

# Select random protected areas FLORI/ALI/TIM/AMELIE

In [24]:
# Calculate the number of records per group
num_groups = 4
records_per_group = len(filtered_area_pa) // num_groups
records_per_group

In [42]:
filtered_area_pa = filtered_area_pa[['SITECODE', 'SITENAME','geometry', 'lon', 'lat']]

In [43]:
# Split the GeoDataFrame into 4 equal groups
groups = np.array_split(filtered_area_pa, num_groups)

  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [44]:
protected_areas_for_ali = groups[0]
protected_areas_for_tim = groups[1]
protected_areas_for_flori = groups[2]
protected_areas_for_amelie = groups[3]

In [45]:
#filtered_area_pa_50 = filtered_area_pa.sample(50, random_state=1) 
#filtered_area_pa_10 = filtered_area_pa.sample(10, random_state=1) 

#  Write new file

In [48]:
protected_areas_for_amelie.head(2)

Unnamed: 0,SITECODE,SITENAME,geometry,lon,lat
22655,SE0230125,Svartåmynningen,"POLYGON ((4644198.584 3942773.354, 4644646.309...",15.555642,58.464362
22679,SE0230157,Fjällmossen östgötadelen,"POLYGON ((4700535.729 3972878.312, 4700696.681...",16.530107,58.693415


In [47]:
len(protected_areas_for_ali), len(protected_areas_for_flori), len(protected_areas_for_amelie), len(protected_areas_for_tim)

(169, 168, 168, 169)

In [49]:
!pwd

/Users/amelietatin/code/amelietatin/Predicting_land_cover/notebooks


In [78]:
#final_df_50.to_file('../raw_data/sample_protected_areas/sample_filtered_protected_areas_50.shp')
#final_df_10.to_file('../raw_data/sample_protected_areas/sample_filtered_protected_areas_10.shp')

In [51]:
protected_areas_for_ali.to_file('../raw_data/protected_areas_to_share/ali/sample_protected_areas_for_ali.shp')
protected_areas_for_flori.to_file('../raw_data/protected_areas_to_share/flori/sample_protected_areas_for_flori.shp')
protected_areas_for_tim.to_file('../raw_data/protected_areas_to_share/tim/sample_protected_areas_for_tim.shp')
protected_areas_for_amelie.to_file('../raw_data/protected_areas_to_share/amelie/sample_protected_areas_for_amelie.shp')