In [5]:
import sys
sys.path.append('../mlai_research/')
import log
import utils
import geopandas as gpd
from shapely import wkb
import pandas as pd

In [6]:
logger = log.get_logger(__name__)
conf = utils.load_config("base")

16-Nov-23 00:41:49 - INFO - Starting 'load_config'.
16-Nov-23 00:41:49 - INFO - Finished 'load_config' in 0.0249 secs.


In [7]:
def check_crs(gdf1, gdf2) -> bool: 
    if gdf1.crs == gdf2.crs:
        return True
    else:
        print("Error: The CRS for the shape files does not match.")
        return False
    
    
def load_shp_data(dir_path, filename1, filename2):
    gdf1 = gpd.read_file(f"{dir_path}{filename1}") # import shapefile using geopandas
    logger.info(f"{filename1} Shape: {gdf1.shape}")
    gdf2 = gpd.read_file(f"{dir_path}{filename2}") # import shapefile using geopandas
    logger.info(f"{filename2} Shape: {gdf2.shape}")
    return gdf1, gdf2


def select_subset_cols(df, subset_cols):
    return df[subset_cols]


def geom_drop_z_dim(df):
    _drop_z = lambda geom: wkb.loads(wkb.dumps(geom, output_dimension=2))
    df.geometry = df.geometry.transform(_drop_z)
    return df


def process_shp_files(gdf1, gdf2):
    fil_gdf1 = select_subset_cols(gdf1, subset_cols=['Species', 'geometry'])
    fil_gdf1 = geom_drop_z_dim(fil_gdf1)
    fil_gdf2 = select_subset_cols(gdf2, subset_cols=['tag', 'geometry'])
    fil_gdf2 = fil_gdf2.rename(columns={'tag': 'Species'})
    return fil_gdf1, fil_gdf2


def filter_relevant_species(df):
    # logger.info(f"Species value counts (original label): {df['Species'].value_counts()}")
    species_map = {"Xanthium strumarium" : "Xanthium",
              "Datura stramonium": "Datura",
               "Xanthium": "Xanthium",
               "Datura": "Datura"}
    df['Species'] = df['Species'].map(species_map).fillna('Other')
    return df


def sync_crs(gdf, rasterimg) -> bool: 
    if gdf.crs != rasterimg.crs:
        gdf = gdf.set_crs(str(rasterimg.crs))
    return gdf

In [8]:
gdf1, gdf2 = load_shp_data(f"{conf.data.path_raw}classification_points/", conf.data.fn_shp_raw1, conf.data.fn_shp_raw2)
if check_crs(gdf1, gdf2):
    fil_gdf1, fil_gdf2 = process_shp_files(gdf1, gdf2)
    comb_gdf = pd.concat([fil_gdf1, fil_gdf2]).reset_index(drop=True)
    fil_comb_gdf = filter_relevant_species(comb_gdf)
    fil_comb_gdf['pid'] = list(range(len(fil_comb_gdf)))
    fil_comb_gdf.to_file(f"{conf.data.path_base_points}{conf.data.fn_shp_combined}", driver='ESRI Shapefile')

16-Nov-23 00:41:49 - INFO - letaba_invasives_june21.shp Shape: (153, 14)
16-Nov-23 00:41:49 - INFO - letaba_invasives_june21.shp Shape: (153, 14)
16-Nov-23 00:41:49 - INFO - classification_points_wynand.shp Shape: (153, 3)
16-Nov-23 00:41:49 - INFO - classification_points_wynand.shp Shape: (153, 3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [9]:
fil_comb_gdf.shape

(306, 2)

In [13]:
fil_comb_gdf.head()

Unnamed: 0,Species,geometry,pid
0,Other,POINT (340729.445 7371235.519),0
1,Other,POINT (340722.163 7371248.913),1
2,Other,POINT (340713.499 7371248.818),2
3,Xanthium,POINT (340717.911 7371249.236),3
4,Other,POINT (340701.246 7371250.714),4


In [11]:
fil_comb_gdf['Species'].value_counts()

Species
Xanthium    142
Other        88
Datura       76
Name: count, dtype: int64