In [1]:
import sys
sys.path.append('../mlai_research/')
import log
import utils
import geopandas as gpd
from shapely import wkb
import pandas as pd

In [2]:
logger = log.get_logger(__name__)
conf = utils.load_config("base")

09-Dec-23 22:37:12 - INFO - Starting 'load_config'.
09-Dec-23 22:37:12 - INFO - Finished 'load_config' in 0.0319 secs.


In [3]:
def check_crs(gdf1, gdf2) -> bool: 
    if gdf1.crs == gdf2.crs:
        return True
    else:
        print("Error: The CRS for the shape files does not match.")
        return False
    
    
def load_shp_data(dir_path, filename1, filename2):
    gdf1 = gpd.read_file(f"{dir_path}{filename1}") # import shapefile using geopandas
    logger.info(f"{filename1} Shape: {gdf1.shape}")
    gdf2 = gpd.read_file(f"{dir_path}{filename2}") # import shapefile using geopandas
    logger.info(f"{filename2} Shape: {gdf2.shape}")
    return gdf1, gdf2


def select_subset_cols(df, subset_cols):
    return df[subset_cols]


def geom_drop_z_dim(df):
    _drop_z = lambda geom: wkb.loads(wkb.dumps(geom, output_dimension=2))
    df.geometry = df.geometry.transform(_drop_z)
    return df


def process_shp_files(gdf1, gdf2):
    fil_gdf1 = select_subset_cols(gdf1, subset_cols=['Species', 'geometry'])
    fil_gdf1 = geom_drop_z_dim(fil_gdf1)
    fil_gdf2 = select_subset_cols(gdf2, subset_cols=['tag', 'geometry'])
    fil_gdf2 = fil_gdf2.rename(columns={'tag': 'Species'})
    return fil_gdf1, fil_gdf2


def filter_relevant_species(df):
    # logger.info(f"Species value counts (original label): {df['Species'].value_counts()}")
    species_map = {"Xanthium strumarium" : "Xanthium",
              "Datura stramonium": "Datura",
               "Xanthium": "Xanthium",
               "Datura": "Datura"}
    df['Species'] = df['Species'].map(species_map).fillna('Other')
    logger.info(f'Filtered Species value counts: {df.Species.value_counts()}')
    return df


def sync_crs(gdf, rasterimg) -> bool: 
    if gdf.crs != rasterimg.crs:
        gdf = gdf.set_crs(str(rasterimg.crs))
    return gdf

In [4]:
gdf1, gdf2 = load_shp_data(f"{conf.data.path_raw}classification_points/", conf.data.fn_shp_raw1, conf.data.fn_shp_raw2)
if check_crs(gdf1, gdf2):
    fil_gdf1, fil_gdf2 = process_shp_files(gdf1, gdf2)
    comb_gdf = pd.concat([fil_gdf1, fil_gdf2]).reset_index(drop=True)
    logger.info(f"Combined Shape: {comb_gdf.shape}")
    fil_comb_gdf = filter_relevant_species(comb_gdf)
    fil_comb_gdf['pid'] = list(range(len(fil_comb_gdf)))
    fil_comb_gdf.to_file(f"{conf.data.path_base_points}{conf.data.fn_shp_combined}", driver='ESRI Shapefile')

09-Dec-23 22:37:13 - INFO - letaba_invasives_june21.shp Shape: (153, 14)
09-Dec-23 22:37:13 - INFO - classification_points_wynand.shp Shape: (153, 3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
09-Dec-23 22:37:13 - INFO - Combined Shape: (306, 2)
09-Dec-23 22:37:13 - INFO - Filtered Species value counts: Species
Xanthium    142
Other        88
Datura       76
Name: count, dtype: int64


In [5]:
gdf = gpd.read_file(f"../data/01_base/points/letaba_classification_points.shp") 

In [6]:
gdf.head()

Unnamed: 0,Species,pid,geometry
0,Other,0,POINT (340729.445 7371235.519)
1,Other,1,POINT (340722.163 7371248.913)
2,Other,2,POINT (340713.499 7371248.818)
3,Xanthium,3,POINT (340717.911 7371249.236)
4,Other,4,POINT (340701.246 7371250.714)


In [7]:
gdf['buffer'] = gdf.buffer(5)

In [8]:
gdf.head()

Unnamed: 0,Species,pid,geometry,buffer
0,Other,0,POINT (340729.445 7371235.519),"POLYGON ((340734.445 7371235.519, 340734.421 7..."
1,Other,1,POINT (340722.163 7371248.913),"POLYGON ((340727.163 7371248.913, 340727.138 7..."
2,Other,2,POINT (340713.499 7371248.818),"POLYGON ((340718.499 7371248.818, 340718.475 7..."
3,Xanthium,3,POINT (340717.911 7371249.236),"POLYGON ((340722.911 7371249.236, 340722.887 7..."
4,Other,4,POINT (340701.246 7371250.714),"POLYGON ((340706.246 7371250.714, 340706.222 7..."


In [9]:
out = "../data/02_intermediate/points/letaba_classification_points.shp"

In [10]:
# gdf['buffer_wkt'] = gdf['buffer'].to_wkt()

In [11]:
gdf.head()

Unnamed: 0,Species,pid,geometry,buffer
0,Other,0,POINT (340729.445 7371235.519),"POLYGON ((340734.445 7371235.519, 340734.421 7..."
1,Other,1,POINT (340722.163 7371248.913),"POLYGON ((340727.163 7371248.913, 340727.138 7..."
2,Other,2,POINT (340713.499 7371248.818),"POLYGON ((340718.499 7371248.818, 340718.475 7..."
3,Xanthium,3,POINT (340717.911 7371249.236),"POLYGON ((340722.911 7371249.236, 340722.887 7..."
4,Other,4,POINT (340701.246 7371250.714),"POLYGON ((340706.246 7371250.714, 340706.222 7..."


In [12]:
gdf = gdf.drop('geometry', axis=1).rename(columns={"buffer": "geometry"})#.to_file(out, driver='ESRI Shapefile')

In [13]:
gdf.head()

Unnamed: 0,Species,pid,geometry
0,Other,0,"POLYGON ((340734.445 7371235.519, 340734.421 7..."
1,Other,1,"POLYGON ((340727.163 7371248.913, 340727.138 7..."
2,Other,2,"POLYGON ((340718.499 7371248.818, 340718.475 7..."
3,Xanthium,3,"POLYGON ((340722.911 7371249.236, 340722.887 7..."
4,Other,4,"POLYGON ((340706.246 7371250.714, 340706.222 7..."


In [None]:
gdf.to_file(out, driver='ESRI Shapefile')

In [12]:
gdf.to_file(out, driver='ESRI Shapefile', geometry='point')

ValueError: GeoDataFrame contains multiple geometry columns but GeoDataFrame.to_file supports only a single geometry column. Use a GeoDataFrame.to_parquet or GeoDataFrame.to_feather, drop additional geometry columns or convert them to a supported format like a well-known text (WKT) using `GeoSeries.to_wkt()`.