In [1]:
import os

import pandas as pd
import numpy as np
import geopandas as gpd

from shapely.geometry import Point
from shapely.wkt import loads

%matplotlib inline
pd.options.display.max_columns = 999

In [2]:
latlon = {'init': 'epsg:4326'}
utm_18 = {'init': 'epsg:32618'}

## Import Ground Truth and samples

In [3]:
gt_all = pd.read_excel('/Users/arredond/Downloads/GT_Depuration_v1.xlsx')

Let's rename the common fields to make this easier

In [4]:
gt_all.rename(index=str, columns={
    "Nombre d'étage": 'stories',
    "Superficie approximative": 'area'
}, inplace=True)
gt_all.columns = gt_all.columns.map(str.lower)

## Types

| **MBT short** | **MBT full** | **Structure material**               | **Wall material**                                           |
| ------------- | ------------ | ------------------------------------ | ----------------------------------------------------------- |
| MandW         | CM-UM        | Murs porteurs               | Blocs OR briques OR matonnerie de roches OR bois&matonnerie |
| RC            | RC-CB        | Béton OR béton armé                  | Blocs                                       |
| RC            | RC-SW        | Murs porteurs OR béton OR béton armé | Béton armé                                                  |
| RC            | RC-UM        | Béton armé                           | Briques OR matonnerie de roches OR bois&matonnerie            |
| RC            | RL-BM        | Murs porteurs                        | Blocs armés                                                 |
| MandW         | W-UM         | Bois&Tole                            | Blocs OR briques OR matonnerie de roches OR bois&matonnerie |

In [5]:
types_dict = {
    'W-UM': {
        'idx': (gt_all['structure'] == 'Structure en bois et en tole') & gt_all['murs'].isin([
                    'Murs de blocs non armés',
                    'Murs de briques',
                    'Maτonnerie de roches',
                    'Bois + Maτonnerie'
                ]),
        'ts': 'MAndW'
    },
    'RL-BM': {
        'idx': (gt_all['structure'] == 'Murs porteurs') & (gt_all['murs'] == 'Murs de blocs armés'),
        'ts': 'RC'
    },
    'RC-UM': {
        'idx': (gt_all['structure'] == 'Structure en béton armé') & gt_all['murs'].isin([
            'Murs de briques',
            'Maτonnerie de roches',
            'Bois + Maτonnerie'
        ]),
        'ts': 'RC'
    },
    'RC-SW': {
        'idx': (gt_all['structure'].isin([
            'Murs porteurs',
            'Structure en béton',
            'Structure en béton armé'
        ])) & (gt_all['murs'] == 'Murs en béton armé'),
        'ts': 'RC'
    },
    'RC-CB': {
        'idx': (gt_all['structure'].isin([
            'Structure en béton',
            'Structure en béton armé'
        ])) & (gt_all['murs'].isin([
            'Murs de blocs non armés'
        ])),
        'ts': 'RC'
    },
    'CM-UM': {
        'idx': (gt_all['structure'].isin([
            'Murs porteurs'
        ])) & (gt_all['murs'].isin([
            'Murs de blocs non armés',
            'Murs de briques',
            'Maτonnerie de roches',
            'Bois + Maτonnerie'
        ])),
        'ts': 'MAndW'
    }
}

#### Add MBTs

In [6]:
gt_all['type_short'] = None
gt_all['type_full'] = None
for k,v in types_dict.items():
    gt_all.loc[v['idx'], 'type_full'] = k
    gt_all.loc[v['idx'], 'type_short'] = v['ts']

## Clean data

- Remove records that aren't `residential`
- Remove records with 100% damage
- Remove records with empty cells

In [7]:
gt_all = gt_all.loc[
    (gt_all['residentiel- section unique'] == 'X') |
    (gt_all['residentiel- section multiple'] == 'X')
].drop(axis=1, columns=['residentiel- section unique', 'residentiel- section multiple'])

gt_all = gt_all.loc[~(gt_all['dommage estimé'] == '100%')]

In [8]:
vital_cols = [
    "stories", "area", "toiture", "structure", "murs",
    "quartier", "type_full", "type_short"
]
gt_all.dropna(axis=0, subset=vital_cols, inplace=True)

- Filter area, keeping only 0 <= area <= 250  (sqm)
- Filter height, keeping only 0 <= height <= 25  (m)

In [9]:
filter_conditions = [
    'area >= 10',
    'area <= 250',
    'stories >= 0',
    'stories <= 8'
]

gt_all = gt_all.query(' & '.join(filter_conditions))

* Eliminate records where RoofType = RC and type_short = MandW
* Eliminate records where RoofType = Tin and area < 20 and Stories < 1 and type_short = RC
* Eliminate records where RoofType = Tin and area > 200 and Stories >= 2 and type_short = MandW

In [10]:
gt_all['roof_type'] = gt_all['toiture'].apply(lambda x: 'tin' if 'tole' in x.lower() else 'rc')

In [11]:
q = "\
    (roof_type != 'rc' | type_short != 'MAndW') & \
    (roof_type != 'tin' | type_short != 'RC' | area >= 20 | stories >= 1) & \
    (roof_type != 'tin' | type_short != 'MAndW' | area <= 200)"
gt_all = gt_all.query(q)

Convert to GeoDataFrame in order to get the urban pattern for each point

In [12]:
gt_all['geometry'] = gt_all.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)
gt_all = gpd.GeoDataFrame(gt_all.copy())
gt_all.crs = latlon

We'll convert the coordinates in UTM Zone 18N for model and visualization purposes

In [13]:
gt_all = gt_all.to_crs(utm_18)
gt_all['utm_x'] = gt_all.geometry.x
gt_all['utm_y'] = gt_all.geometry.y

Load urban patterns and intersect to add to GT

In [14]:
urban_patterns = gpd.read_file("../data/urban_patterns/UrbanPattern_PauP_Complete.shp")
urban_patterns.rename(index=str, columns={'UrbanPatte': 'pattern'}, inplace=True)
urban_patterns.replace('Informal', 'Shanty', inplace=True)
urban_patterns = urban_patterns.to_crs(gt_all.crs)

gt_sjoin = gpd.sjoin(gt_all, urban_patterns[['geometry', 'pattern']], op='within')

# Keep only the Urban Patterns we're interested in
gt_sjoin = gt_sjoin.loc[~gt_sjoin['pattern'].isin(['No', 'Industrial'])]

We now have a clean-but-complete Ground Truth.
Let's save this as vector and CSV before proceding
to reduce the number of fields and rows.

In [15]:
gt_sjoin.to_csv("../data/ground_truth/ground_truth_clean.csv", index=False)
gt_sjoin.to_file("../data/ground_truth/ground_truth_clean.gpkg", driver='GPKG')

## Keeping only the necessary data

First of all, let's just keep the columns we _really_ want

In [16]:
min_cols = [
    'stories', 'area', 'roof_type', 'quartier',
    'type_full', 'type_short', 'pattern', 'geometry',
    'latitude', 'longitude', 'utm_x', 'utm_y'
]
gt = gt_sjoin.loc[:, min_cols]

Next, we'll remove all GT points inside samples

In [24]:
# Load samples
samples = pd.read_csv("../data/samples/samples_reference.csv")
samples['geometry'] = samples['geometry_wkt'].apply(lambda x: loads(x))
samples = gpd.GeoDataFrame(samples, crs=latlon)
samples = samples.to_crs(crs=utm_18)
samples = samples.loc[~samples['FID'].isin([10, 17, 22])]
samples['Class'] = samples['Class'].str.lower()
samples = samples[['FID', 'Class', 'geometry']]

In [39]:
intersect = gpd.sjoin(gt, samples, op='within').set_index('index_right')

# Keep samples within for model classification
os.makedirs('../data/ground_truth/within_samples', exist_ok=True)
for stratum in intersect['pattern'].unique():
    stratum_within = intersect.loc[intersect['pattern'] == stratum]
    print(stratum, stratum_within.shape)
    stratum_within.to_file(f'../data/ground_truth/within_samples/gt_within_{stratum.lower()}')

UrbanIrreg (752, 14)
UrbanReg (679, 14)
Shanty (952, 14)
Rural (67, 14)
Residential (116, 14)


In [None]:
# Use samples outside for model training
gt = gt.loc[np.invert(gt.index.isin(intersect.index)), :].copy()
gt.reset_index(inplace=True)
gt.drop(axis=1, columns=['geometry', 'index'], inplace=True)

In [None]:
patterns = ['UrbanReg', 'UrbanIrreg', 'Shanty', 'Residential', 'Rural']
selected = {}

for p in patterns:
    gt_pattern = gt.loc[gt['pattern'] == p, :]
    print(f'{p}: {len(gt_pattern)} (Total) - {len(gt_pattern.loc[gt_pattern["type_short"] == "MAndW"])} (MAndW) - {len(gt_pattern.loc[gt_pattern["type_short"] == "RC"])} (RC)')

### Balanced classes

Let's keep a testing/training set for each urban pattern.
We'll keep the classes balanced to help the latter models.

In [20]:
os.makedirs('../data/ground_truth/outside_samples', exist_ok=True)

patterns = ['UrbanReg', 'UrbanIrreg', 'Shanty', 'Residential', 'Rural']
selected = {}

for p in patterns:
    if p == 'Rural':
        n, n_train, n_test = 150, 120, 30
    elif p == 'Residential':
        n, n_train, n_test = 300, 240, 60
    elif p == 'UrbanReg':
        n, n_train, n_test = 2000, 1600, 400
    else:
        n, n_train, n_test = 4000, 3200, 800

    gtp = gt.loc[gt['pattern'] == p, :]
    
    # Save without sampling (unbalanced)
    gtp_train_unb = gtp.copy().sample(round(gtp.shape[0]*0.8))
    gtp_test_unb = gtp.copy().loc[~gtp.index.isin(gtp_train_unb.index), :]
    
    # Sample balanced
    gtp_mw = gtp.loc[gt['type_short'] == 'MAndW', :].sample(n)
    gtp_rc = gtp.loc[gt['type_short'] == 'RC', :].sample(n)
    
    selected[p] = {
        'train': pd.concat([
            gtp_mw.head(n_train),
            gtp_rc.head(n_train)
        ], ignore_index=True),
        'test': pd.concat([
            gtp_mw.tail(n_test),
            gtp_rc.tail(n_test)
        ], ignore_index=True),
        'train_unbalanced': gtp_train_unb,
        'test_unbalanced': gtp_test_unb
    }
    
    
    for k,v in selected[p].items():
        v.to_csv(f'../data/ground_truth/outside_samples/gt_{p.lower()}_{k}.csv', index=False)

### All together now

We'll also keep a copy of all the data together, in order to test the models
in other, exciting ways.

In [21]:
test_train_data = {
    'test': pd.concat([selected[x]['test'] for x in selected.keys()]),
    'train': pd.concat([selected[x]['train'] for x in selected.keys()])
}

In [22]:
for k,v in test_train_data.items():
    v.to_csv(f'../data/ground_truth/gt_{k}_all.csv', index=False)