In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.simplefilter('ignore')

# Create Meta Dataset

**Features:**


* **v1**
    - Area
    - Subregion (one-hot encoded)
    
* **v2**:
    - X, Y coordinates
    - 5 Nearest Neighbor features
        - `Area` - Area of neighbour
        - `Distance` - Distance to neighbour
        - `class count` - Count of surrounding classes
    ***v2.1**:
        - 10 nearest neighbour features
        - `nn_count` - Count of neighbours in 500m
    

## Set Dataset Version

In [9]:
dataset_version = 'v2.1'

### Load Data

In [10]:
import numpy as np
import pandas as pd

from src.utils import read_shapefile, safe_create_dir

train_shp = read_shapefile('train')
test_shp = read_shapefile('test')

# project to GMT for Lat Long coords
train_shp = train_shp.to_crs({'init': 'epsg:4326'})
test_shp = test_shp.to_crs({'init': 'epsg:4326'})

train_shp.head()

Unnamed: 0_level_0,Area,Subregion,Crop_Id_Ne,geometry,y
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.915905,3,8,POLYGON ((22.16935378930472 -28.97707248953319...,Vineyard
2,2.06441,3,6,POLYGON ((22.17427414231474 -28.97676532206398...,Pecan
3,1.0803,3,8,POLYGON ((22.17341929705008 -28.97728122949355...,Vineyard
4,1.31619,3,8,POLYGON ((22.17588169099293 -28.97691670488457...,Vineyard
7,5.52922,3,8,"POLYGON ((22.1762250942478 -28.97449847161998,...",Vineyard


In [11]:
# Convert to Pandas
train_df = pd.DataFrame(train_shp[['Area','Subregion']])

train_labels = train_shp.y
labels_one_hot = pd.get_dummies(train_labels)

test_df = test_shp[['Area','Subregion']]

### One-hot Encode Subregion

In [12]:
from category_encoders import OneHotEncoder

encoder = OneHotEncoder(use_cat_names=True, return_df=True, cols=['Subregion'])

train_df = encoder.fit_transform(train_df)
test_df = encoder.transform(test_df)

train_df.head()

Unnamed: 0_level_0,Area,Subregion_3.0,Subregion_1.0,Subregion_4.0,Subregion_2.0,Subregion_5.0,Subregion_6.0,Subregion_0.0,Subregion_7.0
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0.915905,1,0,0,0,0,0,0,0
2,2.06441,1,0,0,0,0,0,0,0
3,1.0803,1,0,0,0,0,0,0,0
4,1.31619,1,0,0,0,0,0,0,0
7,5.52922,1,0,0,0,0,0,0,0


## Nearest Neighbour Analysis

Find the 5 closest farms _in the training set_ and return the following information:

- distance to farm centroid
- farm class
- farm area
- farm subregion

In [13]:
from pysal.lib.cg import KDTree, RADIUS_EARTH_KM

In [14]:
# Add centroids
train_shp['centroid'] = train_shp.geometry.centroid
test_shp['centroid'] = test_shp.geometry.centroid

# Add centroid coords
train_df['lat'] = train_shp.centroid.x
train_df['lon'] = train_shp.centroid.y

test_df['lat'] = test_shp.centroid.x
test_df['lon'] = test_shp.centroid.y

train_df.head()

Unnamed: 0_level_0,Area,Subregion_3.0,Subregion_1.0,Subregion_4.0,Subregion_2.0,Subregion_5.0,Subregion_6.0,Subregion_0.0,Subregion_7.0,lat,lon
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.915905,1,0,0,0,0,0,0,0,22.170005,-28.977219
2,2.06441,1,0,0,0,0,0,0,0,22.174372,-28.97595
3,1.0803,1,0,0,0,0,0,0,0,22.172834,-28.977781
4,1.31619,1,0,0,0,0,0,0,0,22.175138,-28.977433
7,5.52922,1,0,0,0,0,0,0,0,22.175166,-28.973708


In [16]:
# Create a KDTree from the farms in the training set
# Use earth's radius in meters
train_kdtree = KDTree(train_df[['lat','lon']].values, distance_metric='ARC', radius=RADIUS_EARTH_KM*1000)

coords = tuple(test_df.iloc[0][['lat','lon']])

In [17]:
train_kdtree.query_ball_point(coords, r=500)

[2245, 1, 3, 4, 5, 6, 12, 2248, 2493, 2, 47]

In [31]:
import time
from multiprocessing import Pool
from functools import partial

nn_count_dist = 500

def get_neighbours(farm_id, dataset, k):
    """
    Get the neighbours in the training set
    """
    
    train = dataset=='train'
    test = dataset=='test'
    
    # Get the coordinates of this farm
    if train:
        coords = tuple(train_df[['lat','lon']].loc[farm_id])
    elif test:
        coords = tuple(test_df[['lat','lon']].loc[farm_id])
    else:
        raise ValueError('Cannot find ID: {} in dataset: {}'.format(farm_id, dataset))
    
    # Get closest farms
    distances, indexes = train_kdtree.query(coords, k=k+1)
    
    # The training set will always return this farm as the first (closest) farm
    # So need to remove it from the list
    if train:
        distances = distances[1:]
        indexes = indexes[1:]
    else:
        distances = distances[:-1]
        indexes = indexes[:-1]
    

    neighbours_df = train_df[['Area']].iloc[indexes]
    neighbours_df['distance'] = distances

    # Some index magic
    neighbours_df.index=map(str,range(len(neighbours_df)))

    # Flatten the neighbours data
    n_flat = neighbours_df.unstack().to_frame().T
    n_flat.columns = n_flat.columns.map('_'.join)

    labels_count = labels_one_hot.iloc[indexes].sum(axis=0)
    # Use name 0 for merging with neighbours data
    labels_count.name = 0
    
    result = n_flat.join(labels_count.to_frame().T)
    
    # Get the number of neighbours within a certain radius
    result['nn_count'] = len(train_kdtree.query_ball_point(coords, r=nn_count_dist))
    
    result.index = [farm_id]

    return result
    

## Multi process all the things!

In [33]:
def knn_features(ids_list, dataset, k):
    
    partials = partial(get_neighbours, dataset=dataset, k=k)
    
    pool = Pool(processes=10)
    result = pool.map_async(partials, ids_list)

    i = 0
    t_start = time.time()
    while not result.ready():
        if i == 10:
            i=0
            e = int(time.time() - t_start)
            print('\n Elapsed: {:02d}:{:02d}:{:02d}'.format(e // 3600, (e % 3600 // 60), e % 60))
        print('.',end='')
        i += 1
        time.sleep(0.5)
        
    df = pd.concat(result.get(), axis=0)
    
    return df

## Generate KNN Features

### Run for train set

In [34]:
%%time

k=10

print('-'*25,'Train','-'*25)
train_ids = train_df.index.to_list()
train_knn_features = knn_features(train_ids, dataset='train', k=k)

print('\n\n')
print('-'*25,'Test','-'*25)
test_ids = test_df.index.to_list()
test_knn_features = knn_features(test_ids, dataset='test', k=k)

------------------------- Train -------------------------
..........
 Elapsed: 00:00:05
..........
 Elapsed: 00:00:10
..........
 Elapsed: 00:00:15
..........
 Elapsed: 00:00:20
..........
 Elapsed: 00:00:25
..........
 Elapsed: 00:00:30
..........
 Elapsed: 00:00:35
..........
 Elapsed: 00:00:40
..........
 Elapsed: 00:00:45
..........
 Elapsed: 00:00:50
..........
 Elapsed: 00:00:55
..........
 Elapsed: 00:01:00
..........
 Elapsed: 00:01:05
..........
 Elapsed: 00:01:10
..........
 Elapsed: 00:01:15
....


------------------------- Test -------------------------
..........
 Elapsed: 00:00:05
..........
 Elapsed: 00:00:10
..........
 Elapsed: 00:00:15
..........
 Elapsed: 00:00:20
..........
 Elapsed: 00:00:25
..........
 Elapsed: 00:00:30
.......CPU times: user 5.74 s, sys: 653 ms, total: 6.39 s
Wall time: 1min 52s


## Join KNN Features with others

In [35]:
pd.set_option('display.max_columns',None)

train_features_data = train_df.join(train_knn_features)
test_features_data = test_df.join(test_knn_features)

# Show head of train_features
train_features_data.head()

Unnamed: 0_level_0,Area,Subregion_3.0,Subregion_1.0,Subregion_4.0,Subregion_2.0,Subregion_5.0,Subregion_6.0,Subregion_0.0,Subregion_7.0,lat,lon,Area_0,Area_1,Area_2,Area_3,Area_4,Area_5,Area_6,Area_7,Area_8,Area_9,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,Cotton,Dates,Grass,Lucern,Maize,Pecan,Vacant,Vineyard,"Vineyard & Pecan (""Intercrop"")",nn_count
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
1,0.915905,1,0,0,0,0,0,0,0,22.170005,-28.977219,1.0803,0.774865,1.81556,4.62284,2.06441,5.82118,1.31619,1.59031,2.03598,1.43812,282.149095,309.726013,357.634133,443.962361,447.60952,467.795875,499.838735,582.179327,589.531139,606.000154,2,0,0,1,0,2,0,4,1,8
2,2.06441,1,0,0,0,0,0,0,0,22.174372,-28.97595,1.81556,1.43812,1.31619,4.62284,1.0803,5.52922,1.48271,0.774865,2.03598,0.915905,90.197077,163.249105,180.910239,197.627907,252.603794,261.082736,317.420623,364.403883,440.843881,447.60952,0,0,0,1,0,1,0,7,1,13
3,1.0803,1,0,0,0,0,0,0,0,22.172834,-28.977781,1.81556,1.31619,2.06441,5.82118,0.915905,4.62284,0.774865,1.43812,1.59031,5.52922,182.14984,227.423921,252.603794,255.491683,282.149095,373.366213,400.470025,410.444118,483.619231,506.554948,2,0,0,1,0,2,0,4,1,10
4,1.31619,1,0,0,0,0,0,0,0,22.175138,-28.977433,2.06441,1.81556,1.0803,1.43812,5.82118,4.62284,5.52922,1.48271,0.915905,0.774865,180.910239,205.428381,227.423921,264.680854,316.604451,377.641327,414.274624,426.015864,499.838735,515.230647,1,0,0,1,0,2,0,5,1,10
7,5.52922,1,0,0,0,0,0,0,0,22.175166,-28.973708,1.48271,1.43812,4.62284,2.06441,1.01882,2.43183,2.03598,1.81556,1.31619,0.774865,131.300151,166.446162,200.212759,261.082736,293.414491,293.785881,304.353489,325.649965,414.274624,418.595577,0,0,0,1,0,2,0,6,1,12


In [36]:
test_features_data.head()

Unnamed: 0_level_0,Area,Subregion_3.0,Subregion_1.0,Subregion_4.0,Subregion_2.0,Subregion_5.0,Subregion_6.0,Subregion_0.0,Subregion_7.0,lat,lon,Area_0,Area_1,Area_2,Area_3,Area_4,Area_5,Area_6,Area_7,Area_8,Area_9,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,Cotton,Dates,Grass,Lucern,Maize,Pecan,Vacant,Vineyard,"Vineyard & Pecan (""Intercrop"")",nn_count
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
5,1.32259,1,0,0,0,0,0,0,0,22.175366,-28.975957,2.06441,1.43812,1.31619,1.81556,5.52922,4.62284,1.48271,1.0803,0.774865,5.82118,96.635134,101.000271,165.585912,183.271898,250.919111,262.698218,265.012696,318.987475,454.097433,475.889249,1,0,0,1,0,2,0,5,1,11
6,0.955864,1,0,0,0,0,0,0,0,22.176267,-28.976035,1.43812,2.06441,1.31619,1.48271,1.81556,5.52922,4.62284,1.0803,5.82118,0.774865,113.992114,184.521424,190.338572,250.16467,269.036333,280.058308,339.218991,386.243744,505.783167,539.882692,1,0,0,1,0,2,0,5,1,8
10,11.5098,1,0,0,0,0,0,0,0,22.174307,-28.97102,2.43183,1.01882,1.79607,2.03598,0.565466,1.7569,2.65521,5.52922,0.158009,1.48271,152.464906,162.754878,222.509808,235.115004,265.79142,271.755752,275.059933,310.293927,356.814744,375.855091,0,0,0,0,0,0,0,10,0,16
18,2.69752,1,0,0,0,0,0,0,0,21.893169,-28.843048,2.58801,2.48277,4.37185,3.45035,0.704917,3.28571,2.77701,2.89924,4.27254,0.662677,114.211898,176.870448,182.228966,248.91521,285.444299,333.342882,372.171251,397.688722,466.598964,492.124055,0,0,0,0,0,2,0,7,1,11
23,4.23803,1,0,0,0,0,0,0,0,21.895576,-28.84023,3.28571,2.48771,2.21751,1.25042,0.828373,0.704917,2.89924,2.48277,1.20904,2.67864,138.464816,142.017716,153.927274,211.03269,212.195831,219.345351,220.395867,222.542939,251.93832,271.0303,0,0,2,0,1,1,1,4,1,21


In [37]:
test_features_data.shape

(1074, 41)

### Cluster lat long

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# TODO

## Save Raw Dataset

In [38]:
from pathlib import Path

from config import processed_data_dir

out_dir = processed_data_dir / 'meta_data' / dataset_version
safe_create_dir(out_dir)

train_features_data.to_csv(out_dir / 'train.csv')
test_features_data.to_csv(out_dir / 'test.csv')

In [39]:
# Save to references for Stefan :D
ref_dir = Path('../references/')
out_dir = ref_dir / 'meta_data' / dataset_version
safe_create_dir(out_dir)

train_features_data.to_csv(out_dir / 'train.csv')
test_features_data.to_csv(out_dir / 'test.csv')

## Dataset Processing 

### Feature Scaling

In [40]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

cols = ['Area', 'lat', 'lon'] + [c for c in train_features_data.columns if 'distance' in c] + ['Cotton','Dates','Grass','Lucern','Maize','Pecan','Vacant','Vineyard & Pecan ("Intercrop")']

train_features_data[cols] = scaler.fit_transform(train_features_data[cols])
test_features_data[cols] = scaler.transform(test_features_data[cols])

train_features_data[cols].head()

Unnamed: 0_level_0,Area,lat,lon,distance_0,distance_1,distance_2,distance_3,distance_4,distance_5,distance_6,distance_7,distance_8,distance_9,Cotton,Dates,Grass,Lucern,Maize,Pecan,Vacant,"Vineyard & Pecan (""Intercrop"")"
Field_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.589766,1.739183,-1.652844,2.459255,1.570768,1.426737,1.721516,1.299743,1.096452,0.995017,1.2638,1.034786,0.906862,0.990434,-0.146653,-0.482659,-0.58047,-0.672012,1.226142,-0.779975,0.889687
2,-0.157671,1.761063,-1.646056,-0.55707,-0.156189,-0.379945,-0.511104,-0.304332,-0.472433,-0.283296,-0.170441,0.119416,-0.019905,-0.412259,-0.146653,-0.482659,-0.58047,-0.672012,0.357886,-0.779975,0.889687
3,-0.527917,1.753355,-1.655849,0.887871,0.600429,0.352992,0.013337,-0.061299,0.379761,0.298681,0.132774,0.382756,0.324994,0.990434,-0.146653,-0.482659,-0.58047,-0.672012,1.226142,-0.779975,0.889687
4,-0.439169,1.764898,-1.65399,0.868392,0.341103,0.095573,0.096621,0.222124,0.412208,0.395418,0.235328,0.482609,0.375756,0.289088,-0.146653,-0.482659,-0.58047,-0.672012,1.226142,-0.779975,0.889687
7,1.145875,1.765042,-1.634054,0.088822,-0.118496,-0.182612,0.06401,0.031368,-0.224227,-0.374865,-0.425669,-0.044154,-0.189669,-0.412259,-0.146653,-0.482659,-0.58047,-0.672012,1.226142,-0.779975,0.889687


### Save Scaled Data

In [41]:
out_dir = processed_data_dir / 'meta_data' / dataset_version
safe_create_dir(out_dir)

train_features_data.to_csv(out_dir / 'train_scaled.csv')
test_features_data.to_csv(out_dir / 'test_scaled.csv')


# Save to references for Stefan :D
ref_dir = Path('../references/')
out_dir = ref_dir / 'meta_data' / dataset_version
safe_create_dir(out_dir)

train_features_data.to_csv(out_dir / 'train_scaled.csv')
test_features_data.to_csv(out_dir / 'test_scaled.csv')