In [1]:
import geopandas as gpd
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Whole tiles
# Forward slash keeps the path portable and free of backslash escape
# sequences ('\c' is not a valid escape in a normal string literal).
tiles_df = gpd.read_file('output/covering_tiles.json')
tiles_df.columns

Index(['TileName', 'geometry'], dtype='object')

In [4]:
# Match tiles to fields
# Load the tile -> field mapping, discard incomplete rows and store the field
# ids as 32-bit integers. Built as one chain so re-running the cell always
# starts again from the CSV on disk.
tiles_to_fields_df = (
    pd.read_csv('output/tiles_to_fields.csv')
    .dropna()
    .astype({"FieldId": 'int32'})
)

# Number of mapping rows per tile, as a flat (TileName, FieldCount) frame.
tiles_counts = (
    tiles_to_fields_df
    .groupby('TileName')
    .size()
    .reset_index(name='FieldCount')
)

# Attach the tile geometry (inner join on the tile name) and list the tiles
# with the most fields first.
join_df = (
    tiles_counts
    .merge(tiles_df, on='TileName')
    .sort_values('FieldCount', ascending=False)
)
join_df

Unnamed: 0,TileName,FieldCount,geometry
491,17TKE,13113,"POLYGON Z ((-84.54546 40.59640 0.00000, -83.24..."
436,16TGK,12987,"POLYGON Z ((-84.63579 40.62665 0.00000, -83.33..."
492,17TKF,6986,"POLYGON Z ((-84.59412 41.49565 0.00000, -83.28..."
437,16TGL,6982,"POLYGON Z ((-84.60330 41.52686 0.00000, -83.28..."
455,17SKD,6834,"POLYGON Z ((-84.49898 39.69751 0.00000, -83.21..."
...,...,...,...
114,13SGA,1,"POLYGON Z ((-102.75207 37.02528 0.00000, -101...."
1028,37VCD,1,"POLYGON Z ((35.64383 57.69781 0.00000, 37.4851..."
745,30VWJ,1,"POLYGON Z ((-3.00034 57.74230 0.00000, -1.1565..."
1030,37VDD,1,"POLYGON Z ((37.32001 57.73116 0.00000, 39.1639..."


In [12]:
import os

from reader import load_fields

# When False, the splitting below is skipped entirely.
rewrite=True
if rewrite:
    i = 0
    # presumably skips the header row of the input file -- confirm against
    # reader.load_fields
    skip = 1
    max_rows_per_chunk = 10000
    input_file = 'input/results-all.txt'

    # The chunk files are written into tmp/, so make sure it exists.
    os.makedirs('tmp', exist_ok=True)

    # The header line does not change between chunks: read it once instead of
    # reopening the input file on every iteration.
    with open(input_file, encoding='utf-16') as f:
        column_names = f.readline().strip().split('\t')

    while True:
        fields_df = load_fields(input_file, skip, max_rows_per_chunk, column_names)
        if len(fields_df) == 0:
            # An empty chunk is treated as the end of the input.
            break
        else:
            # Store the chunk without its 'geometry' column as a tab-separated file.
            fields_df.drop(columns=['geometry']).to_csv(f'tmp/split{i}-{max_rows_per_chunk}.csv', index=False, sep='\t')
            i += 1
            # NOTE(review): skip advances by the number of fields returned, not
            # by max_rows_per_chunk. If load_fields can return fewer fields than
            # the rows it consumed, consecutive chunks may overlap -- confirm
            # against reader.load_fields.
            skip += len(fields_df)

Parsing WKT...
9997 fields
Parsing WKT...
9878 fields
Parsing WKT...
9918 fields
Parsing WKT...
10000 fields
Parsing WKT...
9939 fields
Parsing WKT...
9991 fields
Parsing WKT...
9995 fields
Parsing WKT...
10000 fields
Parsing WKT...
9999 fields
Parsing WKT...
9993 fields
Parsing WKT...
10000 fields
Parsing WKT...
9995 fields
Parsing WKT...
9998 fields
Parsing WKT...
10000 fields
Parsing WKT...
9988 fields
Parsing WKT...
10000 fields
Parsing WKT...
9999 fields
Parsing WKT...
9981 fields
Parsing WKT...
10000 fields
Parsing WKT...
9891 fields
Parsing WKT...
9408 fields
Parsing WKT...
42 fields
Parsing WKT...
0 fields


In [15]:
# Automatic candidates: the ten tiles with the highest field count.
top_k_field_counts = join_df.sort_values(by='FieldCount', ascending=False).iloc[:10]
collect_tiles = top_k_field_counts["TileName"].tolist()
print(len(collect_tiles), 'auto tiles:', collect_tiles)

# Override with manual tiles to reduce overlapping; the automatic list above
# is only printed for reference.
# manual_collect_tiles = ['17TLF', '16TGL', '17TKE', '17SKD', '16TFL']
manual_collect_tiles = ['16TFL']
collect_tiles = manual_collect_tiles
print(len(collect_tiles), 'tiles:', collect_tiles)

# Mapping rows that belong to one of the selected tiles.
collect_fields = tiles_to_fields_df.loc[
    tiles_to_fields_df["TileName"].isin(collect_tiles)
]
print(len(collect_fields), 'fields')

# Distinct field ids covered by the selected tiles.
unique_collect_fields = set(collect_fields["FieldId"])
print(len(unique_collect_fields), 'unique fields')

10 auto tiles: ['17TKE', '16TGK', '17TKF', '16TGL', '17SKD', '16SGJ', '17TLF', '15TVG', '16TFL', '18TUN']
1 tiles: ['16TFL']
3348 fields
3348 unique fields


In [19]:
from pathlib import Path

# Every CSV in tmp/, ordered by its path string.
csvs = sorted(Path('tmp').glob('*.csv'), key=str)
print('files:', csvs)
# csvs = csvs[:1]  # NOTE: test


def filter_to_collect(df):
    """Keep only the rows whose FieldId is in ``unique_collect_fields``.

    Passed to ``load_fields`` as ``filter_fn``. Prints the row count before
    and after filtering and returns the filtered frame.
    """
    print('filter starts with', len(df), 'rows')
    df = df[df["FieldId"].isin(unique_collect_fields)]
    print('filter ends with', len(df), 'rows')
    return df


# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
chunk_frames = []
for fname in csvs:
    fields_df = load_fields(fname, 0, None, None, encoding='utf-8', filter_fn=filter_to_collect)
    print(len(fields_df), 'fields in', fname)
    chunk_frames.append(fields_df)

if not chunk_frames:
    # Fail with an explicit message instead of an opaque error further down.
    raise FileNotFoundError("no CSV files found in tmp/")

# ignore_index=True already yields a fresh 0..n-1 index.
all_fields_df = pd.concat(chunk_frames, ignore_index=True)
print(len(all_fields_df), 'fields in total')
all_fields_df.to_file(driver = 'ESRI Shapefile', filename= "tmp/field_polygons.shp")


files: [WindowsPath('tmp/split0-10000.csv'), WindowsPath('tmp/split1-10000.csv'), WindowsPath('tmp/split10-10000.csv'), WindowsPath('tmp/split11-10000.csv'), WindowsPath('tmp/split12-10000.csv'), WindowsPath('tmp/split13-10000.csv'), WindowsPath('tmp/split14-10000.csv'), WindowsPath('tmp/split15-10000.csv'), WindowsPath('tmp/split16-10000.csv'), WindowsPath('tmp/split17-10000.csv'), WindowsPath('tmp/split18-10000.csv'), WindowsPath('tmp/split19-10000.csv'), WindowsPath('tmp/split2-10000.csv'), WindowsPath('tmp/split20-10000.csv'), WindowsPath('tmp/split21-10000.csv'), WindowsPath('tmp/split3-10000.csv'), WindowsPath('tmp/split4-10000.csv'), WindowsPath('tmp/split5-10000.csv'), WindowsPath('tmp/split6-10000.csv'), WindowsPath('tmp/split7-10000.csv'), WindowsPath('tmp/split8-10000.csv'), WindowsPath('tmp/split9-10000.csv')]
filter starts with 9997 rows
filter ends with 10 rows
Parsing WKT...
10 fields
10 fields in tmp\split0-10000.csv
filter starts with 9878 rows
filter ends with 359 row

In [21]:
# Register KML as a read/write driver in the fiona driver table reached
# through geopandas, then export the collected fields as KML.
gpd.io.file.fiona.drvsupport.supported_drivers.update({'KML': 'rw'})
all_fields_df.to_file("tmp/field_polygons.kml", driver='KML')