In [875]:
import random
import numpy as np


seed = 3
random.seed(seed)
np.random.seed(seed)

# Help Function

In [876]:
import json


def unpack_geo(data):
    geo = np.array([json.loads(son) for son in np.array(data['.geo'])])
    field_x = np.zeros(len(geo))
    field_y = np.zeros(len(geo))

    for i, field in enumerate(geo):
        center = []
        if field['type'] == 'Polygon':
            for coordinates in field['coordinates']:
                center += coordinates
            
        if field['type'] == 'MultiPolygon':
            for coordinates in field['coordinates']:
                for coord in coordinates:
                    center += coord

        if field['type'] == 'GeometryCollection':
            for element in field['geometries']:
                if element['type'] == 'LineString':
                    center += element['coordinates']

                if element['type'] == 'Polygon':
                    for coordinates in element['coordinates']:
                        center += coordinates

        center = torch.tensor(center).reshape(-1, 2).mean(dim=0)
        field_x[i] = float(center[0])
        field_y[i] = float(center[1])

    return field_x, field_y

In [877]:
import json


def unpack_geo2(data):
    geo = np.array([json.loads(son) for son in np.array(data['.geo'])])
    n = len(geo)
    field_type = {'Polygon': np.zeros(n), 
                  'MultiPolygon': np.zeros(n), 
                  'GeometryCollection': np.zeros(n)}

    for i, field in enumerate(geo):
        field_type[field['type']][i] = 1

    return field_type

In [878]:
def miss(data):
    missing = data.isin([0]).sum()
    mis_per = 100 * missing / len(data)
    mis_table = pd.DataFrame({"Miss count": missing, "Miss percent": mis_per}).sort_values(by = "Miss percent", ascending=False)
    print(mis_table[mis_table['Miss count'] != 0])

In [879]:
def fill_gaps(data, nepoch=3):
    data_np = np.array(data)

    for epoch in range(nepoch):
        for i, field in enumerate(data_np):
            for j, nd in enumerate(field):
                if nd == 0:
                    if j == 0:
                        data_np[i][j] = field[j+1]
                        continue

                    if j == len(field) - 1:
                        data_np[i][j] = field[j-1]
                        continue

                    data_np[i][j] = 0.5 * (field[j+1] + field[j-1])
    return data_np

# Data

In [880]:
import torch
import pandas as pd


data = pd.read_csv('train.csv').drop(columns=['id'])
data_test = pd.read_csv('test.csv')
id = data_test['id']
data_test = data_test.drop(columns=['id'])

In [881]:
keys1 = ['crop', 'area', '.geo']
keys2 = [keys for keys in data.columns if keys not in keys1]
for keys in keys2:
    data = data[data[keys] >= 0]

In [882]:
#data[keys2] = fill_gaps(data[keys2], 3)
#data_test[keys2] = fill_gaps(data_test[keys2], 3)

#data[keys2] = data[keys2].transpose().replace(to_replace=0, method='ffill').transpose()
#data[keys2] = data[keys2].transpose().replace(to_replace=0, method='bfill').transpose()
#data_test[keys2] = data_test[keys2].transpose().replace(to_replace=0, method='ffill').transpose()
#data_test[keys2] = data_test[keys2].transpose().replace(to_replace=0, method='bfill').transpose()

In [883]:
keys2 = sorted(keys2)
data = data[keys1 + keys2]
data_test = data_test[['area', '.geo'] + keys2]

In [884]:
data['center_x'], data['center_y'] = unpack_geo(data)
data_test['center_x'], data_test['center_y'] = unpack_geo(data_test)

for key, array in unpack_geo2(data).items():
    data[key] = array
for key, array in unpack_geo2(data_test).items():
    data_test[key] = array

data = data.drop(columns=['.geo'])
data_test = data_test.drop(columns=['.geo'])

# Model

In [885]:
#!pip install catboost

In [886]:
from sklearn.model_selection import train_test_split


X_train, X_valid, y_train, y_valid = train_test_split(np.array(data.drop(columns='crop')),
                                                      np.array(data['crop']),
                                                      test_size=0.1,
                                                      shuffle=True,
                                                      stratify=np.array(data['crop']))

In [887]:
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import classification_report
import lightgbm
import catboost
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB


def score(model, X_valid, y_valid):
    preds = model.predict(X_valid)
    print(classification_report(y_valid, preds))
    print('recall: ', recall_score(y_valid, preds, average="macro", zero_division=0))


clf10_1 = lightgbm.LGBMClassifier()
clf10_2 = catboost.CatBoostClassifier()

clf10 = VotingClassifier(estimators=[('lgb', clf10_1), ('cbc', clf10_2)], voting='soft')
#clf10 = catboost.CatBoostClassifier()
#clf10 = lightgbm.LGBMClassifier()

clf10.fit(X_train, y_train)
score(clf10, X_valid, y_valid)

Learning rate set to 0.085248
0:	learn: 1.7364013	total: 91.9ms	remaining: 1m 31s
1:	learn: 1.5624018	total: 159ms	remaining: 1m 19s
2:	learn: 1.4357338	total: 236ms	remaining: 1m 18s
3:	learn: 1.3405662	total: 307ms	remaining: 1m 16s
4:	learn: 1.2557408	total: 378ms	remaining: 1m 15s
5:	learn: 1.1845561	total: 446ms	remaining: 1m 13s
6:	learn: 1.1103479	total: 517ms	remaining: 1m 13s
7:	learn: 1.0460078	total: 588ms	remaining: 1m 12s
8:	learn: 0.9927407	total: 657ms	remaining: 1m 12s
9:	learn: 0.9349004	total: 728ms	remaining: 1m 12s
10:	learn: 0.8885265	total: 797ms	remaining: 1m 11s
11:	learn: 0.8500860	total: 869ms	remaining: 1m 11s
12:	learn: 0.8118233	total: 943ms	remaining: 1m 11s
13:	learn: 0.7792640	total: 1.01s	remaining: 1m 11s
14:	learn: 0.7428812	total: 1.08s	remaining: 1m 11s
15:	learn: 0.7169441	total: 1.15s	remaining: 1m 10s
16:	learn: 0.6869746	total: 1.22s	remaining: 1m 10s
17:	learn: 0.6595062	total: 1.3s	remaining: 1m 10s
18:	learn: 0.6373313	total: 1.38s	remaining:

# LSTM+Dense

In [765]:
import torch


X_train_3d = torch.tensor(X_train).reshape(len(X_train), 1, -1).detach().numpy()
X_valid_3d = torch.tensor(X_valid).reshape(len(X_valid), 1, -1).detach().numpy()

In [766]:
from keras import layers
from keras.models import Sequential
from keras.optimizers import Adam
import tensorflow as tf
tf.random.set_seed(seed)

def score(model, X_valid, y_valid):
    preds = torch.tensor(model.predict(X_valid_3d)).argmax(dim=1).detach().numpy()
    print(classification_report(y_valid, preds))
    print('recall: ', recall_score(y_valid, preds, average="macro", zero_division=0))

model = Sequential()
model.add(layers.Input(shape=(None, 76)))
model.add(layers.LSTM(71*2, activation='sigmoid'))
#model.add(layers.Dense(64, activation='sigmoid'))
model.add(layers.Dropout(0.10))
model.add(layers.Dense(7, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.fit(X_train_3d, y_train, epochs=60, batch_size=128)
score(model, X_valid_3d, y_valid)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        73
           1       1.00      1.00      1.00        70
           2       0.99      0.99      0.99        70
           3       1.00      1.00      1.00        65
           4      

# Result

In [767]:
X_test_3d = torch.tensor(np.array(data_test)).reshape(len(np.array(data_test)), 
                                                      1, -1).detach().numpy()
preds = torch.tensor(model.predict(X_test_3d)).argmax(dim=1).detach().numpy()
result = pd.DataFrame({'id': id, 'crop': preds})
result.to_csv('solution.csv', index=False)
result                                               



Unnamed: 0,id,crop
0,611,3
1,6417,2
2,3352,3
3,4224,1
4,3102,6
...,...,...
2066,1743,5
2067,3968,3
2068,3809,2
2069,5485,6


In [807]:
X_test = np.array(data_test)
preds = torch.tensor(clf10.predict(X_test)).reshape(-1)
result = pd.DataFrame({'id': id, 'crop': preds})
result.to_csv('solution.csv', index=False)
result

Unnamed: 0,id,crop
0,611,3
1,6417,2
2,3352,3
3,4224,1
4,3102,6
...,...,...
2066,1743,5
2067,3968,3
2068,3809,2
2069,5485,6
