In [1]:
from prepare_data import prepare_data, crop_id2name


# Band identifiers handed to prepare_data.
# NOTE(review): these look like Sentinel-2 band names — confirm against prepare_data.
bands = [
    'B01', 'B02', 'B03', 'B04',
    'B05', 'B06', 'B07', 'B08',
    'B8A', 'B09', 'B11', 'B12',
]

# prepare_data yields three objects, unpacked in this order: the training table,
# the field -> crop label table, and the test table.
train_data, field_crop_pair, test_data = prepare_data(bands)

1165it [01:08, 17.01it/s]
1165it [00:26, 44.50it/s]
707it [00:16, 43.82it/s]


In [2]:
import pandas as pd

def _mean_features_per_field(pixel_table):
    """Average every column per ``field_id`` and return ``field_id`` as a string column.

    Each id goes through ``int`` before ``str``, so a value such as 12.0 is
    rendered as "12" rather than "12.0".
    """
    per_field = pixel_table.groupby(['field_id']).mean().reset_index()
    per_field['field_id'] = [str(int(raw_id)) for raw_id in per_field['field_id'].to_numpy()]
    return per_field


# Training side: one row of mean features per field, joined with its crop label.
train_data_grouped = _mean_features_per_field(train_data)
train_df = train_data_grouped.merge(field_crop_pair, on='field_id')

# Test side: same aggregation, no labels to join.
test_data_grouped = _mean_features_per_field(test_data)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Encode crop ids as consecutive integers; `le` is kept so predictions can be
# mapped back to the original crop ids later.
le = LabelEncoder()

# Training matrix: every column of train_df except the identifier and the label.
X = train_df.drop(columns=['field_id', 'crop_id']).to_numpy()
y = le.fit_transform(train_df['crop_id'].to_numpy())

# Test matrix plus the matching field ids, kept in the same row order.
X_test = test_data_grouped.drop(columns='field_id').to_numpy()
field_id_test = test_data_grouped['field_id'].to_numpy()

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV

In [5]:
# Tune a random forest with a 4-fold grid search scored by negative log loss.
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [30, 60, 90],
              'max_depth': [3, 5, 7, 9]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      cv=4,
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model (a clone of `clf`, so random_state=42 is
# preserved) instead of constructing and fitting an identical one a second time.
clf0 = clf_cv.best_estimator_

-1.386769568659965
{'max_depth': 3, 'n_estimators': 90}


In [6]:
# Tune a single decision tree with a 4-fold grid search scored by negative log loss.
clf = DecisionTreeClassifier(random_state=42)

param_grid = {'criterion': ['gini', 'entropy'],
              'max_features': ['sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth': [3, 5, 6, 7, 9]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      cv=4,
                      scoring='neg_log_loss',
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model (a clone of `clf`, so random_state=42 is
# preserved) instead of constructing and fitting an identical one a second time.
clf1 = clf_cv.best_estimator_

-1.4810689957732235
{'ccp_alpha': 0.1, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt'}


In [7]:
# Tune a one-hidden-layer MLP with a 4-fold grid search scored by negative log loss.
clf = MLPClassifier(random_state=42)

param_grid = {'hidden_layer_sizes': [300, 400, 500],
              'solver': ['sgd', 'adam'],
              'max_iter': [500]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      cv=4,
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model (a clone of `clf`, so random_state=42 is
# preserved) instead of training an identical network a second time.
clf2 = clf_cv.best_estimator_

-1.4830928494679942
{'hidden_layer_sizes': 300, 'max_iter': 500, 'solver': 'sgd'}


In [8]:
# Tune k for a k-nearest-neighbours classifier with a 4-fold grid search scored
# by negative log loss.
clf = KNeighborsClassifier()

param_grid = {'n_neighbors': [650, 700, 750]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      cv=4,
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model instead of constructing and fitting an
# identical one a second time.
clf3 = clf_cv.best_estimator_

-1.4268215941363347
{'n_neighbors': 750}


In [9]:
# Tune an XGBoost classifier with a 4-fold grid search scored by negative log loss.
clf = XGBClassifier(random_state=42)

param_grid = {'max_depth': [3, 5, 7],
              'n_estimators': [50, 100],
              'learning_rate': [0.1, 0.01, 0.05]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      cv=4,
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model (a clone of `clf`, so random_state=42 is
# preserved) instead of constructing and fitting an identical one a second time.
clf4 = clf_cv.best_estimator_

-1.5661601413949027
{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50}


In [10]:
# Tune a LightGBM classifier with a 4-fold grid search scored by negative log loss.
clf = LGBMClassifier(random_state=42)

param_grid = {'max_depth': [3, 4, 5],
              'n_estimators': [70, 100, 130],
              'learning_rate': [0.1, 0.01, 0.05]}

# refit=True is the default; it is spelled out because the last line relies on it:
# once cross-validation is done, the search retrains the best configuration on
# all of (X, y) and exposes it as best_estimator_.
clf_cv = GridSearchCV(estimator=clf,
                      param_grid=param_grid,
                      scoring='neg_log_loss',
                      cv=4,
                      refit=True,
                      verbose=0)
clf_cv.fit(X, y)
print(clf_cv.best_score_)
print(clf_cv.best_params_)

# Reuse the already-refitted best model (a clone of `clf`, so random_state=42 is
# preserved) instead of constructing and fitting an identical one a second time.
clf5 = clf_cv.best_estimator_

-1.3687950898579901
{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 70}


In [35]:
# Collect each tuned model's class-probability predictions for the test fields.
# Every model in `clfs` was already fitted on (X, y) in the cell that created it,
# so it is not fitted again here; only predict_proba is called.
clfs = [clf0, clf1, clf2, clf3, clf4, clf5]
y_test_preds = []
# The loop variable keeps the name `clf`, so after the loop `clf` refers to the
# last model in `clfs`.
for clf in clfs:
    y_test_preds.append(clf.predict_proba(X_test))

In [36]:
import numpy as np

# Ensemble by simple averaging: stack the per-model probability matrices along a
# new leading axis, then take the mean across that axis.
stacked_probs = np.stack(y_test_preds, axis=0)
y_test_pred = stacked_probs.mean(axis=0)

In [37]:
# Build the submission table (one row per test field, one probability column per
# crop) and write it to disk.
import os

# Every model was trained on y = le.fit_transform(...), so the probability columns
# follow the encoded labels 0..n-1, whose original crop ids are exactly le.classes_
# in that order. Reading them from the encoder avoids depending on whichever model
# a loop variable happened to be left pointing at.
crop_columns = [crop_id2name[crop_id] for crop_id in le.classes_]

test_df = pd.DataFrame(columns=['field_id'] + crop_columns)

test_df['field_id'] = field_id_test
test_df[crop_columns] = y_test_pred

# to_csv does not create missing directories, so make sure the target one exists.
os.makedirs('submissions', exist_ok=True)
test_df.to_csv('submissions/submission.csv', index=False)