In [597]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.impute import KNNImputer

# Classificadores:
from sklearn.neighbors import NearestNeighbors

## 1) Loading Data

In [631]:
df_meteorologia = pd.read_csv("..\Dados\Views\dados_meteorologicos.csv", index_col=0)
df_produto_agricola = pd.read_csv(R"..\Dados\Tabela_final\dados_producao_agricola.csv", index_col=0)
df_municipios_sertao = pd.read_csv(r"..\Dados\Views\municipios_sertao.csv", index_col=0)
df_transporte = pd.read_csv(r"..\Dados\Views\custo_de_transporte.csv", index_col=0)
df_recursos_hidricos = pd.read_csv(r"..\Dados\Views\recursos_hidricos.csv", index_col=0)
df_solo = pd.read_csv(r"..\Dados\Views\solos_municipios.csv", index_col=0)

In [632]:
df_classificador = df_municipios_sertao[["IBGE7", "NOME", "LATITUDE", "LONGITUDE"]].copy()

In [633]:
df_classificador.head()

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE
0,2700300,ARAPIRACA,-9.75487,-36.6615
1,2700706,BATALHA,-9.6742,-37.133
2,2700904,BELO MONTE,-9.82272,-37.277
3,2701209,CACIMBINHAS,-9.40121,-36.9911
4,2701605,CANAPI,-9.11932,-37.5967


In [634]:
# Merges com o dataframe do classificador:
df_classificador = df_classificador.merge(df_transporte, on="IBGE7", how="left")
df_classificador = df_classificador.merge(df_meteorologia, on="IBGE7", how="left")
df_classificador = df_classificador.merge(df_recursos_hidricos[["IBGE7", "AREA_IRRIGADA_TOT"]], on="IBGE7", how="left")

# Merge do dataframe de produtos agrícolas com o de municípios do sertão:
df_produto_agricola = df_produto_agricola.merge(df_municipios_sertao[["IBGE7", "NOME"]], how="inner", on="NOME")

In [637]:
df_solo.head()

Unnamed: 0,IBGE7,SOLO,AREA_TOTAL
0,2207959,LAd - Latossolos Amarelos Distroficos,64739.067171
1,2207959,PVAe - Argissolos Vermelho-Amarelos Eutroficos,4865.805456
2,2207959,RQo - Neossolos Quartzarenicos Orticos,24660.252674
3,2207934,PVAe - Argissolos Vermelho-Amarelos Eutroficos,23145.966454
4,2207934,RQo - Neossolos Quartzarenicos Orticos,65225.223645


In [603]:
df_produto_agricola["PRODUTO"].unique()

array(['ALGODAO HERBACEO (EM CAROCO)', 'AMENDOIM (EM CASCA)', 'BANANA ',
       'CASTANHA DE CAJU', 'FAVA (EM GRAO)', 'FEIJAO (EM GRAO)',
       'MANDIOCA', 'MANGA', 'MILHO (EM GRAO)', 'TOMATE',
       'CAFE (EM GRAO) ARABICA', 'CAFE (EM GRAO) TOTAL', 'MELANCIA',
       'MELAO', 'BATATA-DOCE', 'BATATA-INGLESA',
       'CAFE (EM GRAO) CANEPHORA', 'URUCUM ', 'CACAU (EM AMENDOA)',
       'SOJA (EM GRAO)', 'SORGO (EM GRAO)', 'MAMONA ', 'UVA',
       'PIMENTA-DO-REINO', 'TRIGO (EM GRAO)', 'GUARANA '], dtype=object)

In [604]:
rendimentos_max = df_produto_agricola.groupby(["NOME"]).agg({"REND_MEDIO":"max"}).reset_index()
df_agro = rendimentos_max.merge(df_produto_agricola[["NOME", "IBGE7", "REND_MEDIO", "PRODUTO", "VALOR_PROD"]], on=["NOME", "REND_MEDIO"], how="inner")
df_agro = df_agro.drop_duplicates(["NOME", "REND_MEDIO", "PRODUTO"])
max = df_agro.groupby("NOME")["VALOR_PROD"].idxmax()
df_agro_max = df_agro.loc[max].reset_index(drop=True)
df_agro_max.drop(columns=["NOME"], inplace=True)
df_classificador = df_classificador.merge(df_agro_max, on="IBGE7", how="inner")
df_classificador.drop(columns=["REND_MEDIO", "VALOR_PROD"], inplace=True)

In [609]:
df_classificador.head()

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,PRODUTO
0,2700300,ARAPIRACA,-9.75487,-36.6615,24042.166072,988.689803,1051.169614,26.15705,1.553386,236.84,327.176525,TOMATE
1,2700706,BATALHA,-9.6742,-37.133,24257.110578,713.023256,1318.900553,27.754333,1.427155,20.83,0.0,MELANCIA
2,2700904,BELO MONTE,-9.82272,-37.277,24226.863743,713.023256,1318.900553,27.754333,1.427155,20.83,0.0,BATATA-DOCE
3,2701209,CACIMBINHAS,-9.40121,-36.9911,24358.206538,209.280265,1585.450518,26.482644,1.062182,278.01,0.0,MILHO (EM GRAO)
4,2701605,CANAPI,-9.11932,-37.5967,24734.221802,551.553841,1437.777193,27.047011,2.518969,187.0,0.0,TOMATE


## 2) Préprocessamento

In [612]:
X = df_classificador.drop(columns=["IBGE7", "NOME", "PRODUTO"])
y = df_classificador["PRODUTO"].values

In [613]:
# Onehot ecoding:
one_hot = OneHotEncoder()
y_one_hot = one_hot.fit_transform(y.reshape(-1, 1)).toarray()
labels = one_hot.get_feature_names_out()

In [614]:
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

In [615]:
colunas_num = X_train.columns 

In [616]:
imputer = KNNImputer(n_neighbors=5)
X_train_imputado = imputer.fit_transform(X_train)
X_test_imputado = imputer.transform(X_test)

X_train[colunas_num] = X_train_imputado
X_test[colunas_num] = X_test_imputado

In [617]:
min_max = MinMaxScaler()
X_train_min_max = min_max.fit_transform(X_train)
X_test_min_max = min_max.transform(X_test)

## 3) Treinamento do KNN

In [618]:
k_neighburs = 5 
knn = NearestNeighbors(n_neighbors=k_neighburs, n_jobs=-1, metric="cosine")
knn.fit(X_train_min_max)

In [624]:
id_municipio = 2512606
distance, neighbours_indices = knn.kneighbors(X_test_min_max[0].reshape(1, -1), n_neighbors=k_neighburs)

In [625]:
distance

array([[8.66684543e-06, 1.09730299e-05, 3.29966589e-05, 3.63206756e-05,
        5.22502409e-05]])

In [626]:
neighbours_indices

array([[860, 357,  46, 199, 457]], dtype=int64)

In [627]:
df_classificador.iloc[neighbours_indices[0]]

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,PRODUTO
860,2613008,SAO BENTO DO UNA,-8.52637,-36.4465,24657.500838,1324.505535,992.260298,21.119463,2.312078,827.78,0.0,MANDIOCA
357,2303709,CAUCAIA,-3.72797,-38.6619,28155.384992,1754.376872,1014.405638,27.374685,2.400769,29.55,130.488685,MANDIOCA
46,2901403,ANGICAL,-12.0063,-44.7003,26048.498456,663.02494,1215.61717,25.943216,0.488259,474.17,44.0,MANDIOCA
199,2920502,MARACAS,-13.4355,-40.4323,23519.077869,1113.116788,945.7779,20.775875,1.939564,757.42,381.956725,TOMATE
457,2311603,REDENCAO,-4.21587,-38.7277,27899.333585,613.405634,1032.708302,21.698549,2.567818,865.53,0.10593,MANDIOCA


In [628]:
df_classificador[df_classificador["IBGE7"] == id_municipio]

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,PRODUTO
710,2512606,QUIXABA,-7.0224,-37.1458,25753.976661,357.016932,1868.627166,29.75646,1.590968,263.66,0.618036,BANANA
