In [194]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import KNNImputer

# Classificadores:
from sklearn.neighbors import NearestNeighbors

## 1) Loading Data

In [195]:
# Load the input tables; each CSV is read with its first column as the index.
# Every path is a raw string: in a regular string literal "\D", "\V" and "\d"
# are invalid escape sequences, which recent Python versions report as a
# SyntaxWarning. The resulting path text is unchanged.
# NOTE(review): backslash separators look Windows-specific — confirm the notebook
# is not expected to run on other platforms.
df_meteorologia = pd.read_csv(r"..\Dados\Views\dados_meteorologicos.csv", index_col=0)
df_produto_agricola = pd.read_csv(r"..\Dados\Tabela_final\dados_producao_agricola.csv", index_col=0)
df_municipios_sertao = pd.read_csv(r"..\Dados\Views\municipios_sertao.csv", index_col=0)
df_transporte = pd.read_csv(r"..\Dados\Views\custo_de_transporte.csv", index_col=0)
df_recursos_hidricos = pd.read_csv(r"..\Dados\Views\recursos_hidricos.csv", index_col=0)
df_solo = pd.read_csv(r"..\Dados\Views\solos_municipios.csv", index_col=0)

In [196]:
# Starting table for the neighbour search: identifier, name and coordinates of
# each municipality. Copied so later edits never touch df_municipios_sertao.
df_classificador = df_municipios_sertao.loc[:, ["IBGE7", "NOME", "LATITUDE", "LONGITUDE"]].copy()

In [197]:
# Preview the first rows of the base table (IBGE7, NOME, LATITUDE, LONGITUDE).
df_classificador.head()

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE
0,2700300,ARAPIRACA,-9.75487,-36.6615
1,2700706,BATALHA,-9.6742,-37.133
2,2700904,BELO MONTE,-9.82272,-37.277
3,2701209,CACIMBINHAS,-9.40121,-36.9911
4,2701605,CANAPI,-9.11932,-37.5967


In [198]:
# Enrich the base table, keyed by IBGE7, with transport cost, weather data and
# the total irrigated area. Left joins keep every municipality of the base table.
df_classificador = (
    df_classificador
    .merge(df_transporte, how="left", on="IBGE7")
    .merge(df_meteorologia, how="left", on="IBGE7")
    .merge(df_recursos_hidricos[["IBGE7", "AREA_IRRIGADA_TOT"]], how="left", on="IBGE7")
)

# Attach the IBGE7 code to the crop table and keep only names present in the
# municipality table (inner join on NOME).
# NOTE(review): this join is by name only — confirm NOME is unique in
# df_municipios_sertao, otherwise crop rows get duplicated across codes.
municipios_chave = df_municipios_sertao[["IBGE7", "NOME"]]
df_produto_agricola = df_produto_agricola.merge(municipios_chave, on="NOME", how="inner")

In [199]:
# Preview the soil table: one row per (IBGE7, SOLO) pair with its AREA_TOTAL.
df_solo.head()

Unnamed: 0,IBGE7,SOLO,AREA_TOTAL
0,2207959,LAd - Latossolos Amarelos Distroficos,64739.067171
1,2207959,PVAe - Argissolos Vermelho-Amarelos Eutroficos,4865.805456
2,2207959,RQo - Neossolos Quartzarenicos Orticos,24660.252674
3,2207934,PVAe - Argissolos Vermelho-Amarelos Eutroficos,23145.966454
4,2207934,RQo - Neossolos Quartzarenicos Orticos,65225.223645


In [200]:
# List the distinct crop names found in the PRODUTO column.
df_produto_agricola["PRODUTO"].unique()

array(['ALGODAO HERBACEO (EM CAROCO)', 'AMENDOIM (EM CASCA)', 'BANANA ',
       'CASTANHA DE CAJU', 'FAVA (EM GRAO)', 'FEIJAO (EM GRAO)',
       'MANDIOCA', 'MANGA', 'MILHO (EM GRAO)', 'TOMATE',
       'CAFE (EM GRAO) ARABICA', 'CAFE (EM GRAO) TOTAL', 'MELANCIA',
       'MELAO', 'BATATA-DOCE', 'BATATA-INGLESA',
       'CAFE (EM GRAO) CANEPHORA', 'URUCUM ', 'CACAU (EM AMENDOA)',
       'SOJA (EM GRAO)', 'SORGO (EM GRAO)', 'MAMONA ', 'UVA',
       'PIMENTA-DO-REINO', 'TRIGO (EM GRAO)', 'GUARANA '], dtype=object)

In [201]:
# For each municipality name, select the crop with the highest production value
# (VALOR_PROD); when several crops tie on that value, keep the one with the
# highest average yield (REND_MEDIO).
rendimentos_max = df_produto_agricola.groupby(["NOME"]).agg({"VALOR_PROD": "max"}).reset_index()
df_agro = rendimentos_max.merge(
    df_produto_agricola[["NOME", "IBGE7", "REND_MEDIO", "PRODUTO", "VALOR_PROD"]],
    on=["NOME", "VALOR_PROD"],
    how="inner",
)
df_agro = df_agro.drop_duplicates(["NOME", "VALOR_PROD", "PRODUTO"])

# Row labels of the best-yield candidate per name. Deliberately not called
# `max`: that would shadow the built-in for every later cell of the notebook.
idx_maior_rendimento = df_agro.groupby("NOME")["REND_MEDIO"].idxmax()
df_agro_max = (
    df_agro.loc[idx_maior_rendimento]
    .reset_index(drop=True)
    .drop(columns=["NOME"])
)

# Attach the winning crop to the feature table as PRODUTO_MAIOR_VALOR; the
# helper columns used for the selection are dropped again. The inner join
# removes municipalities without any crop record.
df_classificador = (
    df_classificador
    .merge(df_agro_max, on="IBGE7", how="inner")
    .drop(columns=["REND_MEDIO", "VALOR_PROD"])
    .rename(columns={"PRODUTO": "PRODUTO_MAIOR_VALOR"})
)

In [202]:
# Build a wide table with one row per IBGE7 code and one column per soil class
# holding its AREA_TOTAL; soil classes absent from a municipality become 0.
# pivot_table is left on its default aggregation for repeated (IBGE7, SOLO) pairs.
df_solo_esparsa = (
    df_solo
    .pivot_table(values="AREA_TOTAL", index="IBGE7", columns="SOLO")
    .fillna(0)
    .reset_index()
)
# Inner join: only municipalities that have soil records remain.
df_classificador = df_classificador.merge(df_solo_esparsa, on="IBGE7", how="inner")

## 2) Pré-processamento

In [203]:
# Feature matrix: everything except the identifier, the name and the
# top-crop label column.
X = df_classificador.drop(columns=["IBGE7", "NOME", "PRODUTO_MAIOR_VALOR"])

In [204]:
# Soil-class names; each one is a column of X after the soil pivot was merged in.
cat_columns = df_solo["SOLO"].unique()

# Set copy for membership tests, instead of scanning the array for every column.
_solos = set(cat_columns)

# Every feature that is not a soil-area column.
colunas_num = [coluna for coluna in X.columns if coluna not in _solos]
# Same list without the coordinates, which are kept out of the log transform.
colunas_log = [coluna for coluna in colunas_num if coluna not in ("LATITUDE", "LONGITUDE")]

In [205]:
# Apply a log transformation to rein in the variability of the data.
# log1p is the forward function and expm1 is registered as its inverse.
log_transformation = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)
# NOTE(review): X is overwritten in place, so re-running this cell applies the
# logarithm a second time to the already transformed columns.
X[colunas_log] = log_transformation.transform(X[colunas_log])

In [206]:
# Apply a MinMax scaler to bound the range of the data.
min_max = MinMaxScaler()
# All columns of X are rescaled, soil-area columns included.
colunas_totais = X.columns
# NOTE(review): scaling runs before imputation, so X may still hold missing
# values here — presumably the scaler is expected to pass them through; confirm.
X[colunas_totais] = min_max.fit_transform(X[colunas_totais])

In [207]:
# Fill missing values from the 5 nearest rows of the scaled feature matrix.
imputer = KNNImputer(n_neighbors=5)
# Rebuild the frame with X's own column names and row labels in one step, so
# label-based lookups on X_imputado stay aligned with df_classificador instead
# of depending on a fresh 0..n-1 index.
X_imputado = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [208]:
X_imputado

Unnamed: 0,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,AGUA,...,RLe - Neossolos Litolicos Eutroficos,RQo - Neossolos Quartzarenicos Orticos,RRe - Neossolos Regoliticos Eutroficos,RYve - Neossolos Fluvicos Ta Eutroficos,SNo - Planossolos Natricos Orticos,SXe - Planossolos Haplicos Eutroficos,TCo - Luvissolos Cromicos Orticos,TCp - Luvissolos Cromicos Palicos,VEo - Vertissolos Ebanicos Orticos,VXo - Vertissolos Haplicos Orticos
0,0.529659,0.867505,0.295598,0.819860,0.661136,0.647071,0.410457,0.675412,0.538109,0.000000,...,0.000000,0.000000,0.000000,0.0,0.00000,0.000006,0.0,0.0,0.0,0.0
1,0.535109,0.820752,0.324910,0.724461,0.770893,0.770942,0.382409,0.215853,0.000000,0.000000,...,0.052826,0.000000,0.000000,0.0,0.00000,0.017522,0.0,0.0,0.0,0.0
2,0.525075,0.806473,0.320801,0.724461,0.770893,0.770942,0.382409,0.215853,0.000000,0.009459,...,0.064447,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0
3,0.553553,0.834822,0.338606,0.367246,0.859949,0.672902,0.292261,0.706131,0.000000,0.000000,...,0.000000,0.000000,0.015099,0.0,0.00000,0.078041,0.0,0.0,0.0,0.0
4,0.572598,0.774772,0.389055,0.649550,0.812645,0.716960,0.587898,0.630163,0.000000,0.000000,...,0.000000,0.000000,0.307012,0.0,0.00000,0.093297,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1221,0.476904,0.790509,0.274933,0.527338,0.703019,0.713979,0.660902,0.655088,0.000000,0.000000,...,0.035625,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0
1222,0.487106,0.795903,0.283623,0.547235,0.904856,0.744446,0.621853,0.701235,0.000000,0.000000,...,0.022745,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0
1223,0.463189,0.753651,0.278661,0.293501,0.789307,0.637230,0.188926,0.759405,0.330814,0.000000,...,0.085501,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0
1224,0.499152,0.845660,0.271919,0.944977,0.870876,0.703595,0.320304,0.000000,0.545207,0.006516,...,0.004954,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,0.0


## 3) Treinamento do KNN

In [209]:
# Number of neighbours to retrieve per municipality.
# NOTE(review): the name is misspelled ("neighburs") but the later kneighbors
# query cell reads it too, so it has to be renamed in both places at once.
k_neighburs = 5 
# Unsupervised nearest-neighbour index using cosine distance, on all CPU cores.
knn = NearestNeighbors(n_neighbors=k_neighburs, n_jobs=-1, metric="cosine")
knn.fit(X_imputado)

In [210]:
# Feature vector of row 160, reshaped into a single-sample 2-D array (1 x n_features).
X_imputado.loc[160].values.reshape(1, -1)

array([[0.42467143, 0.71235213, 0.25755768, 0.74997969, 0.65324594,
        0.58740077, 0.53053811, 0.62497612, 0.60086833, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.01846579, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.24404082, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.11930207, 0.        , 0.        , 0.13028539,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

In [211]:
# Query the fitted index with the same matrix it was trained on, asking for one
# extra neighbour: the first hit of each row is discarded afterwards
# (presumably it is the row itself, at distance 0 — verify for duplicated rows).
distance, neighbours_indices = knn.kneighbors(X_imputado, n_neighbors=k_neighburs+1)

In [212]:
# Neighbours of row 160 with the first (closest) hit removed, leaving
# k_neighburs entries. The indices are positions in X_imputado.
distancias_corrigidas = distance[160][1:]
neighbours_indices_corrigidos = neighbours_indices[160][1:]

In [229]:
# Feature-space ("logical") neighbours of ITAPICURU: show identifier, name, the
# non-soil features and the top crop for the rows returned by the KNN query.
colunas_avaliadoras = ["IBGE7", "NOME", *colunas_num, "PRODUTO_MAIOR_VALOR"]
df_classificador.iloc[neighbours_indices_corrigidos].loc[:, colunas_avaliadoras]

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,PRODUTO_MAIOR_VALOR
146,2913705,INHAMBUPE,-11.781,-38.355,23560.574098,729.771173,1066.567575,25.173941,2.63464,337.79,1770.04332,MILHO (EM GRAO)
257,2926608,RIBEIRA DO POMBAL,-10.8373,-38.5382,24140.72656,778.182862,1034.162314,25.419589,2.172381,182.0,509.0,MILHO (EM GRAO)
256,2926509,RIBEIRA DO AMPARO,-11.0421,-38.4242,23986.17667,778.182862,1034.162314,25.419589,2.172381,182.0,1545.74325,MELAO
219,2922904,NOVA SOURE,-11.2329,-38.4871,23907.00769,778.182862,1034.162314,25.419589,2.172381,182.0,497.0,MELANCIA
105,2907905,CIPO,-11.1032,-38.5179,23988.945974,778.182862,1034.162314,25.419589,2.172381,182.0,219.0,TOMATE


In [232]:
# The reference municipality itself (row label 160), with the same columns as
# the neighbour table so both can be compared side by side.
df_classificador.loc[[160], colunas_avaliadoras]

Unnamed: 0,IBGE7,NOME,LATITUDE,LONGITUDE,TRANSPORT_COST,PREC_MED,RED_MED,TEMP_MED,VEL_MED,ALTITUDE,AREA_IRRIGADA_TOT,PRODUTO_MAIOR_VALOR
160,2916500,ITAPICURU,-11.3088,-38.2262,23766.055025,778.182862,1034.162314,25.419589,2.172381,182.0,644.0,MILHO (EM GRAO)


In [225]:
# Crop records of ITAPICURU.
df_produto_agricola.loc[lambda tabela: tabela["NOME"] == "ITAPICURU"]

Unnamed: 0,NOME,PRODUTO,AREA_PLANTADA,AREA_COLHIDA,REND_MEDIO,VALOR_PROD,IBGE7,REND_POR_AREA_PLANTADA
3498,ITAPICURU,BANANA,20.0,20.0,13433.333333,569.666667,2916500,671.666667
3499,ITAPICURU,CASTANHA DE CAJU,1295.666667,1295.666667,310.0,1535.0,2916500,0.239259
3500,ITAPICURU,FEIJAO (EM GRAO),1006.666667,1006.666667,270.666667,1241.333333,2916500,0.268874
3501,ITAPICURU,MANDIOCA,6266.666667,4133.333333,9092.666667,28589.0,2916500,1.450957
3502,ITAPICURU,MELANCIA,133.666667,133.666667,8848.333333,880.333333,2916500,66.197007
3503,ITAPICURU,MELAO,223.666667,223.666667,21234.666667,4674.666667,2916500,94.938897
3504,ITAPICURU,MILHO (EM GRAO),19333.333333,19333.333333,3689.0,70163.666667,2916500,0.19081
3505,ITAPICURU,SOJA (EM GRAO),51.666667,51.666667,2995.0,309.0,2916500,57.967742


In [231]:
# Soil classes recorded for ITAPICURU (IBGE7 code 2916500).
df_solo.loc[lambda tabela: tabela["IBGE7"].isin([2916500])]

Unnamed: 0,IBGE7,SOLO,AREA_TOTAL
3023,2916500,LAd - Latossolos Amarelos Distroficos,18032.365183
3024,2916500,PVAd - Argissolos Vermelho-Amarelos Distroficos,46579.916148
3025,2916500,RQo - Neossolos Quartzarenicos Orticos,86920.869936
3026,2916500,SNo - Planossolos Natricos Orticos,15514.030117


In [234]:
# Ratio between two "RQo - Neossolos Quartzarenicos Orticos" areas copied from
# the soil tables: ITAPICURU (2916500) divided by CIPO (2907905).
86920.869936/2501.373861

34.74925171771434

In [None]:
# Leftover literal in a cell that was never executed: the same value as the
# RQo area ratio computed in the preceding cell.
34.74925171771434

In [230]:
# Soil classes of the feature-space ("logical") neighbours; the IBGE7 codes are
# typed in by hand from the neighbour table.
df_solo.loc[lambda tabela: tabela["IBGE7"].isin([2913705, 2926608, 2926509, 2922904, 2907905])]

Unnamed: 0,IBGE7,SOLO,AREA_TOTAL
1275,2913705,LAd - Latossolos Amarelos Distroficos,63582.129104
1276,2913705,PVAd - Argissolos Vermelho-Amarelos Distroficos,26682.523981
1277,2913705,PVAe - Argissolos Vermelho-Amarelos Eutroficos,39266.429269
1810,2907905,LAd - Latossolos Amarelos Distroficos,11816.751307
1811,2907905,RQo - Neossolos Quartzarenicos Orticos,2501.373861
2961,2926608,LAd - Latossolos Amarelos Distroficos,4236.562031
2962,2926608,PVAd - Argissolos Vermelho-Amarelos Distroficos,1455.611775
2963,2926608,RQo - Neossolos Quartzarenicos Orticos,75067.891352
3011,2922904,LAd - Latossolos Amarelos Distroficos,73498.034469
3012,2922904,PVAe - Argissolos Vermelho-Amarelos Eutroficos,25392.652704


In [217]:
# Yield per planted area: REND_MEDIO divided element-wise by AREA_PLANTADA.
# The tables displayed in other cells already show this column, so this cell
# has to run before them.
df_produto_agricola["REND_POR_AREA_PLANTADA"] = df_produto_agricola["REND_MEDIO"].div(
    df_produto_agricola["AREA_PLANTADA"]
)

In [218]:
# Yield per area for ITAPICURU's geographic neighbours that have exports
# (codes typed in by hand; the list also contains ITAPICURU itself, 2916500).
df_produto_agricola.loc[lambda tabela: tabela["IBGE7"].isin([2807402, 2922904, 2916500])]

Unnamed: 0,NOME,PRODUTO,AREA_PLANTADA,AREA_COLHIDA,REND_MEDIO,VALOR_PROD,IBGE7,REND_POR_AREA_PLANTADA
3498,ITAPICURU,BANANA,20.0,20.0,13433.333333,569.666667,2916500,671.666667
3499,ITAPICURU,CASTANHA DE CAJU,1295.666667,1295.666667,310.0,1535.0,2916500,0.239259
3500,ITAPICURU,FEIJAO (EM GRAO),1006.666667,1006.666667,270.666667,1241.333333,2916500,0.268874
3501,ITAPICURU,MANDIOCA,6266.666667,4133.333333,9092.666667,28589.0,2916500,1.450957
3502,ITAPICURU,MELANCIA,133.666667,133.666667,8848.333333,880.333333,2916500,66.197007
3503,ITAPICURU,MELAO,223.666667,223.666667,21234.666667,4674.666667,2916500,94.938897
3504,ITAPICURU,MILHO (EM GRAO),19333.333333,19333.333333,3689.0,70163.666667,2916500,0.19081
3505,ITAPICURU,SOJA (EM GRAO),51.666667,51.666667,2995.0,309.0,2916500,57.967742
5073,NOVA SOURE,AMENDOIM (EM CASCA),40.0,40.0,1200.0,151.333333,2922904,30.0
5074,NOVA SOURE,BANANA,5.0,5.0,15200.0,161.0,2922904,3040.0


In [221]:
# Yield per area for ITAPICURU's feature-space ("logical") neighbours; the
# IBGE7 codes are typed in by hand from the neighbour table.
df_produto_agricola.loc[lambda tabela: tabela["IBGE7"].isin([2913705, 2926608, 2926509, 2922904, 2907905])]

Unnamed: 0,NOME,PRODUTO,AREA_PLANTADA,AREA_COLHIDA,REND_MEDIO,VALOR_PROD,IBGE7,REND_POR_AREA_PLANTADA
1952,CIPO,AMENDOIM (EM CASCA),60.0,60.0,539.0,101.0,2907905,8.983333
1953,CIPO,BANANA,15.333333,15.333333,8455.666667,213.333333,2907905,551.456522
1954,CIPO,CASTANHA DE CAJU,72.0,72.0,500.0,136.0,2907905,6.944444
1955,CIPO,FEIJAO (EM GRAO),131.333333,131.333333,99.0,52.0,2907905,0.753807
1956,CIPO,MANDIOCA,17.0,17.0,2941.333333,36.666667,2907905,173.019608
1957,CIPO,MANGA,4.0,4.0,2000.0,9.333333,2907905,500.0
1958,CIPO,MELANCIA,15.333333,15.333333,13933.333333,154.333333,2907905,908.695652
1959,CIPO,MILHO (EM GRAO),135.0,135.0,249.0,34.666667,2907905,1.844444
1960,CIPO,TOMATE,3.666667,3.666667,39361.0,371.333333,2907905,10734.818182
3214,INHAMBUPE,AMENDOIM (EM CASCA),55.0,55.0,1333.333333,272.333333,2913705,24.242424
