In [1]:
import os
import sys
sys.path.append('..')

import pandas as pd
import numpy as np

import plotly.express as px
from plotly.subplots import make_subplots

import umap

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

from src.features.config import CYEConfigPreProcessor
from src.features.preprocessing import CYEDataPreProcessor

from src.constants import get_constants
# Project constants object; its `target_column` attribute is read further down
# to separate the target from the feature columns.
cst = get_constants()

In [2]:
# Build the project preprocessor from its default configuration.
config = CYEConfigPreProcessor()
preprocessor = CYEDataPreProcessor(config=config)

# Raw training table, indexed by the 'ID' column.
df_train = pd.read_csv('../data/raw/Train.csv', index_col='ID')

# Target is kept as read from the file; features are every other column,
# fitted and transformed by the preprocessor in a single step.
y_train = df_train[cst.target_column]
X_train = preprocessor.fit_transform(df_train.drop(columns=cst.target_column))

In [4]:
# Thresholds on target-per-acre that delimit the 'low' and 'high' classes.
lower_bound, upper_bound = 500, 5000

# Target divided by the (preprocessed) 'Acre' feature, aligned on the index.
yield_by_acre = y_train.div(X_train['Acre'])

# Label each row: strictly above the upper bound -> 'high', strictly below the
# lower bound -> 'low', anything else (including non-comparable values) -> 'middle'.
choices = ['high', 'low']
conditions = [yield_by_acre.gt(upper_bound), yield_by_acre.lt(lower_bound)]
classes = np.select(condlist=conditions, choicelist=choices, default='middle')

In [5]:
# Target-per-acre for every training row (in file order), coloured by class.
# Titles are set explicitly so the figure is readable without the code above it.
fig = px.scatter(x=range(len(df_train)), y=yield_by_acre, color=classes)
fig.update_layout(
    title_text='Target per acre by row position',
    xaxis_title_text='Row position in df_train',
    yaxis_title_text='y_train / Acre',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [6]:
# Raw target for every training row (in file order), coloured by class.
# Titles are set explicitly so the figure is readable without the code above it.
fig = px.scatter(x=range(len(df_train)), y=y_train, color=classes)
fig.update_layout(
    title_text='Target by row position',
    xaxis_title_text='Row position in df_train',
    yaxis_title_text='y_train',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [7]:
# Raw target against the (preprocessed) 'Acre' feature, coloured by class.
# Titles are set explicitly so the figure is readable without the code above it.
fig = px.scatter(x=X_train['Acre'], y=y_train, color=classes)
fig.update_layout(
    title_text='Target versus Acre',
    xaxis_title_text='Acre',
    yaxis_title_text='y_train',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [8]:
# Show the preprocessed feature frame; the rich repr elides middle rows and
# columns with '...'. Several columns still contain NaN at this stage.
X_train

Unnamed: 0_level_0,CultLand,CropCultLand,CropTillageDepth,SeedlingsPerPit,TransplantingIrrigationHours,TransIrriCost,StandingWater,Ganaura,NoFertilizerAppln,BasalDAP,...,PCropSolidOrgFertAppMethodRootApplication,PCropSolidOrgFertAppMethodSoilApplied,PCropSolidOrgFertAppMethodSpray,MineralFertAppMethodRootApplication,MineralFertAppMethodSoilApplied,MineralFertAppMethod.1RootApplication,MineralFertAppMethod.1SoilApplied,Threshing_methodmachine,Stubble_useplowed_in_soil,Threshing_dateYear2023
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_GTFAC7PEVWQ9,45,40,5,2.0,5.0,200.0,2.0,,2,,...,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0
ID_TK40ARLSPOKS,26,26,5,2.0,5.0,125.0,3.0,,2,15.0,...,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0
ID_1FJY2CRIMLZZ,10,10,6,2.0,4.0,80.0,2.0,1.0,2,4.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
ID_I3IPXS4DB7NE,15,15,6,2.0,,,,1.0,2,6.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
ID_4T8YQWXWHB4A,60,60,4,2.0,9.0,300.0,2.0,,2,15.0,...,,,,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ID_7ZZQ6R4XB4FK,28,28,6,,7.0,360.0,5.0,1.0,2,18.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
ID_PVVDF6LK6FO8,5,5,5,1.0,3.0,,3.0,,3,6.0,...,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0
ID_RBYVUPRATVMW,50,45,5,1.0,6.0,120.0,5.0,1.0,1,,...,0.0,0.0,0.0,0.0,0.0,,,0.0,1.0,0.0
ID_ARE9QWENJNJ2,20,20,3,4.0,2.0,,9.0,,1,7.0,...,,,,0.0,0.0,,,0.0,1.0,0.0


In [9]:
# Integer code per row, numbered by the alphabetical order of the class labels
# that are present (the same numbering a one-hot + argmax would produce).
classes_num, _ = pd.factorize(classes, sort=True)

In [10]:
# Median imputation followed by a supervised 2-D UMAP embedding (class codes
# are passed as `y`), plotted with one colour per yield class.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)

# UMAP is stochastic: pin the seed so the embedding is identical on every
# Restart & Run All and comparable across imputers. (umap-learn runs
# single-threaded when a seed is set.)
emb_y_train = umap.UMAP(random_state=42).fit_transform(X_train_imputed, y=classes_num)

fig = px.scatter(x=emb_y_train[:, 0], y=emb_y_train[:, 1], color=classes)
fig.update_layout(
    title_text='Supervised UMAP embedding - SimpleImputer (median)',
    xaxis_title_text='UMAP 1',
    yaxis_title_text='UMAP 2',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [11]:
# KNN imputation (default settings) followed by a supervised 2-D UMAP
# embedding (class codes are passed as `y`), plotted with one colour per class.
imputer = KNNImputer()
X_train_imputed = imputer.fit_transform(X_train)

# UMAP is stochastic: pin the seed so the embedding is identical on every
# Restart & Run All and comparable across imputers. (umap-learn runs
# single-threaded when a seed is set.)
emb_y_train = umap.UMAP(random_state=42).fit_transform(X_train_imputed, y=classes_num)

fig = px.scatter(x=emb_y_train[:, 0], y=emb_y_train[:, 1], color=classes)
fig.update_layout(
    title_text='Supervised UMAP embedding - KNNImputer',
    xaxis_title_text='UMAP 1',
    yaxis_title_text='UMAP 2',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [12]:
# Iterative (model-based) imputation with default settings, followed by a
# supervised 2-D UMAP embedding (class codes are passed as `y`), plotted with
# one colour per class.
imputer = IterativeImputer()
X_train_imputed = imputer.fit_transform(X_train)

# UMAP is stochastic: pin the seed so the embedding is identical on every
# Restart & Run All and comparable across imputers. (umap-learn runs
# single-threaded when a seed is set.)
emb_y_train = umap.UMAP(random_state=42).fit_transform(X_train_imputed, y=classes_num)

fig = px.scatter(x=emb_y_train[:, 0], y=emb_y_train[:, 1], color=classes)
fig.update_layout(
    title_text='Supervised UMAP embedding - IterativeImputer',
    xaxis_title_text='UMAP 1',
    yaxis_title_text='UMAP 2',
    legend_title_text='Yield-per-acre class',
)
fig.show()

In [13]:
# One feature sub-frame per yield class, selected with a boolean row mask.
X_high, X_middle, X_low = (
    X_train.loc[classes == label] for label in ('high', 'middle', 'low')
)

In [14]:
# Share of missing cells in X_high, as a percentage of all its cells
# (rows x columns).
X_high.isna().to_numpy().sum() / X_high.size * 100

2.591922845087402

In [15]:
# Share of missing cells in X_middle, as a percentage of all its cells
# (rows x columns).
X_middle.isna().to_numpy().sum() / X_middle.size * 100

7.0887438514259244

In [16]:
# Share of missing cells in X_low, as a percentage of all its cells
# (rows x columns).
X_low.isna().to_numpy().sum() / X_low.size * 100

1.037403635671457