In [2]:
import os
import sys
sys.path.append('..')

from src.constants import get_constants

import plotly.express as px
import pandas as pd
import numpy as np

from src.features.config import CYEConfigPreProcessor, CYEConfigTransformer
from src.features.preprocessing import CYEPreProcessor, CYETargetTransformer

from collections import Counter

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
# from tabgan.sampler import OriginalGenerator, GANGenerator, ForestDiffusionGenerator

In [5]:
cst = get_constants()

# Build the pre-processor from its config object (created with fillna=True).
config = CYEConfigPreProcessor(fillna=True)
processor = CYEPreProcessor(config=config)

# Training set: split off the target column, then fit the pre-processor on
# the remaining feature columns and transform them in one step.
df_train = pd.read_csv(cst.file_data_train, index_col='ID')
y_train = df_train[cst.target_column]
X_train = processor.fit_transform(df_train.drop(columns=cst.target_column))

# Test set: transformed only, re-using the pre-processor fitted on the training set.
X_test = processor.transform(pd.read_csv(cst.file_data_test, index_col='ID'))

In [23]:
def get_classes(X, y, lower_bound=500, upper_bound=5000):
    """Label each row 'low', 'middle' or 'high' from its yield-per-acre ratio.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain an 'Acre' column.
    y : pandas.Series
        Yield values, divided element-wise by ``X['Acre']``.
    lower_bound : float, default 500
        Ratios strictly below this value are labelled 'low'.
    upper_bound : float, default 5000
        Ratios strictly above this value are labelled 'high'.

    Returns
    -------
    tuple
        ``(yield_by_acre, classes)``: the ratio ``y / X['Acre']`` and a NumPy
        array of labels with one entry per row. Ratios equal to a bound, and
        any ratio matching neither condition, are labelled 'middle'.
    """
    yield_by_acre = y / X['Acre']
    # np.select takes the first matching condition, so 'high' is tested before 'low'.
    conditions = [yield_by_acre > upper_bound, yield_by_acre < lower_bound]
    classes = np.select(conditions, ['high', 'low'], default='middle')

    return yield_by_acre, classes
    
    
def plot_data(X, y, yield_by_acre, classes):
    """Show two class-coloured scatter plots and print the class counts.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame providing the row count and the 'Acre' column.
    y : array-like
        Yield values plotted against ``X['Acre']`` in the second figure.
    yield_by_acre : array-like
        Yield-per-acre ratio plotted against the row position in the first figure.
    classes : array-like
        One label per row; used to colour both figures and counted at the end.
    """
    # Figure 1: ratio against row position.
    ratio_fig = px.scatter(
        x=range(len(X)),
        y=yield_by_acre,
        color=classes,
        title='Ratio of Yield by Acre',
    )
    ratio_fig.update_layout(xaxis=dict(title='Index'), yaxis=dict(title='Yield by Acre'))
    ratio_fig.show()

    # Figure 2: yield against acreage.
    raw_fig = px.scatter(
        x=X['Acre'],
        y=y,
        color=classes,
        title='Original Yield by Acre',
    )
    raw_fig.update_layout(xaxis=dict(title='Acre'), yaxis=dict(title='Yield'))
    raw_fig.show()

    # Number of rows per class label.
    print(Counter(classes))

In [28]:
# Yield-per-acre ratio and class label for every training row.
yield_by_acre, classes = get_classes(X_train, y_train)

# Features and target side by side, plus a working copy of the 'Yield' column.
X_concat = pd.concat([X_train, y_train], axis=1)
X_concat['Yield_comp'] = X_concat['Yield']

plot_data(X_concat, y_train, yield_by_acre, classes)

Counter({'middle': 3716, 'low': 133, 'high': 21})


In [29]:
# Compensated yield: rows labelled 'low' are multiplied by 10 and rows labelled
# 'high' are divided by 10; every other row keeps its yield unchanged.
# The column is rebuilt from the untouched 'Yield' column each time, so
# re-running this cell gives the same result instead of applying the factors
# again to an already-rescaled 'Yield_comp'.
is_low = classes == 'low'
is_high = classes == 'high'
X_concat['Yield_comp'] = np.select(
    [is_low, is_high],
    [X_concat['Yield'] * 10, X_concat['Yield'] / 10],
    default=X_concat['Yield'],
)
yield_by_acre_comp = X_concat['Yield_comp'] / X_concat['Acre']

# Compensated ratio against row position.
fig = px.scatter(x=range(len(X_concat)), y=yield_by_acre_comp, color=classes, title='Ratio of Yield by Acre using a factor of 0.1 on High values and 10 on Low values')
fig.update_layout(
    xaxis={'title': 'Index'},
    yaxis={'title': 'Yield by Acre'},
)
fig.show()

# Compensated yield against acreage.
fig = px.scatter(X_concat, x='Acre', y='Yield_comp', color=classes, title='Yield by Acre using a factor of 0.1 on High values and 10 on Low values')
fig.update_yaxes({'title': 'Yield using factors'}).show()

# Original (uncompensated) yield against the frame index, for comparison.
px.scatter(X_concat, x=X_concat.index, y='Yield', color=classes).show()

In [5]:
# Classification view: the concatenated frame as input, the class labels as target.
# These are plain aliases (no copy is made).
X_train_cls = X_concat
y_train_cls = classes

# Regression view: the pre-processed features and the original target.
X_train_reg = X_train
y_train_reg = y_train

In [6]:
# NOTE(review): y_sm_cls is only assigned in the SMOTEENN resampling cell further
# down, so on a fresh kernel this cell raises NameError (see the recorded output).
# Run the resampling cell first, or move this check below it.
# NOTE(review): [2] indexes the value_counts() result by the key 2 — confirm that
# this is the intended lookup for the values y_sm_cls holds at that point.
y_sm_cls.value_counts()[2]

NameError: name 'y_sm_cls' is not defined

In [None]:
# NOTE(review): depends on y_sm_cls, which is first assigned in the SMOTEENN
# resampling cell below; this cell fails with NameError if run before it.
y_sm_cls.value_counts()

In [None]:
# Fixed seed so the SMOTE oversampling and the shuffle below give the same
# result on every run.
RANDOM_STATE = 42

# The SMOTE instance passed in carries its own random_state, so the seed is set
# on both the inner sampler and the combined SMOTEENN sampler.
sm = SMOTEENN(
    smote=SMOTE(k_neighbors=20, random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
)

# Resample X_train_cls against the labels in y_train_cls.
X_sm_cls, y_sm_cls = sm.fit_resample(X_train_cls, y_train_cls)

# Shuffle the rows, then split the target column back out of the resampled frame.
# From here on y_sm_cls holds the target column rather than the resampled labels.
X_sm_cls = X_sm_cls.sample(frac=1, random_state=RANDOM_STATE)
X_sm_cls, y_sm_cls = X_sm_cls.drop(columns=cst.target_column), X_sm_cls[cst.target_column]

# Re-derive ratio and labels on the resampled data and plot them.
yield_by_acre_cls, classes_cls = get_classes(X_sm_cls, y_sm_cls)
plot_data(X_sm_cls, y_sm_cls, yield_by_acre_cls, classes_cls)

In [None]:
# NOTE(review): GANGenerator is not imported anywhere in the visible notebook — the
# `from tabgan.sampler import ...` line in the imports cell is commented out, so
# that import has to be re-enabled before this cell can run.
# gen_x_times=1.2 is passed straight to the generator; presumably it controls how
# much synthetic data is produced relative to the input — TODO confirm in the tabgan docs.
generator = GANGenerator(gen_x_times=1.2)
# The target Series is wrapped in a DataFrame for the call; X_test is the third argument.
new_X_train, new_y_train = generator.generate_data_pipe(X_train, pd.DataFrame(y_train), X_test)

In [None]:
# Use dedicated names for the generated data (same idea as the `_cls` suffix used
# for the SMOTEENN output) so the `yield_by_acre` / `classes` computed for the
# original training rows are not overwritten by this cell.
yield_by_acre_new, classes_new = get_classes(new_X_train, new_y_train)
plot_data(new_X_train, new_y_train, yield_by_acre_new, classes_new)

In [None]:
_, classes = get_classes(X_train, y_train)

# Boolean row masks, computed once: the two tail classes vs. the 'middle' class.
is_tail = np.isin(classes, ['low', 'high'])
is_middle = classes == 'middle'

X_train_lh, y_train_lh = X_train[is_tail], y_train[is_tail]
X_train_middle, y_train_middle = X_train[is_middle], y_train[is_middle]

# Only the low/high rows are handed to the generator; the 'middle' rows are kept aside.
# NOTE(review): GANGenerator must be imported (tabgan.sampler) before this cell runs.
generator = GANGenerator(gen_x_times=50)
new_X_train, new_y_train = generator.generate_data_pipe(
    X_train_lh,
    pd.DataFrame(y_train_lh),
    X_test,
)

In [None]:
# Stack the generator output (new_X_train / new_y_train) on top of the
# 'middle' rows that were kept aside, row-wise.
X_train_gan = pd.concat([new_X_train, X_train_middle], axis='index')
y_train_gan = pd.concat([new_y_train, y_train_middle], axis='index')

# Re-derive ratio and labels on the combined data and plot them.
yield_by_acre, classes = get_classes(X_train_gan, y_train_gan)
plot_data(X_train_gan, y_train_gan, yield_by_acre, classes)

In [None]:
def get_classes(X, y, lower_bound=500, upper_bound=5000):
    """Label each row 'low', 'middle' or 'high' from its yield-per-acre ratio.

    NOTE: this definition replaces the earlier ``get_classes`` in this notebook,
    which returns ``(yield_by_acre, classes)``. Cells that unpack two values
    from ``get_classes`` must run before this cell is executed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; must contain an 'Acre' column.
    y : pandas.Series
        Yield values, divided element-wise by ``X['Acre']``.
    lower_bound : float, default 500
        Ratios strictly below this value are labelled 'low'.
    upper_bound : float, default 5000
        Ratios strictly above this value are labelled 'high'.

    Returns
    -------
    numpy.ndarray
        One label per row; rows matching neither condition are 'middle'.
    """
    yield_by_acre = y / X['Acre']
    # np.select takes the first matching condition, so 'high' is tested before 'low'.
    conditions = [yield_by_acre > upper_bound, yield_by_acre < lower_bound]
    classes = np.select(conditions, ['high', 'low'], default='middle')

    return classes

In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)

# Re-index both frames to 0..n-1 so they line up with the positional indices
# that skf.split() yields.
X_train = X_train.reset_index(drop=True)
yield_train = y_train.reset_index(drop=True)
# Relies on the get_classes definition that returns only the label array;
# the labels are what the folds are stratified on.
classes_train = get_classes(X_train, yield_train)

for train_idx, val_idx in skf.split(X_train, classes_train):
    # The loop previously read X_train_idx / y_train_idx, which are never defined;
    # select the fold rows from the re-indexed frames by position instead.
    X_train_k = X_train.iloc[train_idx]
    y_train_k = yield_train.iloc[train_idx]

    X_val_k = X_train.iloc[val_idx]
    y_val_k = yield_train.iloc[val_idx]