In [None]:
import os
# Show the kernel's current working directory; as the last expression of the
# cell its value is rendered as the cell output.
os.getcwd()

In [None]:
# Move up to the project root so that the `src` package and the relative
# `data/planttraits2024/...` paths used by the following cells resolve.
# The guard makes the cell idempotent: once the data directory is visible from
# the working directory, re-running the cell no longer climbs further up.
if not os.path.isdir('data/planttraits2024'):
    os.chdir('..')
os.getcwd()

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from src.preprocess_utils import drop_outliers, scale_variables
from src.constants import TARGETS, SD, ID
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr, zscore, normaltest
import matplotlib
def _format_float(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the fixed three-decimal formatter for every float pandas displays.
pd.set_option('display.float_format', _format_float)

In [None]:
def magnitude(x):
    """Return the signed power of ten that gives the order of magnitude of ``x``.

    The exponent is ``floor(log10(|x|))``, so every decade maps to its own
    bucket: 5 -> 1, 50 -> 10, 0.5 -> 0.1, 0.05 -> 0.01, -50 -> -10.

    Parameters
    ----------
    x : scalar number

    Returns
    -------
    ``sign(x) * 10 ** floor(log10(|x|))``; ``0.0`` when ``x`` is zero, and
    ``x`` itself when it is NaN.
    """
    # NaN has no magnitude; hand it back instead of failing on int(nan).
    if np.isnan(x):
        return x
    # log10(0) is -inf, which cannot be converted to an integer exponent.
    if x == 0:
        return 0.0
    # floor (not int-truncation): truncation rounds negative exponents toward
    # zero, which would merge (0.1, 1) with [1, 10) and shift every smaller
    # decade up by one.
    exponent = int(np.floor(np.log10(np.abs(x))))
    return np.sign(x) * 10 ** exponent

In [None]:
# Read the train and test tables from CSV, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/planttraits2024/train.csv',
        'data/planttraits2024/test.csv',
    )
)

In [None]:
# Every training column that is not a target, an SD column or the id column.
non_feature_columns = TARGETS + SD + [ID]
variables = train_df.columns[~train_df.columns.isin(non_feature_columns)]

In [None]:
# Group the variables by the first two underscore-separated tokens of their name
# and keep each distinct prefix once.
name_tokens = variables.str.split('_')
meta_variables = name_tokens.str[:2].str.join("_").unique()

In [None]:
# Build a lookup keyed by '<trait_ID>_mean' from the tab-separated metadata file.
# squeeze() collapses the indexed frame to a Series when only one column is
# left, so to_dict() yields a flat mapping — assumes the TSV has exactly two
# columns; TODO confirm.
target_name_meta = (
    pd.read_csv('data/planttraits2024/target_name_meta.tsv', delimiter='\t')
    .assign(trait_ID=lambda meta: meta['trait_ID'] + '_mean')
    .set_index('trait_ID')
    .squeeze()
    .to_dict()
)

In [None]:
# Natural log of every target column, displayed as the cell output.
train_df.loc[:, TARGETS].apply(np.log)

In [None]:
# Run scipy's normaltest on each target column (column-wise is apply's default).
train_df.loc[:, TARGETS].apply(normaltest)

In [None]:
# Normality test on the log-transformed targets.
# np.log is undefined for non-positive values (this notebook inspects rows with
# X4_mean < 0), and a single NaN/-inf makes normaltest return NaN for the whole
# column. Mask the non-positive entries and let normaltest skip the resulting
# NaNs so each column is tested on the values whose log exists.
positive_targets = train_df[TARGETS].where(lambda frame: frame > 0)
positive_targets.apply(np.log).apply(normaltest, nan_policy='omit')

In [None]:
# Column-wise z-scores of the whole training table (apply works per column by default).
train_zscore = train_df.apply(zscore)
# NOTE(review): the cell displays the raw train_df, not train_zscore — confirm this is intended.
train_df

In [None]:
# Display the lookup built from target_name_meta.tsv (keys are trait IDs
# suffixed with '_mean').
target_name_meta

In [None]:
# Export the rows whose X4_mean is negative for inspection outside the notebook.
negative_x4_rows = train_df.loc[train_df['X4_mean'] < 0]
negative_x4_rows.to_csv('csv.csv')

In [None]:
# For every target, count how many values fall into each power-of-ten bucket.
# NaNs are skipped by na_action='ignore' and therefore never reach magnitude().
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts;
# both use the same defaults (sorted by count, NaN dropped).
magnitude_counts = (
    train_df[TARGETS]
    .map(magnitude, na_action='ignore')
    .apply(pd.Series.value_counts)
)
magnitude_counts


In [None]:
# Element-wise natural log of the bucket counts, which makes sparsely populated
# magnitude buckets visible next to the dominant ones.
magnitude_counts.map(np.log)

In [None]:
# Accepted (lower, upper) bounds per target; both bounds are exclusive.
magnitude_ranges = {
    "X4_mean": (1e-5, 1e1),
    "X11_mean": (1e-1, 1e2),
    "X18_mean": (1e-2, 1e2),
    "X26_mean": (1e-2, 1e4),
    "X50_mean": (1e-1, 1e2),
    "X3112_mean": (1e0, 1e5),
}


def _within_expected_range(column):
    """Boolean Series: True where the value lies strictly inside the column's range."""
    lower, upper = magnitude_ranges[column.name]
    return column.between(lower, upper, inclusive='neither')


# True where a target value is inside its accepted range (NaN compares as False).
magnitude_mask = train_df[TARGETS].apply(_within_expected_range)

In [50]:
# Keep only the rows in which every target lies inside its accepted magnitude
# range (equivalent to "no target is out of range").
wo_magnitude_outliers = train_df[magnitude_mask.all(axis=1)]

# How many rows have 0, 1, 2, ... out-of-range targets. Kept as the last
# expression so the summary is actually displayed as the cell output.
(~magnitude_mask).sum(axis=1).value_counts()

In [None]:
# Default size for every figure created from here on.
matplotlib.rcParams['figure.figsize'] = [20, 8]

# One figure per target: bucket counts on the left, their natural log on the right.
for target in TARGETS:
    bucket_counts = magnitude_counts[target]
    fig, (count_ax, log_count_ax) = plt.subplots(1, 2)
    fig.suptitle(target)
    bucket_counts.plot(kind='bar', ax=count_ax)
    bucket_counts.apply(np.log).plot(kind='bar', ax=log_count_ax)
    fig.tight_layout()

In [None]:
# Rows with a negative X4_mean, annotated with how many of their columns have
# an absolute z-score above 2.5.
negative_x4 = train_df.X4_mean < 0

# Explicit copy: adding a column to a filtered slice of train_df triggers
# SettingWithCopyWarning and is not guaranteed to behave consistently.
new_df = train_df[negative_x4].copy()
new_df['num outliers'] = (train_zscore[negative_x4].abs() > 2.5).sum(axis=1)
new_df.to_csv('csv.csv')

In [None]:


# Second argument handed to drop_outliers — presumably a z-score threshold;
# TODO confirm against src.preprocess_utils.
z = 3
# drop_outliers is a project helper; its two return values are unpacked here.
# Judging by the names, the first is the training table without outlier rows
# and the second describes the removed outliers — verify against the helper.
wo_outliers_train_df, outliers = drop_outliers(train_df, z)

In [None]:
# Normality test on the log-transformed targets of the outlier-filtered table.
# np.log yields NaN/-inf for non-positive values, and normaltest propagates a
# single NaN to the whole column by default. Mask the non-positive entries and
# omit the resulting NaNs so the test runs on the values whose log exists.
positive_wo_outliers = wo_outliers_train_df[TARGETS].where(lambda frame: frame > 0)
positive_wo_outliers.apply(np.log).apply(normaltest, nan_policy='omit')

In [None]:
# Number of histogram bins shared by the distribution plots in the cells below.
bins = 250

In [None]:
# Trait to inspect in detail; change this to any other entry of TARGETS.
# (The original assignment had no right-hand side, which is a syntax error.)
target = TARGETS[0]

# One grid row per (dataset, log-scaled counts?) combination. The left column
# shows the raw values, the right column their natural log.
panels = [
    (train_df, False),
    (train_df, True),
    (wo_outliers_train_df, False),
    (wo_outliers_train_df, True),
    # NOTE(review): this row repeats the previous one exactly, as in the
    # original cell — confirm whether a different view was intended.
    (wo_outliers_train_df, True),
    (wo_magnitude_outliers, True),
]

# Size the grid from the panel list so that every row index exists; the
# original created 5 rows but drew into a sixth (ax[5, ...]), raising IndexError.
fig, ax = plt.subplots(len(panels), 2, figsize=(20, 4 * len(panels)))
fig.suptitle(target)

for row, (frame, log_counts) in enumerate(panels):
    frame[target].hist(ax=ax[row, 0], bins=bins, log=log_counts)
    frame[target].apply(np.log).hist(ax=ax[row, 1], bins=bins, log=log_counts)

plt.tight_layout()


In [None]:
# Linear-count histograms of every target over the full training set:
# raw values on the left, natural log of the values on the right.
for target in TARGETS:
    fig, (raw_ax, log_ax) = plt.subplots(1, 2)
    fig.suptitle(target)
    target_values = train_df[target]
    target_values.hist(ax=raw_ax, bins=bins)
    target_values.apply(np.log).hist(ax=log_ax, bins=bins)
    fig.tight_layout()
    

In [None]:
# Same histograms over the full training set, but with log-scaled counts so
# that thinly populated bins remain visible.
for target in TARGETS:
    fig, (raw_ax, log_ax) = plt.subplots(1, 2)
    fig.suptitle(target)
    target_values = train_df[target]
    target_values.hist(ax=raw_ax, bins=bins, log=True)
    target_values.apply(np.log).hist(ax=log_ax, bins=bins, log=True)
    fig.tight_layout()



In [None]:
# Linear-count histograms of every target in the table returned by
# drop_outliers: raw values on the left, natural log on the right.
for target in TARGETS:
    fig, (raw_ax, log_ax) = plt.subplots(1, 2)
    fig.suptitle(target)
    filtered_values = wo_outliers_train_df[target]
    filtered_values.hist(ax=raw_ax, bins=bins)
    filtered_values.apply(np.log).hist(ax=log_ax, bins=bins)
    fig.tight_layout()

In [None]:
# Histograms of every target in the table returned by drop_outliers, drawn
# with log-scaled counts: raw values on the left, natural log on the right.
for target in TARGETS:
    fig, (raw_ax, log_ax) = plt.subplots(1, 2)
    fig.suptitle(target)
    filtered_values = wo_outliers_train_df[target]
    filtered_values.hist(ax=raw_ax, bins=bins, log=True)
    filtered_values.apply(np.log).hist(ax=log_ax, bins=bins, log=True)
    fig.tight_layout()