In [16]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from feature_engine.imputation import MeanMedianImputer

# Load dataset
# The CSV location can be overridden with the TIMESERIES_DATA_CSV environment
# variable so the notebook is not tied to one machine's Google Drive mount;
# when the variable is unset, the original absolute path is used unchanged.
DATA_CSV = os.environ.get(
    'TIMESERIES_DATA_CSV',
    '/Volumes/GoogleDrive/.shortcut-targets-by-id/1O_LCvBhBN7-B1UdZ58UM2h6cqnW4ZumD/JupyterNB-JC/00-data/timeseries_data-Temperature-Salinity.csv',
)
data = pd.read_csv(DATA_CSV)

In [18]:
# Display the column labels of the loaded frame
data.columns

Index(['id', 'obs_id', 'year_month', 'temperatureSurface',
       'temperature100_300', 'temperature300_400', 'temperature100_500',
       'temperatureMaxDepth', 'salinitySurface', 'salinity100_300',
       'salinity300_400', 'salinity100_500', 'salinityMaxDepth'],
      dtype='object')

In [None]:
# Hold out 30% of the rows as a test set (fixed seed for reproducibility).
# The feature matrix is `data` without the surface / max-depth salinity
# columns; the `id` column is passed as the target.
# NOTE(review): `id` is used as the target yet also stays in the features —
# confirm this is intended.
features = data.drop(columns=['salinitySurface', 'salinityMaxDepth'])
target = data['id']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)

# Median imputation for the two temperature columns listed below.
imputed_columns = ['temperature100_300', 'temperatureSurface']
median_imputer = MeanMedianImputer(
    imputation_method='median',
    variables=imputed_columns,
)

# Fit on the training split only, so the test split does not influence
# what the imputer learns.
median_imputer.fit(X_train)

# Apply the fitted imputer to both splits (train first, then test).
train_t, test_t = (
    median_imputer.transform(split) for split in (X_train, X_test)
)

# Compare the distribution of each imputed variable before and after median
# imputation. The previous version plotted `id`, which is not imputed, so the
# two curves were identical and carried the same legend label.
# Keep this list in sync with the `variables` passed to the imputer.
compared_columns = ['temperature100_300', 'temperatureSurface']

# squeeze=False keeps `axes` two-dimensional even if only one column is listed.
fig, axes = plt.subplots(
    1, len(compared_columns), figsize=(6 * len(compared_columns), 4), squeeze=False
)
for ax, column in zip(axes[0], compared_columns):
    # Before imputation: the raw training column.
    X_train[column].plot(kind='kde', ax=ax, label='original')
    # After imputation: the same column from the transformed training split.
    train_t[column].plot(kind='kde', ax=ax, color='red', label='median-imputed')
    ax.set_title(column)
    ax.set_xlabel(column)
    ax.legend(loc='best')
fig.tight_layout()