In [25]:
import copy
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import pearsonr
from operator import itemgetter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import  KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler ,MinMaxScaler, PowerTransformer,QuantileTransformer

In [26]:
def merge(station_data, measurement_data):
    """Clean the station table and join it with the measurements.

    The station table is normalised (QoS typos, station-name spellings,
    ``revision`` converted to POSIX seconds, coordinates rounded to five
    decimals), reduced to the newest revision per coordinate pair, and then
    inner-joined with ``measurement_data`` on ``latitude``/``longitude``.

    Parameters
    ----------
    station_data : pandas.DataFrame
        Must provide the columns ``QoS``, ``revision``, ``latitude``,
        ``longitude`` and ``station``. The frame is not modified.
    measurement_data : pandas.DataFrame
        Must provide ``latitude`` and ``longitude``; it is joined as-is
        (its coordinates are not rounded here).

    Returns
    -------
    pandas.DataFrame
        One row per (station revision kept, measurement) pair that shares a
        coordinate pair. Station rows keep their original relative order.
    """
    # Work on a copy: the steps below assign columns, and a caller that passes
    # the same frame twice would otherwise get `revision` converted twice.
    stations = station_data.copy()

    stations["QoS"] = stations["QoS"].replace(
        {"accep": "acceptable", "maitennce": "maintenance"}
    )

    # Naive timestamps are interpreted as UTC by Timestamp.timestamp().
    stations['revision'] = stations['revision'].apply(
        lambda value: pd.Timestamp(value).timestamp()
    )

    stations['latitude'] = stations['latitude'].round(5)
    stations['longitude'] = stations['longitude'].round(5)

    # NOTE(review): several keys below look like a UTF-8 apostrophe that was
    # decoded with the wrong codec. They are kept exactly as found because they
    # have to match whatever read_csv yields - confirm against the CSV encoding.
    stations["station"] = stations["station"].replace(
        {
            "T‚Äôaebaek": "Taebaek",
            "'Ali Sabieh": "Ali Sabieh",
            "Oktyabr‚Äôskiy": "Oktyabrsk",
            "Roslavl‚Äô": "Roslavl",
            "Dyat‚Äôkovo": "Dyatkovo",
        }
    )

    # Keep only the newest revision (ties included) for every coordinate pair.
    # A transform-based mask avoids groupby.apply, whose handling of the
    # grouping columns is deprecated in recent pandas releases.
    newest_revision = stations.groupby(['latitude', 'longitude'])['revision'].transform('max')
    stations = stations[stations['revision'] == newest_revision]

    return pd.merge(stations, measurement_data, on=['latitude', 'longitude'], how='inner')

In [27]:
def split(data, test_size=None, random_state=None):
    """Split ``data`` into train/test features and the ``warning`` target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``warning`` column; every other column is a feature.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. ``None`` keeps scikit-learn's
        default split proportion.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the split random.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``data`` has no ``warning`` column.
    """
    feature_columns = list(data.columns)
    feature_columns.remove('warning')

    X_train, X_test, y_train, y_test = train_test_split(
        data[feature_columns],
        data['warning'],
        test_size=test_size,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test

In [28]:
def power_transform(X, column_name):
    """Overwrite one column of ``X`` with its Yeo-Johnson transformed values.

    A fresh ``PowerTransformer`` (``standardize=True``) is fitted on the
    column itself and the result is written back into ``X`` in place.
    Nothing is returned.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that holds the column; it is modified in place.
    column_name : str
        Name of the column to transform.
    """
    # scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
    column_matrix = X[column_name].values.reshape(-1, 1)

    yeo_johnson = PowerTransformer(method='yeo-johnson', standardize=True)
    X[column_name] = yeo_johnson.fit_transform(column_matrix)

In [29]:
def transform(X, y=None, pipeline=None):
    """Encode, de-skew, impute and scale a feature frame.

    Steps, in order:

    1. ``station``, ``code`` and ``QoS`` are label-encoded.
    2. If ``y`` is given it is attached as a ``warning`` column and rows
       without a ``warning`` value are dropped.
    3. Every non-passthrough column whose absolute skew exceeds 1.5 is run
       through ``power_transform``.
    4. The pipeline (KNN imputation -> quantile transform -> standard scaling
       on the non-passthrough columns) is applied.

    The intermediate frame (after step 3) is also written to ``output.csv``
    in the current working directory.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``QoS``, ``station``, ``code``, ``latitude``,
        ``longitude`` and ``revision``; all remaining columns are treated as
        numeric measurements. The frame is not modified.
    y : pandas.Series, optional
        Target values, aligned with ``X`` by index.
    pipeline : sklearn.pipeline.Pipeline, optional
        A pipeline returned by an earlier call. When omitted, a new pipeline
        is built and fitted on ``X``. A supplied pipeline is used as-is and is
        only fitted here if it turns out not to be fitted yet.

    Returns
    -------
    ``y`` omitted, ``pipeline`` omitted : ``(features, pipeline)``
    ``y`` omitted, ``pipeline`` given   : ``features``
    ``y`` given,   ``pipeline`` omitted : ``(features, warning, pipeline)``
    ``y`` given,   ``pipeline`` given   : ``(features, warning)``

    ``features`` is a new DataFrame with a fresh integer index whose columns
    are the passthrough columns followed by the measurement columns.

    Notes
    -----
    NOTE(review): the label encoders and the skew-triggered power transform
    are still fitted from scratch on every call, so train and test data can
    receive different encodings. Fixing that needs state this interface does
    not carry - consider moving both into the pipeline.
    """
    # sklearn is already a dependency of this notebook; imported locally so
    # this function stays self-contained.
    from sklearn.exceptions import NotFittedError

    build_pipeline = pipeline is None

    # Work on a copy: the steps below overwrite and add columns, and X is
    # typically a slice produced by train_test_split.
    X = X.copy()

    for categorical in ('station', 'code', 'QoS'):
        X[categorical] = LabelEncoder().fit_transform(X[categorical])

    passthrough_columns = ['QoS', 'station', 'code', 'latitude', 'longitude', 'revision']

    # Whatever is not a passthrough column is treated as a measurement.
    measurement_columns = list(X.columns)
    for name in passthrough_columns:
        measurement_columns.remove(name)

    if y is not None:  # labelled data: carry the target through the pipeline
        passthrough_columns.append('warning')
        X['warning'] = y
        # Rows without a label are of no use; X is our private copy here.
        X.dropna(subset=['warning'], inplace=True)

    output_columns = passthrough_columns + measurement_columns

    for name in measurement_columns:
        if abs(X[name].skew()) > 1.5:
            power_transform(X, name)

    if build_pipeline:
        numeric_steps = make_pipeline(
            KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean'),
            QuantileTransformer(output_distribution="normal", random_state=0, n_quantiles=1000),
            StandardScaler()
        )
        column_transformer = ColumnTransformer([
            ('passthrough', 'passthrough', passthrough_columns),
            ('num_transformer', numeric_steps, measurement_columns),
        ])
        pipeline = Pipeline([('column_transformer', column_transformer)])

    # Fit only a pipeline that has not been fitted yet. Re-fitting a supplied
    # pipeline would transform the held-out data with its own statistics
    # instead of those learned from the training data.
    needs_fit = build_pipeline
    if not needs_fit:
        try:
            transformed_values = pipeline.transform(X)
        except NotFittedError:
            needs_fit = True
    if needs_fit:
        pipeline.fit(X)
        transformed_values = pipeline.transform(X)

    transformed_df = pd.DataFrame(transformed_values, columns=output_columns)

    X.to_csv('output.csv', index=False)

    if y is None:
        if build_pipeline:
            return transformed_df, pipeline
        return transformed_df

    feature_columns = list(output_columns)
    feature_columns.remove('warning')
    if build_pipeline:
        return transformed_df[feature_columns], transformed_df['warning'], pipeline
    return transformed_df[feature_columns], transformed_df['warning']

In [30]:
# Load both tab-separated tables and join them on their coordinates.
d = merge(
    station_data=pd.read_csv("data/stations.csv", sep='\t'),
    measurement_data=pd.read_csv("data/measurements.csv", sep='\t'),
)
X_train, X_test, y_train, y_test = split(d)

# No pipeline is passed for the training part, so transform builds one and
# returns it as the third value.
X_train, y_train, pipeline_ = transform(X_train, y=y_train)

X_train.to_csv('transformed_data/X_train.csv', index=False)
y_train.to_csv('transformed_data/y_train.csv', index=False)

# The test part is run through the pipeline obtained from the training call.
X_test, y_test = transform(X_test, y=y_test, pipeline=pipeline_)

X_test.to_csv('transformed_data/X_test.csv', index=False)
y_test.to_csv('transformed_data/y_test.csv', index=False)