# Fáza 2 - Predspracovanie údajov

__Autori:__ Dávid Penťa, Samuel Bernát
__Percentuálny podiel práce:__ 50% / 50%

V tejto fáze sa od Vás očakáva že realizujte predspracovanie údajov pre strojové učenie. Výsledkom bude upravená dátová sada (csv alebo tsv), kde jedno pozorovanie je opísané jedným riadkom.
- scikit-learn vie len numerické dáta, takže treba niečo spraviť s nenumerickými dátami.
- Replikovateľnosť predspracovania na trénovacej a testovacej množine dát, aby ste mohli
zopakovať predspracovanie viackrát podľa Vašej potreby (iteratívne).

Keď sa predspracovaním mohol zmeniť tvar a charakteristiky dát, je možné že treba realizovať EDA opakovane podľa Vašej potreby. Bodovanie znovu (EDA) nebudeme, zmeny ale dokumentujte. Problém s dátami môžete riešiť iteratívne v každej fáze aj vo všetkých fázach podľa potreby.

In [103]:
import statsmodels.api as sm
import dateparser as dateparser
import matplotlib
import matplotlib.pylab as pylab
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.stats.api as sms
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import pearsonr
import math
from matplotlib import pyplot
from numpy import exp
from numpy.random import randn
from sklearn.preprocessing import PowerTransformer
from operator import itemgetter
from sklearn.svm import SVC
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

from sklearn.impute import  KNNImputer
from sklearn.compose import ColumnTransformer

pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [104]:
measurements_file = "data/measurements.csv"
measurements_data = pd.read_csv(measurements_file, sep='\t')

stations_file = "data/stations.csv"
stations_data = pd.read_csv(stations_file, sep='\t')

## Integrácia a čistenie dát
Transformujte dáta na vhodný formát pre strojové učenie t.j. jedno pozorovanie musí byť opísané jedným riadkom a každý atribút musí byť v numerickom formáte.
- Pri riešení chýbajúcich hodnôt (missing values) vyskúšajte rôzne stratégie ako napr.
    - odstránenie pozorovaní s chýbajúcimi údajmi
    - nahradenie chýbajúcej hodnoty mediánom, priemerom, pomerom (ku korelovanému atribútu), alebo pomocou lineárnej regresie resp. kNN
- Podobne postupujte aj pri riešení vychýlených hodnôt (outlier detection):
    - odstránenie vychýlených (odľahlých) pozorovaní
    - nahradenie vychýlenej hodnoty hraničnými hodnotami rozdelenia (5% resp. 95%)

In [105]:
measurements_data.dropna(inplace=True)
stations_data.dropna(inplace=True)

stations_data["QoS"] = np.where(stations_data["QoS"] == "accep", "acceptable", stations_data["QoS"])
stations_data["QoS"] = np.where(stations_data["QoS"] == "maitennce", "maintenance", stations_data["QoS"])

# stations_data['revision'] = stations_data['revision'].apply(lambda x: pd.Timestamp(x).strftime('%B-%d-%Y'))
# stations_data['revision_timestamp'] = stations_data['revision'].apply(lambda x: pd.Timestamp(x).timestamp())
stations_data['revision'] = stations_data['revision'].apply(lambda x: pd.Timestamp(x).timestamp())

stations_data['latitude'] = stations_data['latitude'].round(5)
stations_data['longitude'] = stations_data['longitude'].round(5)

stations_data["station"] = np.where(stations_data["station"] == "T‚Äôaebaek", "Taebaek", stations_data["station"])
stations_data["station"] = np.where(stations_data["station"] == "'Ali Sabieh", "Ali Sabieh", stations_data["station"])
stations_data["station"] = np.where(stations_data["station"] == "Oktyabr‚Äôskiy", "Oktyabrsk", stations_data["station"])
stations_data["station"] = np.where(stations_data["station"] == "Roslavl‚Äô", "Roslavl", stations_data["station"])
stations_data["station"] = np.where(stations_data["station"] == "Dyat‚Äôkovo", "Dyatkovo", stations_data["station"])

stations_data

Unnamed: 0,QoS,station,code,latitude,longitude,revision
0,good,Casa Blanca,MX,19.042,-98.119,1574812800.000
1,building,Mikhaylovka,RU,50.060,43.238,1501027200.000
2,building,Shahre Jadide Andisheh,IR,35.680,51.019,1370044800.000
3,building,Aracaju,BR,-10.911,-37.072,1356739200.000
4,maintenance,Parola,IN,20.881,75.119,1562284800.000
...,...,...,...,...,...,...
1105,building,Tadmur,SY,34.562,38.284,1373414400.000
1106,average,Jizzax,UZ,40.116,67.842,1369872000.000
1107,acceptable,West Chester,US,39.961,-75.608,1351987200.000
1108,acceptable,Oktyabrsk,RU,54.481,53.471,1429315200.000


In [106]:
stations_data = stations_data.groupby(by='station').apply(lambda x: x.loc[x['revision']==x['revision'].max()])

## Spojenie tabuliek

In [87]:
df=pd.merge(stations_data, measurements_data, on=['latitude', 'longitude'], how='inner')

df

Unnamed: 0,QoS,station,code,latitude,longitude,revision,PM10,CO,Pb,C2H3NO5,...,O3,TEMP,NOx,SO2,NH3,CH4,PRES,PM2.5,warning,PAHs
0,maintenance,Aberdeen,GB,57.144,-2.098,1545523200.000,8.940,8.154,51.710,1.070,...,7.592,6.608,7.424,8.528,10.371,8.041,1113.510,8.454,0.000,7.478
1,maintenance,Aberdeen,GB,57.144,-2.098,1545523200.000,9.927,7.481,50.207,2.649,...,7.631,6.164,8.415,8.105,8.641,5.756,1141.891,8.989,0.000,5.313
2,maintenance,Aberdeen,GB,57.144,-2.098,1545523200.000,7.709,7.947,30.489,0.466,...,9.905,7.456,6.882,8.671,7.164,8.022,1141.321,8.993,0.000,5.606
3,maintenance,Aberdeen,GB,57.144,-2.098,1545523200.000,8.793,7.957,41.969,0.918,...,7.433,11.979,9.633,8.683,8.311,8.639,1051.385,10.077,0.000,6.808
4,maintenance,Aberdeen,GB,57.144,-2.098,1545523200.000,8.198,7.872,25.720,0.594,...,5.302,2.991,8.131,9.956,5.980,6.373,1153.656,9.255,0.000,7.144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11339,excellent,les Escaldes,AD,42.507,1.534,1418947200.000,8.894,7.757,43.951,1.056,...,7.899,13.318,7.973,6.952,9.157,6.947,1096.632,7.715,1.000,9.123
11340,excellent,les Escaldes,AD,42.507,1.534,1418947200.000,7.152,6.648,56.672,0.268,...,5.927,2.833,8.147,5.883,4.177,6.400,1181.067,10.331,0.000,8.772
11341,excellent,les Escaldes,AD,42.507,1.534,1418947200.000,8.657,5.704,61.589,0.788,...,9.562,4.477,6.676,8.315,9.233,7.915,1077.811,9.005,0.000,8.225
11342,excellent,les Escaldes,AD,42.507,1.534,1418947200.000,6.270,4.976,49.210,0.258,...,6.137,3.798,8.252,9.356,4.622,6.273,1081.732,9.740,1.000,9.263


### Opis každého atribútu numerickým formátom

In [88]:
df.loc[df.QoS == "excellent", "QoS"] = "1"
df.loc[df.QoS == "good", "QoS"] = "2"
df.loc[df.QoS == "average", "QoS"] = "3"
df.loc[df.QoS == "acceptable", "QoS"] = "4"
df.loc[df.QoS == "building", "QoS"] = "5"
df.loc[df.QoS == "maintenance", "QoS"] = "6"

df[['QoS']] = df[['QoS']].apply(pd.to_numeric)

In [89]:
le = LabelEncoder()
le.fit(df['station'])
df['station'] = le.transform(df['station'])

le.fit(df['code'])
df['code'] = le.transform(df['code'])

df

Unnamed: 0,QoS,station,code,latitude,longitude,revision,PM10,CO,Pb,C2H3NO5,...,O3,TEMP,NOx,SO2,NH3,CH4,PRES,PM2.5,warning,PAHs
0,6,0,37,57.144,-2.098,1545523200.000,8.940,8.154,51.710,1.070,...,7.592,6.608,7.424,8.528,10.371,8.041,1113.510,8.454,0.000,7.478
1,6,0,37,57.144,-2.098,1545523200.000,9.927,7.481,50.207,2.649,...,7.631,6.164,8.415,8.105,8.641,5.756,1141.891,8.989,0.000,5.313
2,6,0,37,57.144,-2.098,1545523200.000,7.709,7.947,30.489,0.466,...,9.905,7.456,6.882,8.671,7.164,8.022,1141.321,8.993,0.000,5.606
3,6,0,37,57.144,-2.098,1545523200.000,8.793,7.957,41.969,0.918,...,7.433,11.979,9.633,8.683,8.311,8.639,1051.385,10.077,0.000,6.808
4,6,0,37,57.144,-2.098,1545523200.000,8.198,7.872,25.720,0.594,...,5.302,2.991,8.131,9.956,5.980,6.373,1153.656,9.255,0.000,7.144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11339,1,660,0,42.507,1.534,1418947200.000,8.894,7.757,43.951,1.056,...,7.899,13.318,7.973,6.952,9.157,6.947,1096.632,7.715,1.000,9.123
11340,1,660,0,42.507,1.534,1418947200.000,7.152,6.648,56.672,0.268,...,5.927,2.833,8.147,5.883,4.177,6.400,1181.067,10.331,0.000,8.772
11341,1,660,0,42.507,1.534,1418947200.000,8.657,5.704,61.589,0.788,...,9.562,4.477,6.676,8.315,9.233,7.915,1077.811,9.005,0.000,8.225
11342,1,660,0,42.507,1.534,1418947200.000,6.270,4.976,49.210,0.258,...,6.137,3.798,8.252,9.356,4.622,6.273,1081.732,9.740,1.000,9.263


### Uloženie upravenej dátovej sady do .csv súboru

In [90]:
# output = df[['station_ID', 'code_ID', 'QoS_ID', 'warning', 'latitude', 'longitude', 'revision','PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']]
#
# output.to_csv('output.csv', index=False)

### Power Transformer

In [91]:
def power_transform(column_name):

    sns.histplot(data=df, hue='warning', x=column_name, fill=True, kde=True)
    plt.show()

    data = df[column_name].values
    data = data.reshape((len(data),1))


    data_trans = PowerTransformer(method='yeo-johnson', standardize=True).fit_transform(data)
    df[column_name] = data_trans

    sns.histplot(data=df, hue='warning', x=column_name, fill=True, kde=True)
    plt.show()

def transform(column_name):

    sns.histplot(data=df, hue='warning', x=column_name, fill=True, kde=True)
    plt.show()

    data = df[column_name].values
    data = data.reshape((len(data),1))

    power = StandardScaler()
    data_trans = power.fit_transform(data)
    df[column_name] = data_trans

    sns.histplot(data=df, hue='warning', x=column_name, fill=True, kde=True)
    plt.show()

In [92]:
# power_transform('C2H3NO5')
#
# variables_arr = ['PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']
#
# for i in variables_arr:
#     print(i)
#     transform(i)

#
# def power(data):
#     variables_arr = ['PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']
#
#     for i in variables_arr:
#         # print(i)
#         # transform(i)
#         if abs(data[i].skew()) > 1.5:
#             power_transform(i)

### Zoradenie podla p a zaroven r

In [93]:
arr = []

for i in list(df.columns.values):
    if i != 'QoS' and i != 'station' and i != 'code':
        (r, p) = pearsonr(df['warning'], df[i])
        if p < 0.05 and i != 'warning':
            arr.append([i, abs(r), p])

arr = sorted(arr, key=itemgetter(1))
a = []
for i in range(len(arr)):
    a.append([i + 1, arr[len(arr) - i - 1][0], arr[len(arr) - i - 1][1], arr[len(arr) - i - 1][2]])

for row in a:
    print("{: >0} {: >10} {:.5f} {:.5f}".format(*row))

1       PAHs 0.63745 0.00000
2       PM10 0.38176 0.00000
3        NH3 0.33896 0.00000
4        CH4 0.33534 0.00000
5    C2H3NO5 0.11375 0.00000
6      PM2.5 0.02611 0.00542
7       H2CO 0.01939 0.03889


In [94]:
def transforming_pipe(X):
    column_names = ['QoS', 'station', 'code', 'latitude', 'longitude', 'revision', 'PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']
    variables = ['PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']
    passthroughs = ['QoS', 'station', 'code', 'latitude', 'longitude', 'revision']

# , y = None
    # if y is not None:
    #     X['warning'] = y
    #     passthroughs.insert(0, 'warning')
    #     column_names.insert(0, 'warning')

    pipe_vars = make_pipeline(
                           # KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean'),
                           PowerTransformer(method='yeo-johnson', standardize=True),
                           Normalizer(),
                           StandardScaler())
    ct = ColumnTransformer([('passthrough', 'passthrough', passthroughs), ('num_transformer', pipe_vars, variables)])

    pipeline = Pipeline([('column_transformer', ct)])

    pipeline.fit(X)
    transformed_df = pd.DataFrame(pipeline.transform(X), columns=column_names).copy()

    return transformed_df

In [95]:
def clean_and_merge(station_data, measurement_data):

    # TODO: add KNN - Nan

    measurement_data.dropna(inplace=True)
    station_data.dropna(inplace=True)

    station_data["QoS"] = np.where(station_data["QoS"] == "accep", "acceptable", station_data["QoS"])
    station_data["QoS"] = np.where(station_data["QoS"] == "maitennce", "maintenance", station_data["QoS"])
    station_data['revision'] = station_data['revision'].apply(lambda x: pd.Timestamp(x).timestamp())
    station_data['latitude'] = station_data['latitude'].round(5)
    station_data['longitude'] = station_data['longitude'].round(5)
    station_data["station"] = np.where(station_data["station"] == "T‚Äôaebaek", "Taebaek", station_data["station"])
    station_data["station"] = np.where(station_data["station"] == "'Ali Sabieh", "Ali Sabieh", station_data["station"])
    station_data["station"] = np.where(station_data["station"] == "Oktyabr‚Äôskiy", "Oktyabrsk", station_data["station"])
    station_data["station"] = np.where(station_data["station"] == "Roslavl‚Äô", "Roslavl", station_data["station"])
    station_data["station"] = np.where(station_data["station"] == "Dyat‚Äôkovo", "Dyatkovo", station_data["station"])

    station_data = station_data.groupby(by='station').apply(lambda x: x.loc[x['revision']==x['revision'].max()])

    station_data.loc[station_data.QoS == "excellent", "QoS"] = "1"
    station_data.loc[station_data.QoS == "good", "QoS"] = "2"
    station_data.loc[station_data.QoS == "average", "QoS"] = "3"
    station_data.loc[station_data.QoS == "acceptable", "QoS"] = "4"
    station_data.loc[station_data.QoS == "building", "QoS"] = "5"
    station_data.loc[station_data.QoS == "maintenance", "QoS"] = "6"
    station_data[['QoS']] = station_data[['QoS']].apply(pd.to_numeric)

    le = LabelEncoder()
    le.fit(station_data['station'])
    station_data['station'] = le.transform(station_data['station'])

    le.fit(station_data['code'])
    station_data['code'] = le.transform(station_data['code'])

    merged_data = pd.merge(station_data, measurement_data, on=['latitude', 'longitude'], how='inner')

    return merged_data

In [96]:
def chose_informative_attributes_for_ml(X, y):
    info_attributes = []
    for column_name in list(X.columns.values):
        (r, p) = pearsonr(y, X[column_name])
        if p < 0.05:
            info_attributes.append([column_name, abs(r), p])

    array = sorted(info_attributes, key=itemgetter(1))

    sorted_info_attributes = []
    for i in range(len(array)):
        sorted_info_attributes.append([i + 1, array[len(array) - i - 1][0], array[len(array) - i - 1][1], array[len(array) - i - 1][2]])

    for row in sorted_info_attributes:
        print("{: >0} {: >10} {:.5f} {:.5f}".format(*row))



In [97]:
measurements_data2 = pd.read_csv(measurements_file, sep='\t')
stations_data2 = pd.read_csv(stations_file, sep='\t')

df3 = clean_and_merge(stations_data2, measurements_data2)

column_names = ['QoS', 'station', 'code', 'latitude', 'longitude', 'revision', 'PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']

X = df3[column_names]
y = df3['warning']

X_train, X_test, y_train, y_test = train_test_split(X, y)

d = transforming_pipe(X)
chose_informative_attributes_for_ml(d, y)

1       PAHs 0.66587 0.00000
2       PM10 0.40445 0.00000
3    C2H3NO5 0.37714 0.00000
4        NH3 0.34200 0.00000
5        CH4 0.32998 0.00000
6      PM2.5 0.03169 0.00074
7         O3 0.02062 0.02806
8       H2CO 0.01971 0.03583


In [98]:
# def all_displots(data):
#     for column_name in list(data.columns.values):
#         sns.displot(data[column_name], bins=20, kde=True)

In [99]:
# all_displots(d)

In [100]:
d

Unnamed: 0,QoS,station,code,latitude,longitude,revision,PAHs,PM10,CO,Pb,...,CFCs,H2CO,O3,TEMP,NOx,SO2,NH3,CH4,PRES,PM2.5
0,6.000,0.000,37.000,57.144,-2.098,1545523200.000,-0.272,0.413,0.354,0.734,...,2.867,-1.047,-0.082,-0.808,-0.329,0.712,1.624,0.568,-0.248,-0.492
1,6.000,0.000,37.000,57.144,-2.098,1545523200.000,-1.904,1.428,-0.391,0.857,...,0.464,-1.240,-0.094,-1.177,0.852,0.428,0.997,-0.974,0.485,-0.086
2,6.000,0.000,37.000,57.144,-2.098,1545523200.000,-1.747,-0.510,0.224,-1.267,...,-1.194,-0.927,1.494,-1.036,-1.202,1.207,-0.080,0.805,0.481,-0.084
3,6.000,0.000,37.000,57.144,-2.098,1545523200.000,-0.785,0.398,0.212,-0.014,...,-0.408,-0.571,-0.207,-0.385,2.267,1.091,0.693,1.131,-1.985,1.031
4,6.000,0.000,37.000,57.144,-2.098,1545523200.000,-0.492,-0.058,0.097,-1.390,...,1.361,-0.784,-1.307,-1.270,0.381,2.305,-0.755,-0.389,0.660,0.170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11339,1.000,660.000,0.000,42.507,1.534,1418947200.000,0.882,0.577,-0.033,0.217,...,2.350,-0.611,0.095,-0.275,0.298,-1.212,1.536,-0.032,-0.935,-1.696
11340,1.000,660.000,0.000,42.507,1.534,1418947200.000,0.388,-0.672,-1.052,1.094,...,-0.835,-0.248,-0.868,-1.143,0.354,-1.751,-1.610,-0.334,1.149,1.000
11341,1.000,660.000,0.000,42.507,1.534,1418947200.000,0.123,0.261,-2.209,1.672,...,0.222,-0.274,0.993,-1.133,-1.168,0.572,1.156,0.566,-1.127,-0.054
11342,1.000,660.000,0.000,42.507,1.534,1418947200.000,0.589,-1.086,-2.400,0.490,...,0.644,-0.187,-0.706,-0.968,0.416,1.359,-1.266,-0.375,-0.831,0.492


In [101]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

column_names = ['QoS', 'station', 'code', 'latitude', 'longitude', 'revision', 'PAHs', 'PM10', 'CO', 'Pb', 'C2H3NO5', 'CFCs', 'H2CO', 'O3', 'TEMP', 'NOx', 'SO2', 'NH3', 'CH4', 'PRES', 'PM2.5']
# X, y = d[column_names], d['warning']

# X = d[column_names]
# y = d['warning']

# X_train, X_test, y_train, y_test
# gamma=0.001, C=100., random_state=0
classif = OneVsRestClassifier(estimator=LinearRegression())
classif.fit(X_train, y_train) #.predict(X_train)

# X_train, X_test, y_train, y_test
# print(X_train)

classif.fit(X_train, y_train)

# display(classif.predict(X_test))
display(classif.score(X_test, y_test))
# display(X_train)
# display(X_test)
# display(y_train)
# display(y_test)
# d['warning'].value_counts()

# 0.9245416078984485

0.9255994358251057

In [102]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

NameError: name 'SimpleImputer' is not defined