# Импорт библиотек

In [55]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Чтение датасета

In [56]:
df = pd.read_csv('data/AirQualityUCI.csv', sep=';')

In [57]:
df.sample(10)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
7084,30/12/2004,22.00.00,-200.0,882.0,-200.0,31.0,660.0,-200.0,991.0,-200.0,922.0,727.0,80.0,476.0,5113.0,,
7635,22/01/2005,21.00.00,27.0,1316.0,-200.0,211.0,1327.0,794.0,549.0,224.0,1328.0,1613.0,97.0,319.0,3848.0,,
7614,22/01/2005,00.00.00,26.0,1535.0,-200.0,201.0,1300.0,977.0,483.0,223.0,1524.0,1725.0,73.0,768.0,7904.0,,
718,09/04/2004,16.00.00,36.0,-200.0,357.0,-2000.0,-200.0,214.0,-200.0,139.0,-200.0,-200.0,-200.0,-200.0,-200.0,,
6758,17/12/2004,08.00.00,41.0,-200.0,-200.0,-2000.0,-200.0,854.0,-200.0,137.0,-200.0,-200.0,-200.0,-200.0,-200.0,,
5195,13/10/2004,05.00.00,-200.0,726.0,-200.0,7.0,466.0,-200.0,1485.0,-200.0,1066.0,408.0,143.0,584.0,9501.0,,
6449,04/12/2004,11.00.00,15.0,1015.0,-200.0,65.0,835.0,260.0,796.0,94.0,1263.0,911.0,124.0,801.0,11462.0,,
9414,,,,,,,,,,,,,,,,,
8825,13/03/2005,11.00.00,10.0,1283.0,-200.0,77.0,887.0,362.0,607.0,160.0,1238.0,1231.0,135.0,516.0,7931.0,,
3053,15/07/2004,23.00.00,19.0,1010.0,-200.0,108.0,1006.0,102.0,779.0,100.0,1589.0,1000.0,245.0,357.0,10807.0,,


# Исследование датасета

In [58]:
df.shape

(9471, 17)

In [59]:
df.isna().sum()

Date              114
Time              114
CO(GT)            114
PT08.S1(CO)       114
NMHC(GT)          114
C6H6(GT)          114
PT08.S2(NMHC)     114
NOx(GT)           114
PT08.S3(NOx)      114
NO2(GT)           114
PT08.S4(NO2)      114
PT08.S5(O3)       114
T                 114
RH                114
AH                114
Unnamed: 15      9471
Unnamed: 16      9471
dtype: int64

У нас есть два пустых поля. Удалим их

In [60]:
df.drop(['Unnamed: 15', 'Unnamed: 16'], axis=1, inplace=True)

In [61]:
df = df.loc[:9356].copy(deep=True)

In [62]:
df.describe(include='all')

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
count,9357,9357,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0,9357.0
unique,391,24,104.0,,,408.0,,,,,,,437.0,754.0,6684.0
top,21/09/2004,18.00.00,-200.0,,,-2000.0,,,,,,,-200.0,-200.0,-200.0
freq,24,390,1592.0,,,366.0,,,,,,,366.0,366.0,366.0
mean,,,,1048.990061,-159.090093,,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,,,
std,,,,329.83271,139.789093,,342.333252,257.433866,321.993552,126.940455,467.210125,456.938184,,,
min,,,,-200.0,-200.0,,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,,,
25%,,,,921.0,-200.0,,711.0,50.0,637.0,53.0,1185.0,700.0,,,
50%,,,,1053.0,-200.0,,895.0,141.0,794.0,96.0,1446.0,942.0,,,
75%,,,,1221.0,-200.0,,1105.0,284.0,960.0,133.0,1662.0,1255.0,,,


In [63]:
df.dtypes

Date              object
Time              object
CO(GT)            object
PT08.S1(CO)      float64
NMHC(GT)         float64
C6H6(GT)          object
PT08.S2(NMHC)    float64
NOx(GT)          float64
PT08.S3(NOx)     float64
NO2(GT)          float64
PT08.S4(NO2)     float64
PT08.S5(O3)      float64
T                 object
RH                object
AH                object
dtype: object

Описание полей датасета
- `Date` (DD/MM/YYYY)
- `Time` (HH.MM.SS)
- True hourly averaged concentration `CO` in mg/m^3 (reference analyzer)
- `PT08.S1` (tin oxide) hourly averaged sensor response (nominally CO targeted)
- True hourly averaged overall `Non Metanic HydroCarbons` concentration in microg/m^3 (reference analyzer)
- True hourly averaged `Benzene` concentration in microg/m^3 (reference analyzer)
- `PT08.S2` (titania) hourly averaged sensor response (nominally NMHC targeted)
- True hourly averaged `NOx` concentration in ppb (reference analyzer)
- `PT08.S3` (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)
- True hourly averaged `NO2` concentration in microg/m^3 (reference analyzer)
- `PT08.S4` (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)
- `PT08.S5` (indium oxide) hourly averaged sensor response (nominally O3 targeted)
- `T` Temperature in Â°C
- `RH` Relative Humidity (%)
- `AH` Absolute Humidity

# Предобработка датасета

In [64]:
df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%Y")

In [70]:
df['Time'] = pd.to_datetime(df['Time'], format="%H.%M.%S").apply(lambda x: x.strftime("%H:%M:%S"))

# По сути в нашем датасете целевыми переменными могут высупать

In [6]:
df.sample(5)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
2381,17/06/2004,23.00.00,28,1177.0,-200.0,152,1156.0,147.0,684.0,120.0,1983.0,1310.0,246,458,13940,,
8465,26/02/2005,11.00.00,14,943.0,-200.0,36,688.0,179.0,924.0,116.0,909.0,629.0,96,435,5202,,
5138,10/10/2004,20.00.00,16,1105.0,-200.0,81,905.0,239.0,734.0,72.0,1576.0,950.0,212,677,16848,,
8452,25/02/2005,22.00.00,27,1207.0,-200.0,112,1023.0,473.0,629.0,188.0,1233.0,1373.0,62,624,5958,,
617,05/04/2004,11.00.00,3,1430.0,339.0,137,1107.0,228.0,713.0,121.0,1853.0,1409.0,166,552,10382,,


## Матрица корреляций

In [10]:
df.corr(numeric_only=True)

Unnamed: 0,PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3)
PT08.S1(CO),1.0,0.170007,0.933102,0.277993,0.087019,0.15403,0.845149,0.892434
NMHC(GT),0.170007,1.0,0.110104,-0.004427,0.048821,0.103307,0.16268,0.101185
PT08.S2(NMHC),0.933102,0.110104,1.0,0.331272,-0.073667,0.176488,0.874782,0.909905
NOx(GT),0.277993,-0.004427,0.331272,1.0,-0.436084,0.817139,0.035546,0.461889
PT08.S3(NOx),0.087019,0.048821,-0.073667,-0.436084,1.0,-0.256232,0.122734,-0.208865
NO2(GT),0.15403,0.103307,0.176488,0.817139,-0.256232,1.0,-0.022174,0.253439
PT08.S4(NO2),0.845149,0.16268,0.874782,0.035546,0.122734,-0.022174,1.0,0.72369
PT08.S5(O3),0.892434,0.101185,0.909905,0.461889,-0.208865,0.253439,0.72369,1.0
