In [281]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Plot appearance: ggplot theme on a wide (15 x 5) default canvas.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (15, 5)})

# Widen the pandas display so frames with many columns are shown in full.
# (Original note: needed on pandas 0.12, not needed on pandas 0.13.)
pd.options.display.width = 5000
pd.options.display.max_columns = 60

# Weather data

In [282]:
# Raw weather observations; the file is semicolon-delimited.
data_frame = pd.read_csv(
    "MATF_Hackathon_2021/BA_2012-2021.csv",
    delimiter=";",
)

# Starts empty; later cells add the parsed date parts and selected columns.
clean_data = pd.DataFrame()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [283]:
# Preview the first rows of the raw "T" column.
data_frame["T"].head()

0    11.9
1    12.9
2    14.3
3    15.3
4    17.5
Name: T, dtype: float64

In [284]:
# Show every field of the first raw row to see which columns the file provides
# and how the "DateTime" strings are formatted.
data_frame.iloc[0]

DateTime                         31.03.2021 23:00
T                                            11.9
Po                                         1010.5
P                                          1022.5
Pa                                           -0.5
U                                              67
DD          Wind blowing from the north-northwest
Ff                                              1
ff10                                          NaN
ff3                                           NaN
N                                       no clouds
WW                                               
W1                                            NaN
W2                                            NaN
Tn                                            NaN
Tx                                            NaN
Cl                                            NaN
Nh                                            NaN
H                                             NaN
Cm                                            NaN


### Parse DateTime

In [285]:
def datetime2ymd(in_df, out_df):
    """Split year-first timestamp strings into integer date/time columns.

    Reads ``in_df["DateTime"]``, whose values are expected to look like
    ``"YYYY-MM-DD HH:MM"`` (fields are taken from fixed character positions),
    and writes the integer columns ``year``, ``month``, ``day``, ``hour`` and
    ``min`` into ``out_df``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame holding a ``"DateTime"`` column.
    out_df : pandas.DataFrame
        Frame that receives the new columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``out_df`` itself, so ``frame = datetime2ymd(src, frame)`` keeps the
        frame (same convention as ``datetime2dmy``).

    Raises
    ------
    ValueError
        If any sliced piece is not a valid integer literal, e.g. when the
        timestamps are actually day-first.
    """
    # Convert once instead of re-casting the column for every field.
    stamps = in_df["DateTime"].astype(str)

    out_df["year"]  = stamps.str[:4].astype(int)
    out_df["month"] = stamps.str[5:7].astype(int)
    out_df["day"]   = stamps.str[8:10].astype(int)
    out_df["hour"]  = stamps.str[11:13].astype(int)
    # Everything after the "HH:" part is read as the minutes.
    out_df["min"]   = stamps.str[14:].astype(int)

    return out_df
    
def datetime2dmy(in_df, out_df):
    """Split day-first timestamp strings into integer date/time columns.

    ``in_df["DateTime"]`` is read as text laid out like ``"DD.MM.YYYY HH:MM"``
    (fields are cut at fixed character positions). The columns ``day``,
    ``month``, ``year``, ``hour`` and ``min`` are written into ``out_df`` as
    integers, in that order, and ``out_df`` is returned.
    """
    stamp_text = in_df["DateTime"].astype(str)

    # (column name, start offset, stop offset); a stop of None means
    # "through the end of the string".
    field_layout = (
        ("day", None, 2),
        ("month", 3, 5),
        ("year", 6, 10),
        ("hour", 11, 13),
        ("min", 14, None),
    )
    for column, start, stop in field_layout:
        out_df[column] = stamp_text.str.slice(start, stop).astype(int)

    return out_df

In [286]:
# The weather file's timestamps are day-first (e.g. "31.03.2021 23:00"), so
# they have to go through the day-month-year parser; the year-first parser
# would try to read "31.0" as a year and raise ValueError.
clean_data = datetime2dmy(data_frame, clean_data)

In [287]:
# Raw weather columns carried over unchanged into the cleaned frame.
copy_columns = ["T", "Po", "U", "Ff", "Tn"]
clean_data[copy_columns] = data_frame.loc[:, copy_columns]


In [288]:
# Display the assembled weather frame: parsed date/time parts plus the copied
# measurement columns.
clean_data

Unnamed: 0,day,month,year,hour,min,T,Po,U,Ff,Tn
0,31,3,2021,23,0,11.9,1010.5,67,1.0,
1,31,3,2021,22,0,12.9,1010.9,62,1.0,
2,31,3,2021,21,0,14.3,1011.0,57,1.0,
3,31,3,2021,20,0,15.3,1011.0,53,1.0,
4,31,3,2021,19,0,17.5,1011.1,45,3.0,
...,...,...,...,...,...,...,...,...,...,...
78461,26,9,2012,12,0,29.0,1002.3,Wind blowing from the south,,
78462,26,9,2012,11,0,28.0,1002.3,Wind blowing from the south-west,,
78463,26,9,2012,10,0,26.0,1002.2,Wind blowing from the south-southwest,,
78464,26,9,2012,9,0,24.0,1002.1,Wind blowing from the south-southwest,,


# Target data

In [289]:
# Load the station measurements and give the timestamp column the same name
# ("DateTime") that the date-parsing helpers read.
target_df = (
    pd.read_csv("MATF_Hackathon_2021/SviPodaci_mStanica_BeogradNBG.csv")
    .rename(columns={"Datum_i_Vreme": "DateTime"})
)

# Starts empty; filled by the cleaning cell that follows.
clean_target = pd.DataFrame()

target_df.head()

Unnamed: 0,DateTime,MernaStanica,B,CO,NO2,O3,PM10,PM25,SO2
0,2012-06-11 16:00,Beograd Novi Beograd,0.269729,0.590997,63.094881,53.622829,23.109444,9.742659,2.789345
1,2015-03-09 08:00,Beograd Novi Beograd,2.042423,1.113447,19.640968,56.213282,26.856319,19.293327,84.947258
2,2010-05-19 09:00,Beograd Novi Beograd,0.778962,0.905177,26.346479,,22.899085,12.057213,2.703677
3,2010-07-17 00:00,Beograd Novi Beograd,0.686226,0.994739,31.578526,44.185095,29.241251,22.910294,8.567367
4,2018-05-01 00:00,Beograd Novi Beograd,,0.25,,43.6,,,12.9


In [290]:
# datetime2ymd writes the date/time columns straight into clean_target, so the
# frame is used as-is afterwards instead of being rebound to the call's result
# (rebinding would replace the frame with None whenever the helper returns
# nothing).
datetime2ymd(target_df, clean_target)

# Measurement columns carried over unchanged.
copy_columns = ["B", "CO", "NO2", "O3", "PM10", "PM25", "SO2"]
clean_target[copy_columns] = target_df[copy_columns]

# Order the rows chronologically.
clean_target = clean_target.sort_values(by=["year", "month", "day", "hour", "min"])

clean_target.head()

ValueError: invalid literal for int() with base 10: '2-'

## Data plotting

#### By Hour

In [None]:
# Rows from October through February, for the years 2012-2020.
month = clean_target.loc[
    clean_target["year"].isin(range(2012, 2021))
    & clean_target["month"].isin([10, 11, 12, 1, 2])
]

# One grouped pass gives the mean and sample standard deviation of SO2 for
# each hour of the day, instead of re-filtering the frame and reducing every
# column once per hour. Reindexing keeps all 24 hours (NaN where an hour has
# no rows) so the values line up with the x axis.
hours = list(range(24))
so2_by_hour = month.groupby("hour")["SO2"].agg(["mean", "std"]).reindex(hours)
polution_by_hour = so2_by_hour["mean"].tolist()
std_by_hour = so2_by_hour["std"].tolist()

plt.errorbar(hours, polution_by_hour, std_by_hour, linestyle="--")
plt.xlabel("Hour")
plt.ylabel("SO2 (mean ± std)");

#### By Month

In [None]:
# All rows for the years 2012-2020.
years = clean_target.loc[clean_target["year"].isin(range(2012, 2021))]

# One grouped pass gives the mean and sample standard deviation of SO2 for
# each calendar month, instead of re-filtering the frame and reducing every
# column once per month. Reindexing keeps all 12 months (NaN where a month
# has no rows) so the values line up with the x axis.
months = list(range(1, 13))
so2_by_month = years.groupby("month")["SO2"].agg(["mean", "std"]).reindex(months)
polution_by_month = so2_by_month["mean"].tolist()
std_by_month = so2_by_month["std"].tolist()

plt.errorbar(months, polution_by_month, std_by_month, linestyle="--")
plt.xlabel("Month")
plt.ylabel("SO2 (mean ± std)");

### Correlation

In [None]:
# Display the "T" column of the cleaned weather frame.
clean_data["T"]