In [1]:
import pandas as pd
import seaborn as sb

%matplotlib inline

In [2]:
pm_10 = pd.read_csv('opendata/pm10--2011-2015.csv')
pm_25 = pd.read_csv('opendata/pm-2-5--2011-2015.csv')
no2 = pd.read_csv('opendata/no2-sofia-2011-2015.csv')

# Data cleaning

In [3]:
pm_10[pm_10["Копитото"] < 0] = 0 

In [4]:
no2[no2["Копитото"] < 0] = 0
no2[no2["Хиподрума"] < 0] = 0

In [5]:
#Data aggregation
import math
areas = ["Дружба", "Павлово", "Младост", "Надежда", "Орлов мост", "Хиподрума", "Копитото"]

pm10_dict = pm_10.to_dict("records")
no2_dict = no2.to_dict("records")
pm25_dict = pm_25.to_dict("records")

data_aggregated = []

for d in pm10_dict:
    for area in areas:
        if math.isnan(d[area]):
            continue
        data_aggregated.append([area, d["Показател"], d[area], d["Дата"]])
        
for d in no2_dict:
    for area in areas:
        if math.isnan(d[area]) or d["Показател"] == 0:
            continue
        data_aggregated.append([area, d["Показател"], d[area], d["Дата"]])

for d in pm25_dict:
    if math.isnan(d["средноденонощна  стойност"]):
        continue
    else:
        data_aggregated.append(["Хиподрума", d["Показател"], d["средноденонощна  стойност"], d["Дата"]])

In [6]:
pd_aggregated = pd.DataFrame(data_aggregated, columns=['location', 'type', 'value', 'date'])

# Seasons

In [7]:
from datetime import date, datetime


Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [('winter', (datetime(Y,  1,  1),  datetime(Y,  3, 20))),
           ('spring', (datetime(Y,  3, 21),  datetime(Y,  6, 20))),
           ('summer', (datetime(Y,  6, 21),  datetime(Y,  9, 22))),
           ('autumn', (datetime(Y,  9, 23),  datetime(Y, 12, 20))),
           ('winter', (datetime(Y, 12, 21),  datetime(Y, 12, 31)))]

def get_season(now):
    if now == 0:
        return 'autumn'
    datetime_object = datetime.strptime(now, '%d.%m.%Y')

    datetime_object = datetime_object.replace(year=Y)
    return next(season for season, (start, end) in seasons
                if start <= datetime_object <= end)

pd_aggregated["season"] = pd_aggregated['date'].apply(get_season)

In [8]:
weather_raw = pd.read_csv("./data/sofia-weather-data.csv")

In [9]:
weather = weather_raw.dropna(axis=1, how='all')

In [10]:
weather.drop(axis=1,labels=['weather_icon'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [11]:
weather["temp_cent"] = weather["temp"] - 272.15
weather["temp_max_cent"] = weather["temp_max"] - 272.15
weather["temp_min_cent"] = weather["temp_min"] - 272.15
weather['date'] = weather['dt_iso'].apply(lambda x : str.split(x)[0])
# f = {'temp_cent':['min','mean', 'max'], 'pressure':['mean'], 'humidity': ['mean'], 'wind_speed': ['min', 'mean', 'max']}

processed_weather = weather.groupby('date', as_index=False).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [12]:
dates = pd.DataFrame(weather['date'].unique(), columns=["date"])

In [13]:
processed_weather['date_obj'] = processed_weather['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
processed_weather['date_format'] = processed_weather['date_obj'].apply(lambda x:datetime.strftime(x, '%d.%m.%Y'))


In [19]:
merged_with_date = pd.merge(processed_weather, pd_aggregated, left_on='date_format', right_on="date")

In [20]:
merged_with_date = merged_with_date.drop(axis=1, labels=['date_x', 'temp', 'dt', 'city_id', 'wind_deg', 'rain_1h', 'rain_3h',
       'rain_today', 'weather_id', 'date_obj', 'date_y'])

In [21]:
print(merged_with_date.columns)
print(merged_with_date.head())
print(merged_with_date.describe())

Index(['temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'rain_24h', 'snow_3h', 'clouds_all', 'temp_cent', 'date_format',
       'location', 'type', 'value', 'season'],
      dtype='object')
   temp_min  temp_max  pressure  humidity  wind_speed  rain_24h  snow_3h  \
0     296.3    303.15    1021.0      20.0         6.0       NaN      NaN   
1     296.3    303.15    1021.0      20.0         6.0       NaN      NaN   
2     296.3    303.15    1021.0      20.0         6.0       NaN      NaN   
3     296.3    303.15    1021.0      20.0         6.0       NaN      NaN   
4     296.3    303.15    1021.0      20.0         6.0       NaN      NaN   

   clouds_all  temp_cent date_format    location  type      value  season  
0         0.0       31.0  01.10.2012      Дружба  PM10  30.525970  autumn  
1         0.0       31.0  01.10.2012     Павлово  PM10  39.665600  autumn  
2         0.0       31.0  01.10.2012     Надежда  PM10  33.634472  autumn  
3         0.0       31.0  01.

# Shift weather one day forwars - for predictions

In [17]:
merged_with_date["temp_shifted"] = merged_with_date.temp_cent.shift(1)
merged_with_date["temp_min_shifted"] = merged_with_date.temp_min.shift(1)
merged_with_date["temp_max_shifted"] = merged_with_date.temp_max.shift(1)

#TODO