In [1]:
# Load data from https://www.worldweatheronline.com/developer/premium-api-explorer.aspx

#                                           IMPORTANT!!!
# API calls are limited, so this section may fail when it is run again; all data is already available in the data folder

import http.client
import os
import os.path
import urllib.request

# NOTE(review): the fallback value is the key that was committed with this
# notebook. Set the WWO_API_KEY environment variable to use your own key, and
# consider rotating/removing the committed one.
API_KEY = os.environ.get("WWO_API_KEY", "fafbe372085d40f6a00140815182310")
url = "http://api.worldweatheronline.com/premium/v1/past-weather.ashx?key=" + API_KEY + "&format=json&tp=24"

# Load surrounding city data, 4-4 cities from a smaller and a bigger circle around Budapest
cities = ['Budapest', 'Brno', 'Kosice', 'Timisoara', 'Zagreb', 'Nuremberg', 'Lublin', 'Bucharest', 'Sarajevo']
# Load as many years as possible from this site
years = ['2017', '2016', '2015', '2014', '2013', '2012', '2011', '2010', '2009', '2008']
# To make this specific task easier, take only data from September to December
# API calls are limited to 500 requests; this way it only needs 9*10*4=360 + current data for prediction
dates = [['09-01', '09-30'], ['10-01', '10-31'], ['11-01', '11-30'], ['12-01', '12-31']]

# A truncated response (IncompleteRead) is retried this many times before giving up
MAX_DOWNLOAD_ATTEMPTS = 3

raw_data_dir = os.path.join('data', 'raw_data')
os.makedirs(raw_data_dir, exist_ok=True)

# Download data from API for given cities, years and months
for city in cities:
    for year in years:
        print(city + ': ' + year)
        for start_day, end_day in dates:
            filename = os.path.join(raw_data_dir, city + '_' + year + '_' + start_day)
            # Files are only written after a complete read, so an existing
            # non-empty file is a finished download: skip it to save API quota
            # and to let an interrupted run resume where it stopped.
            if os.path.isfile(filename) and os.path.getsize(filename) > 0:
                continue

            citystring = '&q=' + city
            datestring = '&date=' + year + '-' + start_day + '&enddate=' + year + '-' + end_day

            for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
                try:
                    # The context manager closes the HTTP response even on errors
                    with urllib.request.urlopen(url + citystring + datestring) as page:
                        content = page.read()
                    break
                except http.client.IncompleteRead:
                    # Connection dropped mid-body; re-raise once attempts are exhausted
                    if attempt == MAX_DOWNLOAD_ATTEMPTS:
                        raise

            with open(filename, "wb") as f:
                f.write(content)

Budapest: 2017
Budapest: 2016
Budapest: 2015
Budapest: 2014
Budapest: 2013
Budapest: 2012
Budapest: 2011
Budapest: 2010
Budapest: 2009
Budapest: 2008
Brno: 2017
Brno: 2016
Brno: 2015
Brno: 2014
Brno: 2013
Brno: 2012
Brno: 2011
Brno: 2010
Brno: 2009
Brno: 2008
Kosice: 2017
Kosice: 2016
Kosice: 2015
Kosice: 2014
Kosice: 2013
Kosice: 2012
Kosice: 2011
Kosice: 2010
Kosice: 2009
Kosice: 2008
Timisoara: 2017
Timisoara: 2016
Timisoara: 2015
Timisoara: 2014
Timisoara: 2013
Timisoara: 2012
Timisoara: 2011
Timisoara: 2010
Timisoara: 2009
Timisoara: 2008
Zagreb: 2017
Zagreb: 2016
Zagreb: 2015


IncompleteRead: IncompleteRead(15719 bytes read, 11118 more expected)

In [4]:
import json
import pandas as pd

# Parse every raw JSON download and keep only the fields of interest.
# The monthly files of one (city, year) pair are merged into a single table;
# every city/year combination is still written to its own file.

# (output column, key inside the day's first 'hourly' entry), in output order
hourly_fields = [
    ('windspeed', 'windspeedKmph'),
    ('winddir16Point', 'winddir16Point'),
    ('weatherCode', 'weatherCode'),
    ('precipMM', 'precipMM'),
    ('humidity', 'humidity'),
    ('pressure', 'pressure'),
    ('cloudcover', 'cloudcover'),
    ('HeatIndexC', 'HeatIndexC'),
    ('WindChillC', 'WindChillC'),
    ('WindGustKmph', 'WindGustKmph'),
    ('winddirDegree', 'winddirDegree'),
]

for city in cities:
    for year in years:
        rows_list = []

        # September 1st is treated as day 244 of the year; the same starting
        # value is used for every year, so this is only an approximation.
        day_num = 244

        for month_start, _month_end in dates:
            filename = os.path.join('data', 'raw_data', city + '_' + year + '_' + month_start)
            with open(filename, 'rb') as jsonfile:
                data = json.load(jsonfile)

            # One output row per day in the response
            for day in data['data']['weather']:
                # Only the first hourly entry is read
                # (presumably the whole-day aggregate, given tp=24 -- TODO confirm)
                first_hourly = day['hourly'][0]
                row = {
                    'day': day_num,
                    'maxtemp': day['maxtempC'],
                    'mintemp': day['mintempC'],
                }
                for column, api_key in hourly_fields:
                    row[column] = first_hourly[api_key]
                rows_list.append(row)
                day_num += 1

        df = pd.DataFrame(rows_list)

        new_filename = os.path.join('data', 'preprocessed', city + '_' + year)
        df.to_csv(new_filename, sep=';', index=False)

# Print last city's last year's data
print(df.shape)
print(df.head())

(122, 14)
  HeatIndexC WindChillC WindGustKmph cloudcover  day humidity maxtemp mintemp  \
0         16         16            9          4  244       66      28      13   
1         17         16            9         28  245       70      25      15   
2         18         17           12         23  246       67      27      13   
3         19         18           16         11  247       60      29      15   
4         19         19           19         12  248       58      29      15   

  precipMM pressure weatherCode winddir16Point winddirDegree windspeed  
0      0.0     1018         113            WNW           292         5  
1      1.8     1016         176              E            90         5  
2      0.3     1015         116            WSW           249         6  
3      0.0     1015         116             SW           231         9  
4      0.0     1016         116             SW           228        11  


In [5]:
import numpy as np

# Show all columns. The fully qualified name is used because the abbreviated
# pattern "max_columns" can match more than one option in newer pandas
# releases, in which case set_option raises OptionError.
pd.set_option("display.max_columns", None)

# Features which looked nice but do not help much or just make learning too complicated.
# Kept per city: windspeed, precipMM, pressure, cloudcover, maxtemp, mintemp (and day once).
dropped_features = [
    'weatherCode',
    'winddir16Point',
    'humidity',
    'HeatIndexC',
    'WindChillC',
    'WindGustKmph',
    'winddirDegree',
]

training_dir = os.path.join('data', 'training_data')
os.makedirs(training_dir, exist_ok=True)

# Concat cities: one wide table per year, cities side by side
for year in years:
    city_frames = []
    for city in cities:
        filename = os.path.join('data', 'preprocessed', city + '_' + year)
        df = pd.read_csv(filename, sep=';').drop(columns=dropped_features)

        # Day feature should be concatenated only once: keep it from the first city
        if city_frames:
            df = df.drop(columns='day')
        city_frames.append(df)

    # Concatenate once per year instead of growing a frame inside the loop
    year_df = pd.concat(city_frames, axis=1, sort=False) if city_frames else pd.DataFrame()

    new_filename = os.path.join(training_dir, year)
    year_df.to_csv(new_filename, sep=';', index=False)

year_df.head()

Unnamed: 0,cloudcover,day,maxtemp,mintemp,precipMM,pressure,windspeed,cloudcover.1,maxtemp.1,mintemp.1,precipMM.1,pressure.1,windspeed.1,cloudcover.2,maxtemp.2,mintemp.2,precipMM.2,pressure.2,windspeed.2,cloudcover.3,maxtemp.3,mintemp.3,precipMM.3,pressure.3,windspeed.3,cloudcover.4,maxtemp.4,mintemp.4,precipMM.4,pressure.4,windspeed.4,cloudcover.5,maxtemp.5,mintemp.5,precipMM.5,pressure.5,windspeed.5,cloudcover.6,maxtemp.6,mintemp.6,precipMM.6,pressure.6,windspeed.6,cloudcover.7,maxtemp.7,mintemp.7,precipMM.7,pressure.7,windspeed.7,cloudcover.8,maxtemp.8,mintemp.8,precipMM.8,pressure.8,windspeed.8
0,28,244,25,17,0.0,1019,6,27,20,16,0.0,1018,12,15,24,16,0.0,1020,4,8,28,11,0.0,1019,6,27,25,13,0.0,1018,5,43,23,17,4.1,1016,9,6,24,13,0.0,1020,5,4,28,14,0.0,1021,6,4,28,13,0.0,1018,5
1,23,245,29,18,0.0,1016,5,21,24,19,0.0,1015,8,18,26,16,0.0,1017,6,14,29,15,0.0,1017,9,16,29,18,0.4,1016,4,31,24,14,0.0,1013,8,29,24,16,0.1,1015,9,4,29,15,0.0,1020,6,28,25,15,1.8,1016,5
2,19,246,29,19,0.2,1012,10,5,25,19,0.0,1011,11,8,27,17,0.0,1014,9,8,31,16,0.0,1015,8,11,29,18,0.4,1013,10,51,26,15,11.2,1008,12,10,27,17,0.0,1012,10,6,30,18,0.0,1017,8,23,27,13,0.3,1015,6
3,14,247,30,22,0.0,1011,13,26,24,20,0.7,1010,7,13,29,19,0.1,1012,12,12,32,19,0.0,1014,11,11,30,20,0.0,1012,13,80,20,14,17.3,1009,9,28,29,20,2.1,1010,11,4,31,18,0.0,1016,8,11,29,15,0.0,1015,9
4,15,248,32,22,0.0,1012,13,10,26,20,0.6,1010,8,19,28,20,0.0,1013,10,11,33,21,0.0,1014,11,14,31,20,0.0,1013,12,44,24,14,22.9,1007,10,37,28,19,0.9,1011,7,6,36,18,0.0,1017,4,12,29,15,0.0,1016,11


In [6]:
# Get current data
# Prediction will use 20 days to predict the next day, so 20 days is enough for now
# Doing this homework on 25th, October, so last data is available from 24th, October
current_data_dir = os.path.join('data', 'current_data')
os.makedirs(current_data_dir, exist_ok=True)

# Fixed 20 day window, identical for every city
datestring = '&date=2018-10-05&enddate=2018-10-24'

for city in cities:
    filename = os.path.join(current_data_dir, city + '.json')
    # The requested window is fixed, so an already downloaded file holds the
    # same data: skip it instead of spending another (limited) API request.
    if os.path.isfile(filename) and os.path.getsize(filename) > 0:
        continue

    citystring = '&q=' + city
    # Context managers close the HTTP response and the file even on errors
    with urllib.request.urlopen(url + citystring + datestring) as page:
        content = page.read()

    with open(filename, "wb") as f:
        f.write(content)

In [9]:
# Extract json, exactly the same way as on training data

# (output column, key inside the day's first 'hourly' entry), in output order
current_hourly_fields = [
    ('windspeed', 'windspeedKmph'),
    ('winddir16Point', 'winddir16Point'),
    ('weatherCode', 'weatherCode'),
    ('precipMM', 'precipMM'),
    ('humidity', 'humidity'),
    ('pressure', 'pressure'),
    ('cloudcover', 'cloudcover'),
    ('HeatIndexC', 'HeatIndexC'),
    ('WindChillC', 'WindChillC'),
    ('WindGustKmph', 'WindGustKmph'),
    ('winddirDegree', 'winddirDegree'),
]

# Columns removed again after extraction; windspeed, precipMM, pressure and
# cloudcover stay, together with day, maxtemp and mintemp.
unused_features = [
    'weatherCode',
    'winddir16Point',
    'humidity',
    'HeatIndexC',
    'WindChillC',
    'WindGustKmph',
    'winddirDegree',
]

for city in cities:
    filename = os.path.join('data', 'current_data', city + '.json')
    with open(filename, 'rb') as jsonfile:
        data = json.load(jsonfile)

    rows_list = []
    # Days are numbered from 278 (2018-10-05, first day of the 20 day window)
    for day_num, day in enumerate(data['data']['weather'], start=278):
        first_hourly = day['hourly'][0]  # only the first hourly entry is read
        row = {
            'day': day_num,
            'maxtemp': day['maxtempC'],
            'mintemp': day['mintempC'],
        }
        row.update((column, first_hourly[api_key]) for column, api_key in current_hourly_fields)
        rows_list.append(row)

    df = pd.DataFrame(rows_list).drop(columns=unused_features)

    # One preprocessed file per city
    new_filename = os.path.join('data', 'current_data', city + '_preprocessed')
    df.to_csv(new_filename, sep=';', index=False)

# Print last city's current data
print(df.shape)
df.head()

(20, 7)


Unnamed: 0,cloudcover,day,maxtemp,mintemp,precipMM,pressure,windspeed
0,15,278,24,8,0.0,1025,4
1,57,279,20,13,10.2,1020,7
2,61,280,20,12,5.0,1015,6
3,59,281,18,11,11.3,1019,4
4,34,282,20,11,0.0,1022,5


In [10]:
# Concat cities: put the current data of all cities side by side

# Show all columns. The fully qualified name is used because the abbreviated
# pattern "max_columns" can match more than one option in newer pandas
# releases, in which case set_option raises OptionError.
pd.set_option("display.max_columns", None)

city_frames = []
for city in cities:
    filename = os.path.join('data', 'current_data', city + '_preprocessed')
    df = pd.read_csv(filename, sep=';')

    # Day feature should be concatenated only once: keep it from the first city
    if city_frames:
        df = df.drop(columns='day')
    city_frames.append(df)

# Concatenate once instead of growing a frame inside the loop
current_days = pd.concat(city_frames, axis=1, sort=False) if city_frames else pd.DataFrame()

new_filename = os.path.join('data', 'current_data', 'concated')
current_days.to_csv(new_filename, sep=';', index=False)

current_days.head()

Unnamed: 0,cloudcover,day,maxtemp,mintemp,precipMM,pressure,windspeed,cloudcover.1,maxtemp.1,mintemp.1,precipMM.1,pressure.1,windspeed.1,cloudcover.2,maxtemp.2,mintemp.2,precipMM.2,pressure.2,windspeed.2,cloudcover.3,maxtemp.3,mintemp.3,precipMM.3,pressure.3,windspeed.3,cloudcover.4,maxtemp.4,mintemp.4,precipMM.4,pressure.4,windspeed.4,cloudcover.5,maxtemp.5,mintemp.5,precipMM.5,pressure.5,windspeed.5,cloudcover.6,maxtemp.6,mintemp.6,precipMM.6,pressure.6,windspeed.6,cloudcover.7,maxtemp.7,mintemp.7,precipMM.7,pressure.7,windspeed.7,cloudcover.8,maxtemp.8,mintemp.8,precipMM.8,pressure.8,windspeed.8
0,1,278,21,12,0.0,1028,7,1,17,7,0.0,1026,14,1,14,7,0.0,1029,5,0,20,9,0.0,1028,6,6,21,12,0.0,1026,4,0,20,8,0.0,1021,8,21,16,7,0.0,1027,13,2,21,11,0.0,1030,12,15,24,8,0.0,1025,4
1,13,279,23,12,0.0,1020,12,6,20,10,0.0,1017,21,6,17,7,0.0,1022,7,2,26,13,0.0,1022,10,32,23,12,0.0,1018,3,10,22,10,0.0,1013,8,3,19,8,0.0,1019,18,0,21,13,0.0,1026,10,57,20,13,10.2,1020,7
2,39,280,24,15,0.1,1015,7,48,20,13,0.0,1014,10,22,19,11,0.0,1016,7,9,26,14,0.0,1016,7,56,21,13,0.2,1014,4,35,22,12,0.5,1015,8,47,20,13,0.2,1014,17,5,23,12,0.0,1019,9,61,20,12,5.0,1015,6
3,30,281,24,15,0.0,1019,6,13,19,10,0.0,1022,12,34,18,12,1.5,1021,13,8,25,15,0.0,1018,8,64,22,14,1.2,1020,5,14,22,10,0.0,1021,6,27,14,7,0.0,1023,9,0,22,16,0.0,1019,6,59,18,11,11.3,1019,4
4,6,282,26,16,0.0,1023,5,4,21,10,0.0,1024,6,6,21,11,0.0,1024,5,5,24,15,0.0,1023,5,28,22,13,0.0,1023,5,10,22,12,0.0,1022,6,8,15,6,0.0,1026,7,2,23,15,0.0,1024,12,34,20,11,0.0,1022,5
