In [None]:
# Print whatever docstring is bound to `__doc__` in the current namespace.
print(__doc__)

import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

# read and normalize the messy columns from weather data

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import datetime 

matplotlib.style.use('ggplot')

# use a large default figure size for every plot in this notebook
matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

# read data
stringname = "train"
weather1 = pd.read_csv('../data/raw/weather_dwd_' + stringname + '_set.csv')
weather2 = pd.read_csv('../data/raw/weather_uni_osnabrueck_' + stringname + '_set.csv')
weather2test = pd.read_csv('../data/raw/weather_uni_osnabrueck_test_set.csv')

# The weather information is spread over several files; consolidate it.
# weather1 is used for row labels 0..2158 (inclusive), weather2 from row label 537 onwards.
w1end = 2158
w2start = 537


def _to_month_day_year(iso_date):
    """Convert a 'YYYY-MM-DD' string to 'M/D/YYYY' without zero padding.

    Built from the parsed fields instead of strftime('%-m/%-d/%Y'), because the
    '%-m' / '%-d' directives are a platform-specific extension.
    """
    parsed = datetime.datetime.strptime(iso_date, "%Y-%m-%d")
    return '{}/{}/{}'.format(parsed.month, parsed.day, parsed.year)


# .ix is deprecated (pandas 0.20) and removed (pandas 1.0). Rows are selected by index
# label with .loc (inclusive end, as .ix did for the integer index), columns by
# position with .iloc.
weatherX1 = weather1.loc[:w1end].iloc[:, [0, 1, 3]]

# Work on a copy so that rewriting the 'date' column does not assign into a slice
# of weather2 (SettingWithCopyWarning).
weatherX2 = weather2.iloc[:, 0:7].copy()
weatherX2['date'] = weatherX2['date'].apply(_to_month_day_year)
weatherX2 = weatherX2.loc[w2start:].iloc[:, [0, 1, 4]]

# NOTE(review): unlike weather2, the dates of the test-set file are not reformatted
# here - confirm that this file already uses the M/D/YYYY format.
weather2test = weather2test.iloc[:, [0, 1, 4]]

weatherX1.columns = ['date', 'humidity', 'temperature']
weatherX2.columns = ['date', 'humidity', 'temperature']
weather2test.columns = ['date', 'humidity', 'temperature']
superWeather = pd.concat([weatherX1, weatherX2, weather2test])

# save consolidated weather to csv
#superWeather.to_csv("weather.csv", sep=',', index=False)

# load consolidated weather from csv
#superWeather = pd.read_csv("weather.csv", sep=',')

superWeather.plot()

In [None]:
# Show the last rows of the frame derived from weather1 (check of the row cut-off).
weatherX1.tail()

In [None]:
# load visitor data from nettebad and the good columns from weather data, and the cleaned up messy columns from weather data

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')

def combine_input_files(stringname):
    """Merge the Nettebad visitor data with two DWD weather columns.

    Reads ``../data/raw/nettebad_<stringname>_set.csv`` and
    ``../data/raw/weather_dwd_<stringname>_set.csv`` and joins them on the
    ``date`` column (inner join, the ``pd.merge`` default).

    The Uni Osnabrueck weather file is no longer read here: its content was
    loaded but never used by this function.

    Parameters
    ----------
    stringname : str
        Data-set name used in the file names, e.g. ``"train"`` or ``"test"``.

    Returns
    -------
    pandas.DataFrame
        The visitor columns followed by the weather columns found at
        positions 5 and 6 of the DWD file.
    """
    raw_dir = '../data/raw/'
    train_orig = pd.read_csv(raw_dir + 'nettebad_' + stringname + '_set.csv')
    weather1 = pd.read_csv(raw_dir + 'weather_dwd_' + stringname + '_set.csv')

    # Positional selection with .iloc (.ix is removed in pandas 1.0): columns 5 and 6
    # are the weather features, column 0 has to be the 'date' merge key.
    weatherX = weather1.iloc[:, 5:7].join(weather1.iloc[:, 0])

    return pd.merge(train_orig, weatherX, on='date')

# Enrich both data sets with the consolidated weather frame (join on "date"),
# then persist them as CSV next to the notebook.
traindata = combine_input_files("train").merge(superWeather, on="date")
testdata = combine_input_files("test").merge(superWeather, on="date")

for frame, csv_name in ((testdata, "testdata.csv"), (traindata, "traindata.csv")):
    frame.to_csv(csv_name, sep=',', index=False)

traindata.head()


In [None]:
# Preview the first rows of the merged test frame.
testdata.head()

In [None]:
# transform to feature vectors

import datetime 

def get_feature_vector(train):
    """Split the merged training frame into a feature frame and the target.

    Columns are addressed by position: column 0 holds the date as a
    ``%m/%d/%Y`` string, column 1 is returned as the target, columns 2-10 and
    13 onwards become features (columns 11 and 12 are left out). Two calendar
    features derived from the date are appended: ``Monat`` (month, 1-12) and
    ``Wochentag`` (ISO weekday, Monday=1 ... Sunday=7).

    Parameters
    ----------
    train : pandas.DataFrame
        Merged training data with the column layout described above.

    Returns
    -------
    list
        ``[Xre, y]`` - the feature frame and the target Series, both carrying
        the index of ``train``.
    """
    n_cols = train.shape[1]
    # Positional selection with .iloc (.ix is removed in pandas 1.0).
    feature_cols = list(range(2, min(11, n_cols))) + list(range(13, n_cols))
    Xre = train.iloc[:, feature_cols].copy()
    y = train.iloc[:, 1]

    # Iterate over the date values themselves rather than looking them up by label,
    # so the function also works when the index of `train` is not 0..n-1.
    parsed = [datetime.datetime.strptime(value, "%m/%d/%Y") for value in train.iloc[:, 0]]

    # Assigning plain lists is positional, so no index alignment is involved.
    Xre['Monat'] = [moment.month for moment in parsed]
    Xre['Wochentag'] = [moment.isoweekday() for moment in parsed]
    return [Xre, y]

# Build the training feature frame and the target, then preview the features.
X, y = get_feature_vector(traindata)

X.head()

In [None]:
# same for test set

def get_test_vector(train):
    """Build the feature frame for the test set (which has no target column).

    Columns are addressed by position: column 0 holds the date as a
    ``%m/%d/%Y`` string, columns 1-9 and 12 onwards become features (columns
    10 and 11 are left out). Two calendar features derived from the date are
    appended: ``Monat`` (month, 1-12) and ``Wochentag`` (ISO weekday,
    Monday=1 ... Sunday=7).

    Parameters
    ----------
    train : pandas.DataFrame
        Merged test data with the column layout described above.

    Returns
    -------
    list
        ``[Xre, datums]`` - the feature frame and the Series of date strings,
        both carrying the index of ``train``.
    """
    n_cols = train.shape[1]
    # Positional selection with .iloc (.ix is removed in pandas 1.0).
    feature_cols = list(range(1, min(10, n_cols))) + list(range(12, n_cols))
    Xre = train.iloc[:, feature_cols].copy()
    datums = train.iloc[:, 0]

    # Iterate over the date values themselves rather than looking them up by label,
    # so the function also works when the index of `train` is not 0..n-1.
    parsed = [datetime.datetime.strptime(value, "%m/%d/%Y") for value in datums]

    # Assigning plain lists is positional, so no index alignment is involved.
    Xre['Monat'] = [moment.month for moment in parsed]
    Xre['Wochentag'] = [moment.isoweekday() for moment in parsed]
    return [Xre, datums]

# Build the test feature frame, keep its dates for the submission file,
# report the number of rows and preview the features.
X_test, dates_predict_set = get_test_vector(testdata)
print(X_test.shape[0])
X_test.head()



In [None]:
# Show the last rows of the training feature frame.
X.tail()

In [None]:
# print dataset

%matplotlib inline
# Set the default figure size, then plot every column of the training feature frame.
matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)
X.plot()


In [None]:
# reduce the set size, remove nan
from sklearn import preprocessing

# first index label to keep; raise it to train on the more recent rows only
startAt = 0

# .loc slices by index label, which is what the removed .ix did for an integer index
Xneu = X.loc[startAt:]
yneu = y.loc[startAt:]
# fix nan values in the features
# NOTE(review): the target yneu is not treated here - confirm it contains no NaN
Xneu = Xneu.interpolate()

# we dont normalize. gives worse results than unnormalized data
#Xneu = preprocessing.normalize(Xneu)
Xneu.plot()


In [None]:
# TRAINING NOW (v1)

In [None]:
# split test set

# Python 3 syntax in Python 2
from __future__ import division
from __future__ import print_function

# Scikit learn
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import svm
from sklearn import metrics


# Hold out 20% of the rows for evaluation.
# - yneu.values replaces Series.as_matrix(), which is removed in pandas 1.0.
# - random_state makes the split (and every score below) reproducible on re-run.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions train_test_split lives in sklearn.model_selection.
train, test, target_train, target_test = \
    cross_validation.train_test_split(
        Xneu,
        yneu.values,
        test_size=0.2,
        random_state=42)

print( len(train) )
print( len(test) )

print( len(target_train) )
print( len(target_test) )

target_test

In [None]:
# Training v2: SVR with polynomial kernel
# The poly kernel is very slow and does not give better results than rbf; don't use it.

import numpy as np


# result: 343
#  startAt 2500
#  Parameters: {'kernel': 'poly', 'C': 100.0, 'degree': 2}


# SVM needs some initial configuration parameters 
# ("hyperparameters") 

# Good hyperparameters can be obtained 
# by using grid search & "cross validation"

# But we need to define the parameter search space

# Search space for the polynomial-kernel SVR; gamma is not searched.
parameters = dict(
    kernel=['poly'],
    C=[1e1, 1e2, 1e3],
    degree=[2],
)

# Grid search with cross validation: an estimator (here svm.SVR) plus a
# parameter space; the best estimator found is kept as `classifier`.
# Documentation:
#   http://scikit-learn.org/stable/modules/grid_search.html
clf = grid_search.GridSearchCV(svm.SVR(), parameters).fit(train, target_train)
classifier = clf.best_estimator_

print("Set sizes:")
for subset in (train, test):
    print(len(subset))

print()
# mean squared error on the held-out rows
holdout_mse = metrics.mean_squared_error(target_test, classifier.predict(test))
print(holdout_mse)
print('Parameters:', clf.best_params_)



In [None]:
# Training V4 - KNN

from sklearn.neighbors import KNeighborsClassifier
# Fit on the training split only, like the grid-search cells do. Fitting on
# Xneu/yneu included the held-out rows, so the error printed below was measured
# on data the model had already seen.
# NOTE(review): the model is scored with a regression metric; KNeighborsRegressor
# may be the more appropriate estimator - confirm.
classifier = KNeighborsClassifier()
classifier.fit(train, target_train)

print (metrics.mean_squared_error(target_test,classifier.predict(test)))
classifier

In [None]:
# Gradient boosting regression, tuned with a grid search. Best results!

# Report the number of rows in the training and hold-out split.
print("Set sizes:")
for subset in (train, test):
    print(len(subset))

from sklearn import linear_model
from sklearn import metrics
from sklearn import ensemble
# classifier = linear_model.LinearRegression()
#classifier = linear_model.Ridge(alpha=1.0)

# Fixed settings of the boosted ensemble; max_depth is varied by the grid search below.
params = dict(n_estimators=500, max_depth=4, min_samples_split=2,
              learning_rate=0.04, loss='ls')

# Search space: only the tree depth is tuned.
parameters = dict(max_depth=[4, 5, 6])

# Cross-validated grid search on the training split; keep the best estimator.
clf = grid_search.GridSearchCV(ensemble.GradientBoostingRegressor(**params), parameters)
clf.fit(train, target_train)
classifier = clf.best_estimator_

print('Parameters:', clf.best_params_)

# MSE on the held-out rows; its square root (RMSE) is what gets printed.
MSE = metrics.mean_squared_error(target_test, classifier.predict(test))
print(MSE ** 0.5)

classifier

In [None]:
from sklearn.linear_model import Lasso
# Fit on the training split only, like the grid-search cells do. Fitting on
# Xneu/yneu included the held-out rows, so the error printed below was measured
# on data the model had already seen.
classifier = Lasso()
classifier.fit(train, target_train)

# MSE on the held-out rows; its square root (RMSE) is what gets printed.
MSE = metrics.mean_squared_error(target_test,classifier.predict(test))
print (MSE**(0.5))

classifier


In [None]:
# Training v3: SVR with RBF kernel

import numpy as np

print("Set sizes:")
for subset in (train, test):
    print(len(subset))

# Search space for the RBF-kernel SVR (rbf = radial basis function = Gaussian kernel):
# penalty C and kernel width gamma.
parameters = dict(
    kernel=['rbf'],
    C=[100, 1000, 10000],
    gamma=[0.1, 0.01, 0.001, 0.0001],
)

# Grid search with cross validation: an estimator (here svm.SVR) plus a
# parameter space; the best estimator found is kept as `classifier`.
# Documentation:
#   http://scikit-learn.org/stable/modules/grid_search.html
clf = grid_search.GridSearchCV(svm.SVR(), parameters).fit(train, target_train)
classifier = clf.best_estimator_

print()
# mean squared error on the held-out rows
holdout_mse = metrics.mean_squared_error(target_test, classifier.predict(test))
print(holdout_mse)

print('Parameters:', clf.best_params_)



In [None]:

# Summarise the most recently fitted grid search / model on the held-out rows.
# classification_report treats every distinct target value as a class label, which
# is not meaningful for these regression models; report regression metrics instead.
print()
print('Parameters:', clf.best_params_)
print()
print('Best classifier score')
holdout_pred = classifier.predict(test)
print('RMSE:', metrics.mean_squared_error(target_test, holdout_pred) ** 0.5)
print('MAE:', metrics.mean_absolute_error(target_test, holdout_pred))
print('R^2:', metrics.r2_score(target_test, holdout_pred))



# Analyze Training Results

In [None]:
%matplotlib inline

# Predict the test set with the current model. The test features get the same
# NaN interpolation as the training features (a no-op when nothing is missing);
# the predictions are truncated to integers.
predict_result = classifier.predict(X_test.interpolate()).astype(int)
print (metrics.mean_squared_error(target_test,classifier.predict(test)))

# draw regressor
matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

# plot the predictions in test-set row order
ts = pd.Series(predict_result)
ts.plot()


In [None]:
# save result csv file for upload to kaggle

# [dates_predict_set, predict_result]

filename = "resultGlinear6b.csv"

# Every prediction needs exactly one date, otherwise the file would be misaligned.
assert len(dates_predict_set) == len(predict_result)

# Pair dates and predictions by position (not by index label), so this also
# works when dates_predict_set does not carry a 0..n-1 index.
final_set = [[day, visitors] for day, visitors in zip(dates_predict_set, predict_result)]

# The with-block closes the file even if a write fails; mode 'w' already
# truncates the file, so no explicit truncate() is needed.
with open(filename, 'w') as target:
    target.write('date,visitors_pool_total')
    target.write("\n")
    for day, visitors in final_set:
        target.write(day + "," + str(visitors))
        target.write("\n")

final_set

In [None]:
%matplotlib inline

# Compare the current model's predictions with the held-out targets.
# The previous version of this cell referenced y_rbf and y_lin, which are never
# defined in this notebook, scattered the multi-column feature frame X against y,
# and called plt.hold, which was removed in matplotlib 3.0.
holdout_pred = classifier.predict(test)
sample_idx = list(range(len(target_test)))

lw = 2
plt.scatter(sample_idx, target_test, color='darkorange', label='data')
plt.plot(sample_idx, holdout_pred, color='navy', lw=lw, label='model prediction')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Hold-out targets vs. model predictions')
plt.legend()
plt.show()