In [1]:
import itertools
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import ShuffleSplit, cross_val_score

In [2]:
# Read the raw listings and promote the "id" column to the row index
# (set_index removes the column from the frame in the same step).
dataset = pd.read_csv("data/dataset.csv").set_index("id")
dataset.head()

Unnamed: 0_level_0,pow_uzytkowa,pow_pom_przy,llosc_izb,kondygnacja,xwsch,ypoln,cena
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,29.0,0.0,2.0,5.0,565259.3992,245541.8591,240000.0
1,28.43,0.0,1.0,3.0,565259.3992,245541.8591,250000.0
2,28.43,0.0,2.0,4.0,565259.3992,245541.8591,290000.0
3,28.58,0.0,2.561552,2.555899,565259.3992,245541.8591,223000.0
4,28.15,0.0,1.0,4.0,565259.3992,245541.8591,140000.0


In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

Unnamed: 0,pow_uzytkowa,pow_pom_przy,llosc_izb,kondygnacja,xwsch,ypoln,cena
count,109919.0,109919.0,109919.0,109919.0,109919.0,109919.0,109919.0
mean,386.040718,0.944751,2.591584,2.608778,568142.028143,243794.376831,241018.7
std,1127.089687,5.357695,1.755498,2.864514,3266.135614,3374.622042,299944.7
min,0.0,0.0,0.0,-14.0,557933.7163,234335.8032,0.0
25%,37.86,0.0,2.0,1.0,565580.4079,240309.8804,78000.0
50%,51.25,0.0,2.561552,2.0,567911.6609,244822.0863,230000.0
75%,73.315,0.0,3.0,4.0,570373.5058,246722.1638,323609.4
max,36000.0,1140.0,115.0,362.6,585760.2447,250602.6161,26635810.0


In [4]:
def features_target_split(dataset):
    """Separate the 'cena' column from the remaining columns.

    Returns a ``(X, y)`` tuple: ``X`` is a copy of ``dataset`` without the
    'cena' column and ``y`` is the 'cena' column itself. ``dataset`` is not
    modified.
    """
    target = 'cena'
    # Work on a copy so the feature frame never aliases the caller's frame.
    features = dataset.copy().drop(columns=[target])
    return (features, dataset[target])

In [5]:
# Columns handed to the scaler; this includes the 'cena' target.
columns = [
    'pow_uzytkowa',
    'pow_pom_przy',
    'llosc_izb',
    'kondygnacja',
    'xwsch',
    'ypoln',
    'cena',
]


def transform(df, scaler):
    """Return a scaled copy of ``df`` restricted to rows with 'cena' > 0.

    ``scaler`` must expose ``fit_transform``; it is fitted on, and applied to,
    every column listed in ``columns``. ``df`` itself is left untouched.
    """
    has_price = df['cena'] > 0
    scaled = df.loc[has_price].copy()
    scaled[columns] = scaler.fit_transform(scaled[columns])
    return scaled

# One scaled copy of the dataset per scaler, built in a single pass;
# a fresh scaler instance is created for each frame.
standard, minmax, robust = (
    transform(dataset, scaler_cls())
    for scaler_cls in (StandardScaler, MinMaxScaler, RobustScaler)
)

In [6]:
def get_params():
    """Build the hyper-parameter grid as a list of ``(c, epsilon, gamma)`` tuples.

    The grid is the full Cartesian product of the three value lists, with
    ``c`` varying slowest and ``gamma`` varying fastest.
    """
    c_values = [0.001, 0.01, 0.1, 1]
    epsilon_values = [0.5, 0.4, 0.3, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02, 0.01]
    gamma_values = [0.8, 0.5, 0.3, 0.2, 0.1, 0.08, 0.06, 0.04, 0.02, 0.01]
    return [
        (c, epsilon, gamma)
        for c in c_values
        for epsilon in epsilon_values
        for gamma in gamma_values
    ]

In [7]:
def cross_validation(samples, cv, verbose=False):
    """Grid-search RBF-kernel SVR hyper-parameters with cross-validation.

    Every ``(C, epsilon, gamma)`` combination returned by ``get_params()`` is
    scored with ``cross_val_score`` using negative mean absolute error. The
    reported score is ``1 + mean(negative MAE)``, so higher is better.

    Parameters
    ----------
    samples : pandas.DataFrame
        Frame handed to ``features_target_split`` to obtain ``X`` and ``y``.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``cross_val_score``.
    verbose : bool, default False
        When true, print the score of every combination, not only the best.

    Returns
    -------
    tuple or None
        ``(score, 2 * std, C, epsilon, gamma)`` of the best combination, or
        ``None`` when the grid is empty. The best result is also printed.
    """
    line = "%0.2f (+/-%0.2f) with { c: %0.3f, epsilon: %0.2f, gamma: %0.2f }"

    # The split does not depend on the hyper-parameters, so do it only once.
    (X, y) = features_target_split(samples)

    # No numeric sentinel: a 0 sentinel would never be replaced when every
    # score is <= 0, and a bogus all-zero "best" would be printed.
    best = None

    # get_params() yields (c, epsilon, gamma); unpack in exactly that order so
    # each value list is searched for the parameter it was written for.
    for (c, epsilon, gamma) in get_params():
        regressor = SVR(C=c, epsilon=epsilon, gamma=gamma, kernel='rbf')
        scores = cross_val_score(regressor, X, y, cv=cv, n_jobs=-1, scoring="neg_mean_absolute_error")
        mean = 1 + scores.mean()
        result = (mean, scores.std() * 2, c, epsilon, gamma)
        if verbose:
            print(line % result)
        if best is None or mean > best[0]:
            best = result

    if best is not None:
        print(("Best result: " + line + "\n\n") % best)
    return best

In [8]:
def train(dataset, n=10_000, random_state=None):
    """Run the SVR grid search on a random sample of ``dataset``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Scaled frame to sample from. The sample is drawn from this argument,
        so each scaler variant passed in is the one actually evaluated.
    n : int, default 10_000
        Number of rows to sample; capped at ``len(dataset)``.
    random_state : int or None, default None
        Seed forwarded to both the row sampling and ``ShuffleSplit`` so a run
        can be reproduced. ``None`` keeps the runs unseeded.
    """
    # TODO: 30_000 samples
    n = min(n, len(dataset))
    samples = dataset.sample(n, random_state=random_state)
    # cross_validation also accepts a plain int for cv, e.g.
    # cross_validation(samples, 10), as an alternative to ShuffleSplit.
    print("Using ShuffleSplit cross validation with test size 0.3 on %d samples:" % n)
    cross_validation(samples, ShuffleSplit(n_splits=10, test_size=0.3, random_state=random_state))

In [9]:
# Only the RobustScaler variant is run; the other two entries stay disabled.
for label, scaled in (
    ("RobustScaler", robust),
    # ("MinMaxScaler", minmax),
    # ("Standard", standard),
):
    print(label)
    train(scaled)

RobustScaler
Using ShuffleSplit cross validation with test size 0.3 on 10000 samples:
Best result: 0.75 (+/-0.01) with { c: 1.000, epsilon: 0.02, gamma: 0.30 }


