In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

In [2]:
# --- Notebook configuration ---

# Input data: path of the training CSV and the field separator it uses
TRAIN_FILE = "train.csv"
DELIMITER = ","

# Names of the two columns that are set aside before preprocessing
HOUSE_ID = "Id"
SALE_PRICE = "SalePrice"

# A column with fewer distinct values than this is handled as categorical
CATEGORICAL_THRESHOLD = 30
# pandas dtype name used when casting those columns
CATEGORY_DTYPE = "category"

In [3]:
# Read data from the train file. The separator is passed by keyword: since
# pandas 2.0 every read_csv argument after the path is keyword-only, so the
# positional form raises a TypeError there.
train_df = pd.read_csv(TRAIN_FILE, sep=DELIMITER)
# Retain house ids so they can be restored unscaled after normalization
ids = train_df[HOUSE_ID]
# Retain sale prices for the same reason
prices = train_df[SALE_PRICE]

In [4]:
# Get all columns that have fewer than CATEGORICAL_THRESHOLD distinct values.
# The per-column distinct counts are filtered directly, which yields the same
# column labels without transposing the whole frame twice.
distinct_counts = train_df.apply(pd.Series.nunique)
CATEGORICAL_COLUMNS = distinct_counts[distinct_counts < CATEGORICAL_THRESHOLD].index

In [5]:
# Cast every identified categorical column to the category dtype in one call,
# using a column -> dtype mapping; all other columns keep their dtype.
category_dtypes = {column_name: CATEGORY_DTYPE for column_name in CATEGORICAL_COLUMNS}
train_df = train_df.astype(category_dtypes)

In [6]:
# Swap each categorical column for the integer codes of its categories
for column_name in CATEGORICAL_COLUMNS:
    train_df[column_name] = train_df[column_name].cat.codes

In [15]:
# Mean-normalize every column: (value - column mean) / (column max - column min)
column_means = train_df.mean()
column_ranges = train_df.max() - train_df.min()
# A constant column has a zero range; dividing by it gives 0/0 = NaN for every
# row, and the mean-fill below cannot repair an all-NaN column. Dividing by 1
# instead leaves such a column at 0.
column_ranges = column_ranges.replace(0, 1)
train_df = (train_df - column_means) / column_ranges
# Restore retained columns with their original, unscaled values
train_df[HOUSE_ID] = ids
train_df[SALE_PRICE] = prices
# Replace remaining NaN values by the mean of their column
train_df = train_df.fillna(train_df.mean())

In [46]:
# Split data into train and test sets (70-30).
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.iloc` performs the
# positional selection that `.ix` fell back to for non-integer column labels.
# Features: every column except the first and the last; target: the last column.
# NOTE(review): presumably the first column is HOUSE_ID and the last is
# SALE_PRICE -- confirm against the layout of train.csv.
X, Y = train_df.iloc[:, 1:-1], train_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [66]:
# Fit a support vector regressor on the training split.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): the target looks like the raw SALE_PRICE values (that column is
# restored unscaled after normalization), so epsilon=0.2 and C=100.0 may be far
# too small for the target's scale -- confirm, and consider scaling the target.
clf = SVR(C=100.0, epsilon=0.2).fit(X_train, y_train)

In [67]:
# Coefficient of determination (R^2) of the fitted model on the held-out split
clf.score(X=X_test, y=y_test)

-0.018644874325486338