In [121]:
#import relevant libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import MinMaxScaler


In [122]:
dataset = pd.read_csv("dataset.csv")
dataset.head()	#to verify it actually loaded & visualise the first 5 rows.

# print the columns names
for col in dataset.columns:
    print(col)

#print the number of rows
print (len(dataset))

price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
waterfront
view
condition
grade
sqft_basement
yr_built
yr_renovated
property_typ
21613


In [123]:
def getMissingRowsPerColumn (df):
        # count rows with missing values
        missingValues = df.isnull().sum()
        
        # compute the percentage of missing values
        percentageMissingValues = (missingValues*100) / len(df)
        
        # create table with the results
        table = pd.concat([missingValues, percentageMissingValues], axis=1)
        
        # Rename the columns
        renamedTable = table.rename(columns = {0 : 'Missing Values', 1 : '% tot vals'})
        
        # Sort descending the table by % of missing values
        renamedTable = renamedTable [renamedTable.iloc[:,1] != 0].sort_values('% tot vals', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(renamedTable.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return renamedTable

getMissingRowsPerColumn (dataset)


Your selected dataframe has 14 columns.
There are 2 columns that have missing values.


Unnamed: 0,Missing Values,% tot vals
yr_renovated,16604,76.8
grade,114,0.5


In [124]:
dataset = dataset.drop(['yr_renovated'], axis=1)
dataset.head()#to verify the column has been actually dropped

getMissingRowsPerColumn (dataset)



Your selected dataframe has 13 columns.
There are 1 columns that have missing values.


Unnamed: 0,Missing Values,% tot vals
grade,114,0.5


In [125]:
#impute values
dataset['grade'] = dataset['grade'].fillna(dataset['grade'].mean())

getMissingRowsPerColumn (dataset)


Your selected dataframe has 13 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% tot vals


In [126]:
#select categorical columns only
object_columns = list(dataset.select_dtypes(include=['object']).columns)
categorical = dataset[object_columns]

print('There are ' + str(categorical.shape[1]) + ' categorical columns within dataframe:')

print(dataset[object_columns].head())

There are 3 categorical columns within dataframe:
  waterfront view         property_typ
0         no  bad            apartment
1         no  bad  semi_detached_house
2         no  bad            apartment
3         no  bad  semi_detached_house
4         no  bad            apartment


In [127]:
#print the values for each categorical variable
print('--------->Waterfront:')
print(dataset['waterfront'].value_counts())

print('--------->View:')
print(dataset['view'].value_counts())

print('--------->Property_typ:')
print(dataset['property_typ'].value_counts())

--------->Waterfront:
no     21450
yes      163
Name: waterfront, dtype: int64
--------->View:
bad          19489
good           963
very_good      510
medium         332
excellent      319
Name: view, dtype: int64
--------->Property_typ:
apartment                 12976
semi_detached_house        6728
single_family_house        1498
multi_generation_house      230
top_floor_apartment         181
Name: property_typ, dtype: int64


In [128]:
#encode categorical features

encoder = LabelBinarizer()
waterfrontEncoder = encoder.fit_transform(dataset.waterfront.values.reshape(-1,1))

# Insert coded waterfront feature into the original dataset and delete the old one
dataset['waterfrontEncoded'] = waterfrontEncoder = encoder.fit_transform(dataset.waterfront.values.reshape(-1,1))
dataset = dataset.drop(['waterfront'], axis=1)

# Create a dictionary about how the observations related to the variable 'view' should be coded
dictionary = {'bad' : 0,
             'medium' : 1,
             'good' : 2,
             'very_good' : 3,
             'excellent' : 4}

# Map the dictionary on the column view and store the results in a new column, and delete the old one
dataset['view_encoded'] = dataset.view.map(dictionary)
dataset = dataset.drop(['view'], axis=1)

encoder = OneHotEncoder()
OHE = encoder.fit_transform(dataset.property_typ.values.reshape(-1,1)).toarray()

# Conversion of the newly generated data to a dataframe
df_OHE = pd.DataFrame(OHE, columns = ["property_typ" + str(encoder.categories_[0][i]) 
                                     for i in range(len(encoder.categories_[0]))])

# Insertion of the coded features into the original data set and removal of the old one
dataset = pd.concat([dataset, df_OHE], axis=1)
dataset = dataset.drop(['property_typ'], axis=1)

dataset.head()

# print the columns names
for col in dataset.columns:
    print(col)



price
bedrooms
bathrooms
sqft_living
sqft_lot
floors
condition
grade
sqft_basement
yr_built
waterfrontEncoded
view_encoded
property_typapartment
property_typmulti_generation_house
property_typsemi_detached_house
property_typsingle_family_house
property_typtop_floor_apartment


In [129]:
#learn a regression model to fit the feature price from the other features
y = dataset.iloc[:,0].values  #dependent feature
X = dataset.iloc[:, 1:].values #independent features

y = y.reshape(-1, 1)

sc_x=MinMaxScaler()
sc_y=MinMaxScaler()
X = sc_x.fit_transform(X)
y = sc_y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=14)

regressor = SVR(kernel = 'rbf')
regressor.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [130]:
yPred = regr.predict(X_test)

mse = mean_squared_error(y_test,yPred)
rmse = np.sqrt(mse)
print(rmse)


0.32338882061593943
