In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [3]:
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

In [4]:
for column in train_df.columns:
    if column in test_df.columns or column=='SalePrice':
        train_df[column] = train_df[column].fillna(train_df[column].median())
    else:
        train_df = train_df.drop(column, axis=1)
for column in test_df.columns:
    if column in train_df.columns:
        test_df[column] = test_df[column].fillna(test_df[column].median())
    else:
        test_df = test_df.drop(column, axis=1)

In [5]:
X_train = train_df.drop("SalePrice", axis=1)
Y_train = train_df["SalePrice"].values
X_test = test_df.copy()
X_train.shape, Y_train.shape, X_test.shape

((1460, 271), (1460,), (1459, 271))

In [6]:
#Linear Regression

linreg = LinearRegression()
linreg.fit(X_train, Y_train)
Y_pred = linreg.predict(X_test)
acc_log = round(linreg.score(X_train, Y_train) * 100, 2)
acc_log

91.719999999999999

In [7]:
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df['Correlation'] = pd.Series(linreg.coef_[0])

coeff_df.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
0,MSSubClass,0.433041
1,LotFrontage,
2,LotArea,
3,OverallQual,
4,OverallCond,
5,YearBuilt,
6,YearRemodAdd,
7,MasVnrArea,
8,BsmtFinSF1,
9,BsmtFinSF2,


In [8]:
#Support Vector Machines (SVM)
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

100.0

In [9]:
#K-nearest (k-NN)
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

33.009999999999998

In [10]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

76.510000000000005

In [11]:
#Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

1.1000000000000001

In [12]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

2.1899999999999999

In [13]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

0.27000000000000002

In [14]:
# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

100.0

In [15]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

100.0

In [16]:
models = pd.DataFrame({
        'Model': ['Support Vector Machines', 'KNN', 'Linear Regression', 'Random Forest', 'Naive Bayes', 'Perceptron',
                  'Stochastic Gradient Descent', 'Linear SVC', 'Decision Tree'],
        'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree]
    })
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Support Vector Machines,100.0
3,Random Forest,100.0
8,Decision Tree,100.0
2,Linear Regression,91.72
4,Naive Bayes,76.51
1,KNN,33.01
7,Linear SVC,2.19
5,Perceptron,1.1
6,Stochastic Gradient Descent,0.27


In [18]:
submission = pd.DataFrame({
        "Id": test_df["Id"],
        "SalePrice": Y_pred
    })
submission.to_csv('./submission.csv', index=False)
#print(submission)