In [124]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [125]:
train_path = '/home/bilalcelebi/Workspace/notebooks/data/regression/train.csv'
test_path = '/home/bilalcelebi/Workspace/notebooks/data/regression/test.csv'

In [126]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [127]:
train.drop('Id', axis = 1, inplace = True)
test.drop('Id', axis = 1, inplace = True)

In [128]:
train_na = dict()
test_na = dict()

for column in train.columns:
    train_na[column] = round(train[column].isna().sum() / len(train[column]) * 100.0, 0)

for column in test.columns:
    test_na[column] = round(test[column].isna().sum() / len(test[column]) * 100.0 , 0)

train_na = pd.Series(train_na)
test_na = pd.Series(test_na)

In [129]:
train_na.sort_values(ascending = False)

PoolQC         100.0
MiscFeature     96.0
Alley           94.0
Fence           81.0
FireplaceQu     47.0
               ...  
ExterCond        0.0
ExterQual        0.0
Exterior2nd      0.0
Exterior1st      0.0
SalePrice        0.0
Length: 80, dtype: float64

In [130]:
test_na.sort_values(ascending = False)

PoolQC           100.0
MiscFeature       97.0
Alley             93.0
Fence             80.0
FireplaceQu       50.0
                 ...  
Heating            0.0
MSZoning           0.0
CentralAir         0.0
Electrical         0.0
SaleCondition      0.0
Length: 79, dtype: float64

Kategorilerin NULL verilerinin toplam veriye oranı. Görüldüğü üzere aslında veri setinde aşırı gereksiz kategoriler bile bulunmakta diyebilirim. 

In [131]:
train_na_columns = train.isna().sum()
train_na_columns = train_na_columns[train_na_columns.values > 0].index.to_list()
test_na_columns = test.isna().sum()
test_na_columns = test_na_columns[test_na_columns.values > 0].index.to_list()

In [132]:
train.drop(train_na_columns, axis = 1, inplace = True)
test.drop(test_na_columns, axis = 1, inplace = False)

KeyError: "['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType'] not found in axis"

In [111]:
train_columns = [column for column in train.columns if column in test.columns]
train_columns.append('SalePrice')
train = train[train_columns]

In [112]:
encoder = LabelEncoder()
cat_columns = [column for column in train.columns if train[column].dtype == 'object']

for column in cat_columns:
    
    train[column] = encoder.fit_transform(train[column])

In [113]:
X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 125, test_size = 0.3, shuffle = True)

In [115]:
model = LinearRegression()
model.fit(X_train,y_train)

In [116]:
preds = model.predict(X_test)
mse = mean_squared_error(preds, y_test)
mse = np.sqrt(mse)
print(f'Mean Squared Error : {mse}')
r2 = r2_score(preds, y_test)
print(f'R2-Score : {r2}')
mae = mean_absolute_error(preds, y_test)
print(f'Mean Absoulote Error : {mae}')

Mean Squared Error : 32764.29616801839
R2-Score : 0.7868609474156667
Mean Absoulote Error : 20488.38269682643


In [118]:
for column in test.columns:
    test[column] = encoder.fit_transform(test[column])

In [119]:
test_pred = model.predict(test)

Feature names unseen at fit time:
- Electrical
Feature names seen at fit time, yet now missing:
- BsmtFinSF1
- BsmtFinSF2
- BsmtFullBath
- BsmtHalfBath
- BsmtUnfSF
- ...



ValueError: could not convert string to float: 'Pave'

In [73]:
test['SalePrice'] = test_pred.round(1)

In [74]:
test.to_csv('submission.csv')

In [75]:
test

Unnamed: 0,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition,SalePrice
0,0,777,1,3,3,4,0,12,1,2,...,0,0,0,18,0,0,5,4,4,1666500.1
1,0,964,1,0,3,0,0,12,2,2,...,24,0,0,0,0,24,5,4,4,1701929.7
2,5,947,1,0,3,4,0,8,2,2,...,22,0,0,0,0,0,2,4,4,1698766.7
3,5,587,1,0,3,4,0,8,2,2,...,24,0,0,0,0,0,5,4,4,1729859.6
4,11,118,1,0,1,4,0,22,2,2,...,68,0,0,27,0,0,0,4,4,1748337.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,13,19,1,3,3,4,0,10,2,2,...,0,0,0,0,0,0,5,0,4,1642052.8
1455,13,17,1,3,3,4,0,10,2,2,...,12,0,0,0,0,0,3,0,0,1619124.2
1456,0,1069,1,3,3,4,0,11,2,2,...,0,0,0,0,0,0,8,0,0,1681944.6
1457,9,647,1,3,3,4,0,11,2,2,...,20,0,0,0,0,12,6,0,4,1676775.6
