In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

%matplotlib inline

In [2]:
train_path = '/home/bilalcelebi/Workspace/notebooks/data/regression/train.csv'
test_path = '/home/bilalcelebi/Workspace/notebooks/data/regression/test.csv'

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
train.drop('Id', axis = 1, inplace = True)
test.drop('Id', axis = 1, inplace = True)

In [5]:
train_na = dict()
test_na = dict()

for column in train.columns:
    train_na[column] = round(train[column].isna().sum() / len(train[column]) * 100.0, 0)

for column in test.columns:
    test_na[column] = round(test[column].isna().sum() / len(test[column]) * 100.0 , 0)

train_na = pd.Series(train_na)
test_na = pd.Series(test_na)

In [6]:
train_na.sort_values(ascending = False)

PoolQC         100.0
MiscFeature     96.0
Alley           94.0
Fence           81.0
FireplaceQu     47.0
               ...  
ExterCond        0.0
ExterQual        0.0
Exterior2nd      0.0
Exterior1st      0.0
SalePrice        0.0
Length: 80, dtype: float64

In [7]:
test_na.sort_values(ascending = False)

PoolQC           100.0
MiscFeature       97.0
Alley             93.0
Fence             80.0
FireplaceQu       50.0
                 ...  
Heating            0.0
MSZoning           0.0
CentralAir         0.0
Electrical         0.0
SaleCondition      0.0
Length: 79, dtype: float64

Kategorilerin NULL verilerinin toplam veriye oranı. Görüldüğü üzere aslında veri setinde aşırı gereksiz kategoriler bile bulunmakta diyebilirim. 

In [8]:
train_na_columns = train.isna().sum()
train_na_columns = train_na_columns[train_na_columns.values > 0].index.to_list()
test_na_columns = test.isna().sum()
test_na_columns = test_na_columns[test_na_columns.values > 0].index.to_list()

In [9]:
train.drop(train_na_columns, axis = 1, inplace = True)
test.drop(test_na_columns, axis = 1, inplace = True)

In [10]:
train_columns = [column for column in train.columns if column in test.columns]
train_columns.append('SalePrice')
train = train[train_columns]

In [11]:
encoder = LabelEncoder()
cat_columns = [column for column in train.columns if train[column].dtype == 'object']

for column in cat_columns:
    
    train[column] = encoder.fit_transform(train[column])

In [12]:
X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 125, test_size = 0.3, shuffle = True)

In [14]:
model = LinearRegression()
model.fit(X_train,y_train)

In [15]:
preds = model.predict(X_test)
mse = mean_squared_error(preds, y_test)
mse = np.sqrt(mse)
print(f'Mean Squared Error : {mse}')
r2 = r2_score(preds, y_test)
print(f'R2-Score : {r2}')
mae = mean_absolute_error(preds, y_test)
print(f'Mean Absoulote Error : {mae}')

Mean Squared Error : 35144.568678835436
R2-Score : 0.7496159184048397
Mean Absoulote Error : 22747.038811565602


In [16]:
test.drop('Electrical', axis = 1, inplace = True)

In [17]:
test.columns = [column for column in test.columns if column in X_train.columns]
test_cat_cols = [column for column in test.columns if test[column].dtype == 'object']

In [18]:
for column in test_cat_cols:
    test[column] = encoder.fit_transform(test[column])

In [19]:
test_pred = model.predict(test)

In [20]:
test['SalePrice'] = test_pred.round(1)

In [21]:
test.to_csv('submission.csv')

In [22]:
test

Unnamed: 0,MSSubClass,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition,SalePrice
0,20,11622,1,3,3,4,0,12,1,2,...,0,0,0,120,0,0,6,2010,4,122796.8
1,20,14267,1,0,3,0,0,12,2,2,...,36,0,0,0,0,12500,6,2010,4,149167.3
2,60,13830,1,0,3,4,0,8,2,2,...,34,0,0,0,0,0,3,2010,4,161142.7
3,60,9978,1,0,3,4,0,8,2,2,...,36,0,0,0,0,0,6,2010,4,192474.1
4,120,5005,1,0,1,4,0,22,2,2,...,82,0,0,144,0,0,1,2010,4,200670.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,1936,1,3,3,4,0,10,2,2,...,0,0,0,0,0,0,6,2006,4,77072.0
1455,160,1894,1,3,3,4,0,10,2,2,...,24,0,0,0,0,0,4,2006,0,53861.9
1456,20,20000,1,3,3,4,0,11,2,2,...,0,0,0,0,0,0,9,2006,0,146955.8
1457,85,10441,1,3,3,4,0,11,2,2,...,32,0,0,0,0,700,7,2006,4,120508.1
