In [147]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

### Dataset

In [148]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [149]:
train.shape,test.shape

((1460, 81), (1459, 80))

In [150]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Exploratory data analysis

In [151]:
data = train.copy()
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [152]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [153]:
# Belirli bir seviyeden fazla missing value olduğu için datasetten atmamız gereken sütunlar var.
data.drop(["PoolQC","Fence","MiscFeature","FireplaceQu","Alley"],axis=1,inplace=True)
test.drop(["PoolQC","Fence","MiscFeature","FireplaceQu","Alley"],axis=1,inplace=True)
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,120,0,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,IR1,Lvl,AllPub,Corner,...,36,0,0,0,0,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,IR1,Lvl,AllPub,Inside,...,34,0,0,0,0,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,IR1,Lvl,AllPub,Inside,...,36,0,0,0,0,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,IR1,HLS,AllPub,Inside,...,82,0,0,144,0,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,Reg,Lvl,AllPub,Inside,...,24,0,0,0,0,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,Reg,Lvl,AllPub,Inside,...,32,0,0,0,0,700,7,2006,WD,Normal


In [154]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,50.0,70.0,190.0
LotFrontage,1201.0,70.049958,24.284752,21.0,59.0,69.0,80.0,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,7553.5,9478.5,11601.5,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,5.0,6.0,7.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,5.0,6.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1954.0,1973.0,2000.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1967.0,1994.0,2004.0,2010.0
MasVnrArea,1452.0,103.685262,181.066207,0.0,0.0,0.0,166.0,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,0.0,383.5,712.25,5644.0


# Missing Values

In [155]:
null_degerler = data.isnull().sum()
null_kolonlar = null_degerler[null_degerler != 0]
print("Null value içeren sütunlar :")
null_kolonlar

Null value içeren sütunlar :


LotFrontage     259
MasVnrType        8
MasVnrArea        8
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Electrical        1
GarageType       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageCond       81
dtype: int64

# Handling Missing Values...

In [156]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

#### Train dataset için

In [157]:
imputer_most_frequent = SimpleImputer(strategy="most_frequent")
imputer_median = SimpleImputer(strategy="median")

In [158]:
data["LotFrontage"] = imputer_median.fit_transform(data[["LotFrontage"]])
data["MasVnrType"] = data["MasVnrType"].fillna('None')  # Duvar tipi belli değil onun için None.
data["MasVnrArea"] = imputer_median.fit_transform(data[["MasVnrArea"]])
data["BsmtQual"] =   data["BsmtQual"].fillna('None')   #No basement olduğıu için  None yazcaz.
data["BsmtCond"] =   data["BsmtCond"].fillna('None')   #No basement olduğıu için  None yazcaz.
data["BsmtExposure"] =   data["BsmtExposure"].fillna('None')   #No basement olduğıu için  None yazcaz.
data["BsmtFinType1"] =   data["BsmtFinType1"].fillna('None')   #No basement olduğıu için  None yazcaz.
data["BsmtFinType2"] =   data["BsmtFinType2"].fillna('None')  #No basement olduğıu için  None yazcaz.
data["Electrical"] = imputer_most_frequent.fit_transform(data[["Electrical"]])
data["GarageType"] = data["GarageType"].fillna('None')
data["GarageFinish"] = data["GarageFinish"].fillna('None')
data["GarageQual"] = data["GarageQual"].fillna('None')
data["GarageYrBlt"] = imputer_median.fit_transform(data[["GarageYrBlt"]])
data["GarageCond"] = data["GarageCond"].fillna('None')

#### Test Dataset için

In [159]:
test_na_degerler = test.isna().sum()
test_na = test_na_degerler[test_na_degerler != 0]
test_na.keys()

Index(['MSZoning', 'LotFrontage', 'Utilities', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual',
       'Functional', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars',
       'GarageArea', 'GarageQual', 'GarageCond', 'SaleType'],
      dtype='object')

In [160]:
for i in test_na.keys():
    print(i," : ",data[i].dtypes)

MSZoning  :  object
LotFrontage  :  float64
Utilities  :  object
Exterior1st  :  object
Exterior2nd  :  object
MasVnrType  :  object
MasVnrArea  :  float64
BsmtQual  :  object
BsmtCond  :  object
BsmtExposure  :  object
BsmtFinType1  :  object
BsmtFinSF1  :  int64
BsmtFinType2  :  object
BsmtFinSF2  :  int64
BsmtUnfSF  :  int64
TotalBsmtSF  :  int64
BsmtFullBath  :  int64
BsmtHalfBath  :  int64
KitchenQual  :  object
Functional  :  object
GarageType  :  object
GarageYrBlt  :  float64
GarageFinish  :  object
GarageCars  :  int64
GarageArea  :  int64
GarageQual  :  object
GarageCond  :  object
SaleType  :  object


In [161]:
# Bu sefer fonksiyon ile halledelim.
from sklearn.impute import SimpleImputer

def fill_missing_values(test, impute_median_cols, impute_most_frequent_cols, fill_none_cols):
    #medyan ile doldurulacak sütunlar
    imputer_median = SimpleImputer(strategy="median")
    for col in impute_median_cols:
        test[[col]] = imputer_median.fit_transform(test[[col]])
    
    # En sık görülen değer ile doldurulacak sütunlar
    imputer_most_frequent = SimpleImputer(strategy="most_frequent")
    for col in impute_most_frequent_cols:
        test[[col]] = imputer_most_frequent.fit_transform(test[[col]])
    
    # 'None' ile doldurulacak sütunlar
    for col in fill_none_cols:
        test[[col]] = test[[col]].fillna('None')
    
    return test


In [162]:
# Fonksiyon için sütunları ayarlamak lazım.
impute_median_cols = ["LotFrontage","MasVnrArea","BsmtFinSF1","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF" ,"BsmtFullBath" ,"BsmtHalfBath","GarageYrBlt","GarageCars","GarageArea"] #11
impute_most_frequent_cols = ["MSZoning","Utilities","Exterior1st","Exterior2nd","KitchenQual","Functional","SaleType"] # 7
fill_none_cols = ["MasVnrType","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","GarageType","GarageFinish","GarageQual","GarageCond"] #10

In [163]:
test = fill_missing_values(test, impute_median_cols=impute_median_cols , impute_most_frequent_cols=impute_most_frequent_cols,  fill_none_cols=fill_none_cols)

In [164]:
print(test.isna().sum().sum())  # test Verisinde hiç missing value kalmadı.
print(data.isna().sum().sum()) # train verisinde de hiç missing value kalmadı.

0
0


### Encoding

In [165]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [166]:
train_numeric_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)
train_categoric_cols = list(data.select_dtypes(include=['object']).columns)

test_numeric_cols = list(test.select_dtypes(include=['int64', 'float64']).columns)
test_categoric_cols = list(test.select_dtypes(include=['object']).columns)

In [167]:
le = LabelEncoder()
for col in train_categoric_cols:
    data[col] = le.fit_transform(data[col])
print(data)

        Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  \
0        1          60         3         65.0     8450       1         3   
1        2          20         3         80.0     9600       1         3   
2        3          60         3         68.0    11250       1         0   
3        4          70         3         60.0     9550       1         0   
4        5          60         3         84.0    14260       1         0   
...    ...         ...       ...          ...      ...     ...       ...   
1455  1456          60         3         62.0     7917       1         3   
1456  1457          20         3         85.0    13175       1         3   
1457  1458          70         3         66.0     9042       1         3   
1458  1459          20         3         68.0     9717       1         3   
1459  1460          20         3         75.0     9937       1         3   

      LandContour  Utilities  LotConfig  ...  EnclosedPorch  3SsnPorch  \
0            

In [168]:
#####test dataset için
le = LabelEncoder()
for col in test_categoric_cols:
    test[col] = le.fit_transform(test[col])
print(test)

        Id  MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  \
0     1461          20         2         80.0    11622       1         3   
1     1462          20         3         81.0    14267       1         0   
2     1463          60         3         74.0    13830       1         0   
3     1464          60         3         78.0     9978       1         0   
4     1465         120         3         43.0     5005       1         0   
...    ...         ...       ...          ...      ...     ...       ...   
1454  2915         160         4         21.0     1936       1         3   
1455  2916         160         4         21.0     1894       1         3   
1456  2917          20         3        160.0    20000       1         3   
1457  2918          85         3         62.0    10441       1         3   
1458  2919          60         3         74.0     9627       1         3   

      LandContour  Utilities  LotConfig  ...  OpenPorchSF  EnclosedPorch  \
0          

###  Modelling

In [169]:
from sklearn.ensemble import RandomForestRegressor

In [170]:
X_train = data.drop("SalePrice",axis=1)
y_train = data["SalePrice"]

In [171]:
rf = RandomForestRegressor(n_estimators=100, random_state=0)
model = rf.fit(X_train,y_train)

In [172]:
rf_predicts = rf.predict(test)
rf_predicts

array([122348.58, 155648.65, 172589.94, ..., 147669.8 , 109784.58,
       228898.33])

In [173]:
len(rf_predicts)

1459

In [174]:
id_col = test['Id']

output = pd.DataFrame({'Id' : id_col,
                      'SalePrice' :  rf_predicts})
output

Unnamed: 0,Id,SalePrice
0,1461,122348.58
1,1462,155648.65
2,1463,172589.94
3,1464,179886.32
4,1465,197089.91
...,...,...
1454,2915,84941.00
1455,2916,86520.08
1456,2917,147669.80
1457,2918,109784.58


In [175]:
output.to_csv('yarisma.csv',index=False)