This notebook analyzes house prices in King County using data science techniques.
It includes data cleaning, visualization, and basic price prediction.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [None]:
# Load the King County house-sales CSV; the path is relative to the
# kernel's working directory.
df = pd.read_csv(filepath_or_buffer="kchouse.csv")

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [None]:
# Print each column's name, non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0,0
id,0
date,0
price,0
bedrooms,0
bathrooms,0
sqft_living,0
sqft_lot,0
floors,0
waterfront,0
view,0


In [None]:
# Pairwise Pearson correlation, restricted to the numeric columns.
df.corr(method="pearson", numeric_only=True)

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,1.0,-0.016762,0.001286,0.00516,-0.012258,-0.132109,0.018525,-0.002721,0.011592,-0.023783,0.00813,-0.010842,-0.005151,0.02138,-0.016907,-0.008224,-0.001891,0.020799,-0.002901,-0.138798
price,-0.016762,1.0,0.30835,0.525138,0.702035,0.089661,0.256794,0.266369,0.397293,0.036362,0.667434,0.605567,0.323816,0.054012,0.126434,-0.053203,0.307003,0.021626,0.585379,0.082447
bedrooms,0.001286,0.30835,1.0,0.515884,0.576671,0.031703,0.175429,-0.006582,0.079532,0.028472,0.356967,0.4776,0.303093,0.154178,0.018841,-0.152668,-0.008931,0.129473,0.391638,0.029244
bathrooms,0.00516,0.525138,0.515884,1.0,0.754665,0.08774,0.500653,0.063744,0.187737,-0.124982,0.664983,0.685342,0.28377,0.506019,0.050739,-0.203866,0.024573,0.223042,0.568634,0.087175
sqft_living,-0.012258,0.702035,0.576671,0.754665,1.0,0.172826,0.353949,0.103818,0.284611,-0.058753,0.762704,0.876597,0.435043,0.318049,0.055363,-0.19943,0.052529,0.240223,0.75642,0.183286
sqft_lot,-0.132109,0.089661,0.031703,0.08774,0.172826,1.0,-0.005201,0.021604,0.07471,-0.008958,0.113621,0.183512,0.015286,0.05308,0.007644,-0.129574,-0.085683,0.229521,0.144608,0.718557
floors,0.018525,0.256794,0.175429,0.500653,0.353949,-0.005201,1.0,0.023698,0.029444,-0.263768,0.458183,0.523885,-0.245705,0.489319,0.006338,-0.059121,0.049614,0.125419,0.279885,-0.011269
waterfront,-0.002721,0.266369,-0.006582,0.063744,0.103818,0.021604,0.023698,1.0,0.401857,0.016653,0.082775,0.072075,0.080588,-0.026161,0.092885,0.030285,-0.014274,-0.04191,0.086463,0.030703
view,0.011592,0.397293,0.079532,0.187737,0.284611,0.07471,0.029444,0.401857,1.0,0.04599,0.251321,0.167649,0.276947,-0.05344,0.103917,0.084827,0.006157,-0.0784,0.280439,0.072575
condition,-0.023783,0.036362,0.028472,-0.124982,-0.058753,-0.008958,-0.263768,0.016653,0.04599,1.0,-0.144674,-0.158214,0.174105,-0.361417,-0.060618,0.003026,-0.014941,-0.1065,-0.092824,-0.003406


In [None]:
# Compute the correlation between the numeric columns, take the absolute
# value of each column's correlation with price, and sort from strongest
# to weakest.
(
    df.corr(numeric_only=True)["price"]
    .abs()
    .sort_values(ascending=False)
)

Unnamed: 0,price
price,1.0
sqft_living,0.702035
grade,0.667434
sqft_above,0.605567
sqft_living15,0.585379
bathrooms,0.525138
view,0.397293
sqft_basement,0.323816
bedrooms,0.30835
lat,0.307003


In [None]:
# Trim the upper tail: keep only rows that lie strictly below the 97th
# percentile in every column listed in `trim_cols`.
# All thresholds are computed once, from the frame as it is before any row
# is removed, so the order of the columns does not affect the result.
outliers = df.quantile(q=.97, numeric_only=True)
trim_cols = ["price", "bedrooms", "bathrooms", "sqft_living"]
keep = df[trim_cols].lt(outliers[trim_cols], axis="columns").all(axis=1)
# .copy() hands later cells an independent frame to add/overwrite columns
# on, instead of a filtered slice that pandas flags with
# SettingWithCopyWarning on assignment.
df = df.loc[keep].copy()

In [None]:
# Convert the zipcode column to the categorical dtype.
df = df.astype({"zipcode": "category"})

In [None]:
# List the distinct values present in yr_renovated.
# Note: np.where is used further down rather than Series.map, because
# where assigns the desired value from a single condition, whereas map
# would need every value spelled out one by one. where is called via numpy.
pd.unique(df["yr_renovated"])

array([   0, 1991, 2002, 2010, 2013, 1978, 2005, 2003, 1984, 1954, 2014,
       2011, 1983, 1945, 1988, 1957, 1977, 1981, 1995, 2000, 1999, 1989,
       1986, 1994, 2009, 2004, 2007, 1987, 1973, 2006, 2001, 1980, 1992,
       1971, 1979, 1997, 1950, 1969, 1948, 1970, 1974, 2008, 2012, 1968,
       2015, 1951, 1993, 1972, 1953, 1998, 1996, 1982, 1990, 1985, 1956,
       1963, 1940, 1976, 1946, 1975, 1964, 1955, 1962, 1958, 1959, 1960,
       1967, 1965, 1934, 1944])

In [None]:
# Derive model features from the year and basement columns.
# Age is measured at the time of sale (sale year minus build year) instead
# of against today's date, so the column holds the same values no matter
# when the notebook is re-run.
# `date` holds strings such as "20141013T000000"; parse with an explicit
# format so a malformed value raises instead of being guessed at.
sale_year = pd.to_datetime(df["date"], format="%Y%m%dT%H%M%S").dt.year
df["age"] = sale_year - df["yr_built"]
del df["yr_built"]
# Collapse to binary (0/1) flags: has the house been renovated / does it
# have a basement.
df["yr_renovated"] = np.where(df["yr_renovated"] > 0, 1, 0)
df["sqft_basement"] = np.where(df["sqft_basement"] > 0, 1, 0)

In [None]:
# Replace each of these features with its square.
# Re-running this cell squares the already-squared values again.
for col in ("bedrooms", "bathrooms", "sqft_living"):
    df[col] = df[col].pow(2)

In [None]:
# Features: every column except the identifier, sale date, coordinates
# and the target itself.
x = df.drop(columns=["id", "date", "lat", "long", "price"])
# Target is kept as a one-column DataFrame.
y = df.loc[:, ["price"]]

In [None]:
# One-hot encode the object/category columns, dropping the first level of
# each encoded column.
x = pd.get_dummies(data=x, drop_first=True)

In [None]:
# Train/test split: hold out 20% of the rows; the fixed seed keeps the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Linear regression: answers the question "how does Y change when X increases?"
lr = LinearRegression()
lr.fit(x_train, y_train)
tahmin = lr.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so the ground truth must come first.
print(r2_score(y_test, tahmin))
mean_squared_error(y_test, tahmin)

0.8047829485717316


7646312119.098292

In [None]:
# Ridge regression: the variant of linear regression that reduces
# overfitting by adding a penalty term.
r = Ridge()
r.fit(x_train, y_train)
tahmin = r.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so the ground truth must come first.
print(r2_score(y_test, tahmin))
mean_squared_error(y_test, tahmin)

0.8017091089190566


  return f(*arrays, *other_args, **kwargs)


7683348861.554926

In [None]:
# Lasso regression: the variant of linear regression that performs
# feature selection.
l = Lasso()
l.fit(x_train, y_train)
tahmin = l.predict(x_test)
# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so the ground truth must come first.
print(r2_score(y_test, tahmin))
mean_squared_error(y_test, tahmin)

0.804634883580415


7646932093.729244