In [2]:
import pandas as pd
import numpy as np
sample_submission_df = pd.read_csv("./datasets/sample_submission_zingat.csv")
test_set = pd.read_csv("./datasets/zingat_usecase_testset_null.csv")
train_set = pd.read_csv("./datasets/zingat_usecase_trainset.csv")

## Change Variable

In [3]:
def change_variable(data):
    
    data = data.copy()
    return data.rename(columns = {
        'odasayısı': 'odasayisi',
        'brütm²': 'brutm2',
        'netm²': 'netm2',
        'binayaşı': 'binayasi',
        'bulunduğukat': 'bulundugukat',
        'binadakikatsayısı': 'binadakikatsayisi',
        'isıtmatipi': 'isitmatipi',
        'İnterkom': 'interkom',
        'depremyönetmeliğineuygun': 'depremyonetmeligineuygun',
        'asansör': 'asansor',
        'Çocukoyunalanı': 'cocukoyunalani',
        'giyinmeodası': 'giyinmeodasi',
        'güncel_fiyat': 'guncelfiyat',
    })
    return data

In [4]:
train_set = change_variable(train_set)
test_set = change_variable(test_set)

## Converting The Yes-None Values to 0-1

In [5]:
def convert_values(data):
    
    
    data.loc[data.interkom == "VAR", "interkom"] , data.loc[data.interkom == "YOK", "interkom"] = 1,0
    data.loc[data.depremyonetmeligineuygun == "VAR", "depremyonetmeligineuygun"] , data.loc[data.depremyonetmeligineuygun == "YOK", "depremyonetmeligineuygun"] = 1,0
    data.loc[data.asansor == "VAR", "asansor"] , data.loc[data.asansor == "YOK", "asansor"] = 1,0
    data.loc[data.cocukoyunalani == "VAR", "cocukoyunalani"] , data.loc[data.cocukoyunalani == "YOK", "cocukoyunalani"] = 1,0
    data.loc[data.giyinmeodasi == "VAR", "giyinmeodasi"] , data.loc[data.giyinmeodasi == "YOK", "giyinmeodasi"] = 1,0
    data.loc[data.ebeveynbanyosu == "VAR", "ebeveynbanyosu"] , data.loc[data.ebeveynbanyosu == "YOK", "ebeveynbanyosu"] = 1,0
    
    return data

In [6]:
train_set = convert_values(train_set)
test_set = convert_values(test_set)

## Convert Price

In [7]:
def convert_price(data):
    
    data["guncelfiyat"] = data.guncelfiyat.str[:-3]
    data["guncelfiyat"] = data["guncelfiyat"].astype(str).astype(int)
    return data

In [8]:
train_set = convert_price(train_set)

In [9]:
def tarih_fonksiyonu(data):
   
    data = data.copy()
    data[['yil','ay','gun']] = data["tarih"].str.split("-",expand=True)
   
    data["yil"] = data["yil"].astype(str).astype(int)
    data["ay"] = data["ay"].astype(str).astype(int)
    data["gun"] = data["gun"].astype(str).astype(int)
    return data
   

In [10]:
train_set = tarih_fonksiyonu(train_set)
test_set = tarih_fonksiyonu(test_set)

In [11]:
def belirtilmemis_fonksiyonu(data):
    
    data = data.copy()
    data['manzara'] = data['manzara'].replace("-","0")
    
    return data

In [12]:
train_set = belirtilmemis_fonksiyonu(train_set)
test_set = belirtilmemis_fonksiyonu(test_set)

In [13]:
def path_split_fonksiyonu(data):
    
    data = data.copy()
    data[['il','ilce','mahalle']] = data["path"].str.split("/",expand=True)
    
    return data

In [14]:
train_set = path_split_fonksiyonu(train_set)
test_set = path_split_fonksiyonu(test_set)

In [15]:
def odasayisi_fonksiyonu_train(data):
   
    data = data.copy()
    data = data.drop(data.index[data['odasayisi'] == '-'], inplace = False)   
    data[['odasayisi','salonsayisi']] = data['odasayisi'].str.split("+",expand=True)
    data["odasayisi"] = data["odasayisi"].astype(str).astype(int)
    data["salonsayisi"] = data["salonsayisi"].astype(str).astype(int)
    
    return data

In [16]:
def odasayisi_fonksiyonu_test(data):
   
    data = data.copy()
    data['odasayisi'] = data['odasayisi'].replace("-","0+0")  
    data[['odasayisi','salonsayisi']] = data['odasayisi'].str.split("+",expand=True)
    data["odasayisi"] = data["odasayisi"].astype(str).astype(int)
    data["salonsayisi"] = data["salonsayisi"].astype(str).astype(int)
    
    return data

In [17]:
train_set = odasayisi_fonksiyonu_train(train_set)
test_set = odasayisi_fonksiyonu_test(test_set)

In [18]:
label_encoder_cols = ("il","ilce","mahalle","emlaktipi","binayasi","bulundugukat","binadakikatsayisi","isitmatipi","otopark","manzara")

In [19]:
from sklearn.preprocessing import LabelEncoder
def label_encoder_fonksiyonu(data,cols):
    
    for col_name in label_encoder_cols:
        labelEncoder = LabelEncoder()
        labelEncoder.fit(data[col_name].values)
        data[col_name] = labelEncoder.transform(data[col_name].values)
        
    return data

In [20]:
train_set = label_encoder_fonksiyonu(train_set,label_encoder_cols)
test_set = label_encoder_fonksiyonu(test_set,label_encoder_cols)

In [21]:
def banyosayisi_fonksiyonu_train(data):
    
    data = data.copy()
    data = data.drop(data.index[data['banyosayisi'] == '-'], inplace = False)
    data = data.drop(data.index[data['banyosayisi'] == '6 ve üzeri'], inplace = False)   

    #data['banyosayisi'] = data['banyosayisi'].replace("-","-1")
    data['banyosayisi'] = data['banyosayisi'].astype(str).astype(int)
    
    return data

In [22]:
def banyosayisi_fonksiyonu_test(data):
    
    data = data.copy()
    # data = data.drop(data.index[data['banyosayisi'] == '-'], inplace = False)
    # data = data.drop(data.index[data['banyosayisi'] == '6 ve üzeri'], inplace = False)   
    data['banyosayisi'] = data['banyosayisi'].replace("-","7")
    data['banyosayisi'] = data['banyosayisi'].replace("6 ve üzeri","6")
    data['banyosayisi'] = data['banyosayisi'].astype(str).astype(int)
    
    return data

In [23]:
train_set = banyosayisi_fonksiyonu_train(train_set)
test_set = banyosayisi_fonksiyonu_test(test_set)

In [24]:
def object_to_int(data):
    
    data = data.copy()
    data[['interkom','depremyonetmeligineuygun','asansor','cocukoyunalani','giyinmeodasi','ebeveynbanyosu']] = data[['interkom',
                                                                                                                     'depremyonetmeligineuygun',
                                                                                                                     'asansor','cocukoyunalani',
                                                                                                                     'giyinmeodasi','ebeveynbanyosu']].astype(str).astype(int)
    
    return data

In [25]:
train_set = object_to_int(train_set)
test_set = object_to_int(test_set)

In [26]:
train_set = train_set.drop(['tarih','path'], axis = 1)
test_set = test_set.drop(['tarih','path'], axis = 1)

## Core - 2

In [27]:
# train_set = train_set.drop(['interkom', 'asansor'], axis = 1)
# test_set = test_set.drop(['interkom', 'asansor'], axis = 1)

In [70]:
train_set.drop(train_set[train_set['guncelfiyat'] > 10000000].index)
train_set.drop(train_set[train_set['odasayisi'] > 50].index)
train_set.drop(train_set[train_set['brutm2'] > 1000].index)
train_set.drop(train_set[train_set['netm2'] > 1000].index)

Unnamed: 0,guncelfiyat,odasayisi,brutm2,netm2,emlaktipi,binayasi,bulundugukat,binadakikatsayisi,isitmatipi,banyosayisi,...,giyinmeodasi,ebeveynbanyosu,Ilan_ID,yil,ay,gun,il,ilce,mahalle,salonsayisi
0,1050000,5,380.0,370.0,0,4,4,3,9,3,...,1,1,1,2019,1,1,0,7,19,2
1,547500,2,65.0,55.0,0,1,1,10,9,1,...,0,0,11,2019,1,1,0,15,183,1
2,440000,3,205.0,197.0,0,3,12,3,9,2,...,0,1,10,2019,1,1,0,7,19,1
3,1450000,3,150.0,130.0,0,7,14,8,12,2,...,0,1,8,2019,1,1,0,8,396,1
4,169000,2,70.0,65.0,0,2,26,9,9,1,...,1,1,7,2019,1,1,0,7,132,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64568,790000,3,180.0,174.0,7,1,30,6,8,2,...,0,0,64558,2020,4,19,1,26,98,1
64569,340000,3,130.0,125.0,0,14,12,7,5,2,...,0,0,64557,2020,4,19,1,16,503,1
64570,620000,3,180.0,175.0,5,1,30,6,8,2,...,0,0,64556,2020,4,19,1,26,98,1
64571,750000,3,125.0,115.0,0,1,1,9,10,2,...,1,1,64555,2020,4,19,1,18,163,1


In [29]:
#pip install scipy

In [30]:
# train_set.guncelfiyat
# train_set.describe().astype(int)

In [31]:
# from scipy import stats
# import numpy as np
# z = np.abs(stats.zscore(train_set))
# print(z)

In [32]:
# train_set = train_set[(z < 3).all(axis = 1)]

In [33]:
#train_set

In [34]:
train_set.to_csv('./datasets/final_train_set.csv')
test_set.to_csv('./datasets/final_test_set.csv')

In [53]:
train_set.describe()

Unnamed: 0,guncelfiyat,odasayisi,brutm2,netm2,emlaktipi,binayasi,bulundugukat,binadakikatsayisi,isitmatipi,banyosayisi,...,giyinmeodasi,ebeveynbanyosu,Ilan_ID,yil,ay,gun,il,ilce,mahalle,salonsayisi
count,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,...,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0,64224.0
mean,899852.4,2.905798,184.137877,126.55048,0.578055,4.365549,14.903074,6.688574,8.927161,1.548487,...,0.134809,0.322014,32266.456013,2019.246886,5.612715,15.939182,0.486251,15.249891,281.131898,1.050931
std,3597979.0,16.338385,2678.636384,767.058462,1.615331,4.414114,9.156972,2.874217,2.79084,0.737624,...,0.341523,0.467252,18636.336495,0.431203,3.640276,8.703065,0.499815,8.66149,184.447825,0.249204
min,172.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,2019.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,240000.0,2.0,100.0,89.0,0.0,1.0,12.0,6.0,8.0,1.0,...,0.0,0.0,16135.75,2019.0,2.0,9.0,0.0,7.0,132.0,1.0
50%,380000.0,3.0,125.0,110.0,0.0,2.0,14.0,7.0,9.0,1.0,...,0.0,0.0,32254.5,2019.0,5.0,16.0,0.0,15.0,242.0,1.0
75%,740000.0,3.0,154.0,135.0,0.0,7.0,20.0,8.0,9.0,2.0,...,0.0,1.0,48416.25,2019.0,9.0,23.0,1.0,23.0,459.0,1.0
max,510000000.0,1149.0,370000.0,145000.0,8.0,14.0,34.0,12.0,16.0,5.0,...,1.0,1.0,64573.0,2020.0,12.0,31.0,1.0,34.0,649.0,4.0


In [61]:
train_set.brutm2.describe()

count     64224.000000
mean        184.137877
std        2678.636384
min           1.000000
25%         100.000000
50%         125.000000
75%         154.000000
max      370000.000000
Name: brutm2, dtype: float64