In [1]:
import pandas as pd
import numpy as np
# Read the three CSV inputs (sample submission, test set, training set)
# from the ./datasets directory, relative to the working directory.
sample_submission_df = pd.read_csv("./datasets/sample_submission_zingat.csv")
test_set = pd.read_csv("./datasets/zingat_usecase_testset_null.csv")
train_set = pd.read_csv("./datasets/zingat_usecase_trainset.csv")

## Rename Columns to ASCII-Only Names

In [2]:
def change_variable(data):
    """Return a new DataFrame whose Turkish column names are ASCII-only.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns may contain the non-ASCII names listed below.

    Returns
    -------
    pandas.DataFrame
        A renamed frame; ``data`` itself is left untouched because
        ``DataFrame.rename`` returns a new object by default. Columns that
        are not listed keep their names, and listed names that are missing
        from ``data`` are skipped.
    """
    ascii_names = {
        'odasayısı': 'odasayisi',
        'brütm²': 'brutm2',
        'netm²': 'netm2',
        'binayaşı': 'binayasi',
        'bulunduğukat': 'bulundugukat',
        'binadakikatsayısı': 'binadakikatsayisi',
        'isıtmatipi': 'isitmatipi',
        'İnterkom': 'interkom',
        'depremyönetmeliğineuygun': 'depremyonetmeligineuygun',
        'asansör': 'asansor',
        'Çocukoyunalanı': 'cocukoyunalani',
        'giyinmeodası': 'giyinmeodasi',
        'güncel_fiyat': 'guncelfiyat',
    }
    return data.rename(columns=ascii_names)

In [3]:
# Apply the column renaming to both the training and the test frame.
train_set = change_variable(train_set)
test_set = change_variable(test_set)

## Converting the VAR/YOK (Yes/None) Values to 1/0

In [4]:
def convert_values(data):
    """Return a copy of ``data`` with "VAR"/"YOK" flags replaced by 1/0.

    The six amenity columns listed below are rewritten: cells equal to
    "VAR" become 1 and cells equal to "YOK" become 0. Any other value is
    left as it is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the six flag columns.

    Returns
    -------
    pandas.DataFrame
        A modified copy. The input frame is not mutated, matching the other
        preprocessing helpers in this notebook, so re-running a cell that
        calls this function on the same input gives the same result.
    """
    data = data.copy()
    flag_columns = (
        "interkom",
        "depremyonetmeligineuygun",
        "asansor",
        "cocukoyunalani",
        "giyinmeodasi",
        "ebeveynbanyosu",
    )
    for column in flag_columns:
        # "VAR" is handled before "YOK", as in the original statement order.
        data.loc[data[column] == "VAR", column] = 1
        data.loc[data[column] == "YOK", column] = 0

    return data

In [5]:
# Encode the "VAR"/"YOK" flag columns as 1/0 in both frames.
train_set = convert_values(train_set)
test_set = convert_values(test_set)

## Convert Price

In [6]:
def convert_price(data):
    """Return a copy of ``data`` with ``guncelfiyat`` parsed to integers.

    The last three characters of every price string are dropped
    (presumably a currency suffix such as " TL" -- TODO confirm against the
    raw data) and the remainder is converted to ``int``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string-valued ``guncelfiyat`` column.

    Returns
    -------
    pandas.DataFrame
        A modified copy. The input frame is not mutated, so the function can
        be called again on the same raw frame.

    Raises
    ------
    ValueError
        If a trimmed value cannot be parsed as an integer.
    """
    data = data.copy()
    trimmed = data["guncelfiyat"].str[:-3]
    data["guncelfiyat"] = trimmed.astype(str).astype(int)
    return data

In [7]:
# Only the training frame is passed through convert_price in this cell.
train_set = convert_price(train_set)

In [8]:
def tarih_fonksiyonu(data):
    """Split the ``tarih`` column into integer ``yil``, ``ay`` and ``gun``.

    ``tarih`` is split on "-" into three pieces, which are stored as the
    new columns ``yil``, ``ay`` and ``gun`` and cast to ``int``. The
    ``tarih`` column itself is kept. Works on a copy, so the input frame is
    not modified.
    """
    result = data.copy()
    date_parts = ['yil', 'ay', 'gun']
    result[date_parts] = result["tarih"].str.split("-", expand=True)

    # Cast each piece separately: string first, then integer.
    for part in date_parts:
        result[part] = result[part].astype(str).astype(int)

    return result
   

In [9]:
# Add the integer yil / ay / gun columns derived from "tarih" to both frames.
train_set = tarih_fonksiyonu(train_set)
test_set = tarih_fonksiyonu(test_set)

In [10]:
def belirtilmemis_fonksiyonu(data):
    """Return a copy of ``data`` where "-" in ``manzara`` is replaced by "0".

    Only cells whose whole value is "-" are changed; other values in the
    column, and all other columns, are left as they are.
    """
    cleaned_view = data['manzara'].replace("-", "0")
    return data.assign(manzara=cleaned_view)

In [11]:
# Replace the "-" placeholder in "manzara" with "0" in both frames.
train_set = belirtilmemis_fonksiyonu(train_set)
test_set = belirtilmemis_fonksiyonu(test_set)

In [12]:
def path_split_fonksiyonu(data):
    """Split ``path`` on "/" into the columns ``il``, ``ilce`` and ``mahalle``.

    The split result is assigned to exactly three column names, so it has
    to produce three pieces. The ``path`` column is kept. Works on a copy,
    so the input frame is not modified.
    """
    result = data.copy()
    location_parts = result["path"].str.split("/", expand=True)
    result[['il', 'ilce', 'mahalle']] = location_parts
    return result

In [13]:
# Derive the il / ilce / mahalle columns from "path" in both frames.
train_set = path_split_fonksiyonu(train_set)
test_set = path_split_fonksiyonu(test_set)

In [14]:
def odasayisi_fonksiyonu_train(data):
    """Parse ``odasayisi`` into integer room and living-room counts.

    Rows whose ``odasayisi`` is "-" are dropped. The remaining values are
    split on "+": the first piece overwrites ``odasayisi`` and the second
    becomes the new column ``salonsayisi``; both are cast to ``int``.
    Works on a copy, so the input frame is not modified.
    """
    result = data.copy()

    # Remove the rows carrying the "-" placeholder (by index label).
    placeholder_labels = result.index[result['odasayisi'] == '-']
    result = result.drop(placeholder_labels)

    room_parts = result['odasayisi'].str.split("+", expand=True)
    result[['odasayisi', 'salonsayisi']] = room_parts
    for column in ("odasayisi", "salonsayisi"):
        result[column] = result[column].astype(str).astype(int)

    return result

In [15]:
def odasayisi_fonksiyonu_test(data):
    """Parse ``odasayisi`` into integer counts without dropping any row.

    Cells equal to "-" are first rewritten as "0+0", so they end up as
    zero rooms and zero living rooms. Every value is then split on "+":
    the first piece overwrites ``odasayisi`` and the second becomes the new
    column ``salonsayisi``; both are cast to ``int``. Works on a copy, so
    the input frame is not modified.
    """
    result = data.copy()

    filled = result['odasayisi'].replace("-", "0+0")
    result['odasayisi'] = filled

    room_parts = result['odasayisi'].str.split("+", expand=True)
    result[['odasayisi', 'salonsayisi']] = room_parts
    for column in ("odasayisi", "salonsayisi"):
        result[column] = result[column].astype(str).astype(int)

    return result

In [16]:
# The two frames are treated differently: the train variant drops rows whose
# odasayisi is "-", while the test variant keeps them and encodes them as 0+0.
train_set = odasayisi_fonksiyonu_train(train_set)
test_set = odasayisi_fonksiyonu_test(test_set)

In [17]:
# Names of the columns that are passed to label_encoder_fonksiyonu below.
label_encoder_cols = ("il","ilce","mahalle","emlaktipi","binayasi","bulundugukat","binadakikatsayisi","isitmatipi","otopark","manzara")

In [18]:
from sklearn.preprocessing import LabelEncoder
def label_encoder_fonksiyonu(data, cols):
    """Return a copy of ``data`` with the columns in ``cols`` label-encoded.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``cols``.
    cols : iterable of str
        Names of the columns to encode. This argument is now honoured;
        previously the module-level ``label_encoder_cols`` was iterated
        regardless of what was passed in.

    Returns
    -------
    pandas.DataFrame
        A modified copy in which each listed column holds the integer codes
        produced by a ``LabelEncoder``. The input frame is not mutated.

    Notes
    -----
    A fresh encoder is fitted on the frame passed in for every column, so
    the codes produced by two separate calls (for example one per data
    split) are not guaranteed to map the same category to the same integer.
    """
    data = data.copy()
    for col_name in cols:
        # Fit and encode in one step; equivalent to fit() followed by transform().
        data[col_name] = LabelEncoder().fit_transform(data[col_name].values)

    return data

In [19]:
# NOTE(review): each call fits its own encoders on the frame it receives, so
# the integer codes in train_set and test_set are produced independently of
# each other -- confirm the categories line up between the two frames.
train_set = label_encoder_fonksiyonu(train_set,label_encoder_cols)
test_set = label_encoder_fonksiyonu(test_set,label_encoder_cols)

In [20]:
def banyosayisi_fonksiyonu_train(data):
    """Drop non-numeric ``banyosayisi`` rows and cast the column to ``int``.

    Rows whose ``banyosayisi`` is "-" are removed first, then rows whose
    value is "6 ve üzeri"; the remaining values are converted to integers.
    Works on a copy, so the input frame is not modified.
    """
    result = data.copy()

    # Drop one placeholder at a time, in the same order as before.
    for unwanted in ('-', '6 ve üzeri'):
        matching_labels = result.index[result['banyosayisi'] == unwanted]
        result = result.drop(matching_labels)

    result['banyosayisi'] = result['banyosayisi'].astype(str).astype(int)

    return result

In [21]:
def banyosayisi_fonksiyonu_test(data):
    """Encode ``banyosayisi`` as integers without dropping any row.

    Cells equal to "-" become 7 and cells equal to "6 ve üzeri" become 6;
    all values are then cast to ``int``. Works on a copy, so the input
    frame is not modified.
    """
    result = data.copy()

    # NOTE(review): "-" is encoded as 7, i.e. above the "6 ve üzeri" bucket --
    # confirm that this sentinel value is intended.
    bath_counts = result['banyosayisi']
    for placeholder, substitute in (("-", "7"), ("6 ve üzeri", "6")):
        bath_counts = bath_counts.replace(placeholder, substitute)

    result['banyosayisi'] = bath_counts.astype(str).astype(int)

    return result

In [22]:
# The train variant drops rows with "-" or "6 ve üzeri" in banyosayisi;
# the test variant keeps every row and maps those values to 7 and 6.
train_set = banyosayisi_fonksiyonu_train(train_set)
test_set = banyosayisi_fonksiyonu_test(test_set)

In [23]:
def object_to_int(data):
    """Cast the six amenity flag columns to an integer dtype.

    Each listed column is converted to string and then to ``int``; all
    other columns are left as they are. Works on a copy, so the input frame
    is not modified.
    """
    flag_columns = [
        'interkom',
        'depremyonetmeligineuygun',
        'asansor',
        'cocukoyunalani',
        'giyinmeodasi',
        'ebeveynbanyosu',
    ]

    result = data.copy()
    as_text = result[flag_columns].astype(str)
    result[flag_columns] = as_text.astype(int)

    return result

In [24]:
# Cast the 1/0 flag columns to an integer dtype in both frames.
train_set = object_to_int(train_set)
test_set = object_to_int(test_set)

In [25]:
# Drop "tarih" and "path": the earlier cells already split them into
# yil/ay/gun and il/ilce/mahalle.
train_set = train_set.drop(['tarih','path'], axis = 1)
test_set = test_set.drop(['tarih','path'], axis = 1)

## Core - 2

In [26]:
# train_set = train_set.drop(['interkom', 'asansor'], axis = 1)
# test_set = test_set.drop(['interkom', 'asansor'], axis = 1)

In [32]:
# Remove extreme rows from the training frame only. Each comparison is strict,
# so a row sitting exactly on a threshold is kept.
too_expensive = train_set.index[train_set['guncelfiyat'] > 10000000]
train_set = train_set.drop(too_expensive)

too_many_rooms = train_set.index[train_set['odasayisi'] > 50]
train_set = train_set.drop(too_many_rooms)

gross_area_too_large = train_set.index[train_set['brutm2'] > 1000]
train_set = train_set.drop(gross_area_too_large)

net_area_too_large = train_set.index[train_set['netm2'] > 1000]
train_set = train_set.drop(net_area_too_large)

In [29]:
#pip install scipy

In [30]:
# train_set.guncelfiyat
# train_set.describe().astype(int)

In [31]:
# from scipy import stats
# import numpy as np
# z = np.abs(stats.zscore(train_set))
# print(z)

In [32]:
# train_set = train_set[(z < 3).all(axis = 1)]

In [33]:
#train_set

In [36]:
# Write the processed frames to ./datasets. No `index` argument is passed, so
# pandas also writes each frame's index as the first column of the file.
train_set.to_csv('./datasets/final_train_set.csv')
test_set.to_csv('./datasets/final_test_set.csv')

In [34]:
# Summary statistics of the training frame, cast to int for display.
train_set.describe().astype(int)

Unnamed: 0,guncelfiyat,odasayisi,brutm2,netm2,emlaktipi,binayasi,bulundugukat,binadakikatsayisi,isitmatipi,banyosayisi,...,giyinmeodasi,ebeveynbanyosu,Ilan_ID,yil,ay,gun,il,ilce,mahalle,salonsayisi
count,63468,63468,63468,63468,63468,63468,63468,63468,63468,63468,...,63468,63468,63468,63468,63468,63468,63468,63468,63468,63468
mean,699621,2,138,117,0,4,14,6,8,1,...,0,0,32245,2019,5,15,0,15,280,1
std,1001123,0,71,55,1,4,9,2,2,0,...,0,0,18615,0,3,8,0,8,184,0
min,172,1,1,1,0,0,0,0,0,0,...,0,0,1,2019,1,1,0,0,0,0
25%,240000,2,100,88,0,1,12,6,8,1,...,0,0,16153,2019,2,9,0,7,132,1
50%,375000,3,125,107,0,2,14,7,9,1,...,0,0,32226,2019,5,16,0,15,240,1
75%,715000,3,150,133,0,7,20,8,9,2,...,0,1,48384,2019,9,23,1,23,459,1
max,10000000,10,1000,999,8,14,34,12,16,5,...,1,1,64573,2020,12,31,1,34,649,4


In [35]:
# Show the column dtypes and non-null counts of the processed training frame.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63468 entries, 0 to 64572
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   guncelfiyat               63468 non-null  int32  
 1   odasayisi                 63468 non-null  int32  
 2   brutm2                    63468 non-null  float64
 3   netm2                     63468 non-null  float64
 4   emlaktipi                 63468 non-null  int32  
 5   binayasi                  63468 non-null  int32  
 6   bulundugukat              63468 non-null  int32  
 7   binadakikatsayisi         63468 non-null  int32  
 8   isitmatipi                63468 non-null  int32  
 9   banyosayisi               63468 non-null  int32  
 10  manzara                   63468 non-null  int32  
 11  otopark                   63468 non-null  int32  
 12  interkom                  63468 non-null  int32  
 13  depremyonetmeligineuygun  63468 non-null  int32  
 14  asanso