## Data collection

### read and filter dataset

In [1]:
# read and filter dataset
df = pd.read_csv('Datasets/Rents & Transactions/transactions.csv', sep = ';')

# dates feats
# errors='coerce' turns unparseable dates into NaT. Those rows are removed right away:
# the date filter below would discard them anyway, and an integer year cannot be
# derived from NaT (the previous string-slicing approach raised on the text 'NaT').
df['Transaction Date'] = pd.to_datetime(df['Transaction Date'], errors = 'coerce', dayfirst = True)
df = df[df['Transaction Date'].notna()].copy()

# first day of the transaction month (datetime) and calendar year (int)
df['dt_month'] = df['Transaction Date'].dt.to_period('M').dt.to_timestamp()
df['dt_year'] = df['Transaction Date'].dt.year.astype(int)

df['tr_year_num'] = df['dt_year']
df['tr_month_num'] = df['dt_month'].dt.month

# filtering
# keep only the first row seen for every transaction number
df = df.drop_duplicates(subset = ['Transaction Number'], keep = 'first')

# sales strictly after 2012-01-01 00:00
f1 = df['Transaction Date'] > '2012-01-01'
f2 = df['Transaction Type'] == 'Sales'
filt = f1&f2
print(df.shape)
# chronological order with a fresh 0..n-1 index
df = df[filt].sort_values('Transaction Date').reset_index(drop = True)
print(df.shape)

(911106, 27)
(484311, 27)


In [2]:
# Amount of the preceding row that has the same Property ID
# (NaN for the first row of each property)
feat = 'Property ID'
df[f'{feat}_last'] = df.groupby(feat)['Amount'].shift(periods = 1)

### categorical to numeric or flags

In [3]:
# Room(s): show the raw label distribution, then copy the PENTHOUSE label
# into the sub type column for the rows that carry it
feat = 'Room(s)'
display(df[feat].value_counts())
df.loc[df[feat].eq('PENTHOUSE'), 'Property Sub Type'] = 'PENTHOUSE'

def find_num(x):
    """Return the integer written before ' B/R' in the text form of `x`.

    Returns NaN when the text does not contain ' B/R'.
    """
    head, marker, _ = str(x).partition(' B/R')
    return int(head) if marker else np.nan
    
# lookup from every distinct non-null label to its bedroom count;
# labels without ' B/R' get NaN from find_num
repl1 = {label: find_num(label) for label in df[feat].dropna().unique()}
# two labels receive hand-picked numeric codes
repl1.update({'Single Room': -1, 'Studio': 0})

df[feat] = df[feat].map(repl1)

Room(s)
1 B/R          127065
2 B/R           87329
Studio          81541
3 B/R           52704
Office          25598
4 B/R           16542
Shop             5379
5 B/R            1617
PENTHOUSE         745
Single Room       294
6 B/R             141
Store Room         33
7 B/R              24
Hotel               7
GYM                 6
8 B/R               2
Name: count, dtype: int64

In [4]:
# flags
# column -> the single value encoded as 1; every other value (missing included) becomes 0
flag_values = {
    'Usage': 'Residential',
    'Registration type': 'Ready',
    'Is Free Hold?': 'Free Hold',
}
for feat, positive in flag_values.items():
    repl1 = {positive: 1}
    df[feat] = df[feat].map(repl1).fillna(0).astype(int)

# Parking: 1 when any value is present, 0 when it is missing
feat = 'Parking'
df[feat] = df[feat].notnull().astype(int)

### categorical cleaning - rare values set to empty string

In [5]:
# frequency threshold for categorical values: the cleaning cells keep a value only when its count is > cnt_min
cnt_min = 99

In [6]:
# Property Type: missing -> '', trim surrounding whitespace, apply manual renames
feat = 'Property Type'
repl1 = {}  # manual renames (none for this column)
df[feat] = (
    df[feat]
    .fillna('')
    .str.strip()
    .map(lambda x: repl1.get(x, x))
)
df[feat].value_counts()

Property Type
Unit        359012
Land         82583
Building     42716
Name: count, dtype: int64

In [7]:
# Property Sub Type, stored as '<Property Type>_<sub type>' (missing sub type -> '')
feat = 'Property Sub Type'
df[feat] = df['Property Type'] + '_' + df[feat].fillna('')

repl1 = {}  # manual renames (none for this column)
df[feat] = (
    df[feat]
    .fillna('')
    .str.strip()
    .map(lambda x: repl1.get(x, x))
)

# values seen cnt_min times or fewer are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
df[feat].value_counts()

Property Sub Type
Unit_Flat                  299139
Building_Villa              42616
Land_Residential            42508
Unit_Office                 26607
Land_Commercial             23953
Unit_Hotel Apartment        17270
Unit_Hotel Rooms             8920
Land_                        7158
Unit_Shop                    5641
Land_Residential Flats       1773
Land_Villa                   1736
Land_Land                    1284
Land_Industrial              1022
Land_General Use              880
Unit_PENTHOUSE                745
Land_Airport                  420
                              408
Land_Unit                     400
Land_Labor Camp               318
Land_Government Housing       316
Land_Sports Club              305
Unit_Stacked Townhouses       270
Unit_Workshop                 211
Land_Agricultural             195
Land_Warehouse                116
Building_Building             100
Name: count, dtype: int64

In [8]:
# Nearest Landmark: missing -> '', trim surrounding whitespace, apply manual renames
feat = 'Nearest Landmark'
repl1 = {}  # manual renames (none for this column)
df[feat] = (
    df[feat]
    .fillna('')
    .str.strip()
    .map(lambda x: repl1.get(x, x))
)

# values seen cnt_min times or fewer are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
df[feat].value_counts()

Nearest Landmark
Sports City Swimming Academy         103307
Burj Al Arab                          79980
Downtown Dubai                        72086
                                      49371
Motor City                            37189
IMG World Adventures                  32561
Burj Khalifa                          26884
Dubai Cycling Course                  24681
Dubai International Airport           22804
Expo 2020 Site                        20995
Global Village                         4344
Al Makhtoum International Airport      3867
Dubai Parks and Resorts                3168
Hamdan Sports Complex                  3074
Name: count, dtype: int64

In [9]:
# Nearest Mall: missing -> '', trim surrounding whitespace, apply manual renames
feat = 'Nearest Mall'
repl1 = {}  # manual renames (none for this column)
df[feat] = (
    df[feat]
    .fillna('')
    .str.strip()
    .map(lambda x: repl1.get(x, x))
)

# values seen cnt_min times or fewer are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
df[feat].value_counts()

Nearest Mall
Marina Mall             146364
                        121528
Dubai Mall              104383
Mall of the Emirates     44436
City Centre Mirdif       39872
Ibn-e-Battuta Mall       27728
Name: count, dtype: int64

In [10]:
# Nearest Metro: missing -> '', trim surrounding whitespace, merge a misspelled station name
feat = 'Nearest Metro'
repl1 = {'Jumeirah Beach Resdency': 'Jumeirah Beach Residency'}
df[feat] = (
    df[feat]
    .fillna('')
    .str.strip()
    .map(lambda x: repl1.get(x, x))
)

# values seen cnt_min times or fewer are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
df[feat].value_counts()

Nearest Metro
                                        120606
Buj Khalifa Dubai Mall Metro Station     48572
Business Bay Metro Station               39306
Nakheel Metro Station                    32801
Dubai Internet City                      27680
Rashidiya Metro Station                  27639
Damac Properties                         24042
Jumeirah Lakes Towers                    23169
Jumeirah Beach Residency                 19367
First Abu Dhabi Bank Metro Station       13514
Sharaf Dg Metro Station                  11409
Ibn Battuta Metro Station                10626
Harbour Tower                            10421
Creek Metro Station                       9842
Mina Seyahi                               9827
Palm Jumeirah                             9229
Marina Towers                             6250
Noor Bank Metro Station                   5457
DANUBE Metro Station                      5335
Marina Mall Metro Station                 4098
Dubai Marina                              3854

In [11]:
# Project: normalise case and whitespace; the three prints show the number of
# distinct values before normalising, after normalising, and after blanking rare values
feat = 'Project'
print(df[feat].nunique())
df[feat] = df[feat].str.lower().str.strip()
print(df[feat].nunique())

# values seen cnt_min times or fewer, and missing values, are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
print(df[feat].nunique())
df[feat].value_counts()

1411
1411
921


Project
                                  140441
remraam                             4421
sky courts                          2576
international city emarati          2565
al khail heights                    2344
                                   ...  
canal front residences-cf3&cf4       102
azizi riviera 18                     101
city center residences               101
polo homes                           100
sahara meadows2                      100
Name: count, Length: 921, dtype: int64

In [12]:
# Area: normalise case and whitespace; the three prints show the number of
# distinct values before normalising, after normalising, and after blanking rare values
feat = 'Area'
print(df[feat].nunique())
df[feat] = df[feat].str.lower().str.strip()
print(df[feat].nunique())

# values seen cnt_min times or fewer, and missing values, are replaced with ''
repl1 = df[feat].value_counts()
repl1 = repl1[repl1 > cnt_min]
df[feat] = df[feat].where(df[feat].isin(repl1.index), '')
print(df[feat].nunique())
df[feat].value_counts()

305
299
160


Area
business bay                         41177
dubai marina                         34254
burj khalifa                         28358
jumeirah village circle              26535
jumeirah lakes towers                26527
international city ph 1              21628
dubai sports city                    17275
palm jumeirah                        14650
silicon oasis                        13647
dubai hills                          11442
akoya oxygen                          9934
al furjan                             9577
dubai creek harbour                   9390
damac hills                           8696
town square                           7699
dubai production city                 7082
arjan                                 7001
the greens                            6571
jumeirah beach residence              6514
dubai investment park first           6461
emirate living                        5940
mira                                  5628
jumeirah village triangle             5552
remraa

### categorical encoding

In [13]:
# categorical feats encoding
feats_cat = [
    'Property Type', 'Property Sub Type', 'Area',
    'Nearest Metro', 'Nearest Mall', 'Nearest Landmark', 'Project',
]

# label encoder: each column gets an integer-coded companion named '<column>_lbl'
for feat in feats_cat:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(df[feat])
    df[f'{feat}_lbl'] = lbl.transform(df[feat])

# NOTE: one-hot encoding of the '_lbl' columns (preprocessing.OneHotEncoder with
# sparse_output=False, drop='first') was considered as an alternative and is not applied.

In [14]:
# target encoders by average value in the previous month
# For every categorical feature except the first ('Property Type', which is part of every
# group key) add '<feat>_trg': the mean Amount of the same (Property Type, feat) pair in the
# closest earlier month that has data for that pair. The current month is never included,
# and the first month of a pair gets NaN.
for feat in feats_cat[1:]:
    feats_gr = ['dt_month', 'Property Type', feat]
    trg_col = f'{feat}_trg'

    # monthly mean per group; groupby sorts its keys, so months ascend inside each pair
    df1 = df.groupby(feats_gr).Amount.mean().reset_index()
    df1[trg_col] = df1.groupby(feats_gr[1:]).Amount.shift(1)
    del df1['Amount']

    # a column left over from an earlier run of this cell would otherwise collide with the new one
    df = df.drop(columns = [trg_col], errors = 'ignore')
    # explicit join keys instead of "all common columns"; validate raises if the lookup
    # table ever has duplicate keys, which would multiply rows of df
    df = df.merge(df1, how = 'left', on = feats_gr, validate = 'many_to_one')

In [15]:
# pickle only the rows dated before 2022-06-01; the shape and preview below cover the full frame
df.loc[df['Transaction Date'] < '2022-06-01'].to_pickle('data1.pkl')
print(df.shape)
df.head(5)

(484311, 41)


Unnamed: 0,Transaction Number,Transaction Date,Property ID,Transaction Type,Transaction sub type,Registration type,Is Free Hold?,Usage,Area,Property Type,Property Sub Type,Amount,Transaction Size (sq.m),Property Size (sq.m),Room(s),Parking,Nearest Metro,Nearest Mall,Nearest Landmark,No. of Buyer,No. of Seller,Master Project,Project,dt_month,dt_year,tr_year_num,tr_month_num,Property ID_last,Property Type_lbl,Property Sub Type_lbl,Area_lbl,Nearest Metro_lbl,Nearest Mall_lbl,Nearest Landmark_lbl,Project_lbl,Property Sub Type_trg,Area_trg,Nearest Metro_trg,Nearest Mall_trg,Nearest Landmark_trg,Project_trg
0,11-1-2012,2012-01-02 08:16:00,236028,Sales,Sale,1,0,0,al suq al kabeer,Land,Land_Commercial,2400000.0,201.79,201.79,,0,Al Ghubaiba Metro Station,Dubai Mall,Burj Khalifa,1.0,1.0,,,2012-01-01,2012,2012,1,,1,6,31,6,2,3,0,,,,,,
1,11-2-2012,2012-01-02 08:22:00,90111,Sales,Sale,1,0,1,mirdif,Land,Land_Residential,2784375.0,940.64,940.64,,0,Rashidiya Metro Station,City Centre Mirdif,Dubai International Airport,1.0,1.0,,,2012-01-01,2012,2012,1,,1,12,116,43,1,6,0,,,,,,
2,41-8804-2010,2012-01-02 08:35:00,738093,Sales,Delayed Sell,1,1,1,palm jumeirah,Unit,Unit_Flat,2662200.0,190.05,190.05,2.0,1,Palm Jumeirah,Marina Mall,Burj Al Arab,1.0,1.0,,marina residence,2012-01-01,2012,2012,1,,2,18,126,42,5,2,522,,,,,,
3,11-4-2012,2012-01-02 08:36:00,426717,Sales,Sale,1,1,1,emirate living,Building,Building_Villa,1290000.0,165.51,165.51,2.0,0,Damac Properties,Marina Mall,Sports City Swimming Academy,1.0,1.0,,,2012-01-01,2012,2012,1,,0,2,78,20,5,13,0,,,,,,
4,11-3-2012,2012-01-02 08:39:00,567015,Sales,Sale,1,1,1,burj khalifa,Unit,Unit_Flat,1610000.0,113.43,113.43,2.0,1,Buj Khalifa Dubai Mall Metro Station,Dubai Mall,Downtown Dubai,1.0,1.0,,,2012-01-01,2012,2012,1,,2,18,49,16,2,4,0,,,,,,


In [16]:
# all column names of the frame as a plain Python list
df.columns.tolist()

['Transaction Number',
 'Transaction Date',
 'Property ID',
 'Transaction Type',
 'Transaction sub type',
 'Registration type',
 'Is Free Hold?',
 'Usage',
 'Area',
 'Property Type',
 'Property Sub Type',
 'Amount',
 'Transaction Size (sq.m)',
 'Property Size (sq.m)',
 'Room(s)',
 'Parking',
 'Nearest Metro',
 'Nearest Mall',
 'Nearest Landmark',
 'No. of Buyer',
 'No. of Seller',
 'Master Project',
 'Project',
 'dt_month',
 'dt_year',
 'tr_year_num',
 'tr_month_num',
 'Property ID_last',
 'Property Type_lbl',
 'Property Sub Type_lbl',
 'Area_lbl',
 'Nearest Metro_lbl',
 'Nearest Mall_lbl',
 'Nearest Landmark_lbl',
 'Project_lbl',
 'Property Sub Type_trg',
 'Area_trg',
 'Nearest Metro_trg',
 'Nearest Mall_trg',
 'Nearest Landmark_trg',
 'Project_trg']