In [64]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")

In [32]:
train_df = pd.read_csv('Housing_dataset_train.csv')
test_df = pd.read_csv('Housing_dataset_test.csv')
sub = pd.read_csv('Sample_submission.csv')

In [33]:
train_df = train_df.reset_index(drop=True)
train_df = train_df.dropna(subset=['loc', 'title'])

train_df

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
0,3583,Katsina,Semi-detached duplex,2.0,2.0,1.0,1149999.565
1,2748,Ondo,Apartment,,2.0,4.0,1672416.689
3,2224,Anambra,Detached duplex,5.0,2.0,4.0,2410306.756
4,10300,Kogi,Terrace duplex,,5.0,6.0,2600700.898
5,1733,Borno,Mansion,,1.0,3.0,1341750.867
...,...,...,...,...,...,...,...
13994,10477,Taraba,Detached duplex,8.0,1.0,6.0,2837199.086
13995,6175,Edo,Bungalow,,7.0,,2367927.861
13996,9704,Kaduna,Apartment,,7.0,5.0,2228516.471
13997,11190,Plateau,Bungalow,8.0,6.0,5.0,2406812.693


In [34]:
train_df.drop_duplicates(inplace = True)

train_df.parking_space = train_df.parking_space.fillna(train_df.parking_space.mean())
train_df.bedroom = train_df.bedroom.fillna(train_df.bedroom.mean())
train_df.bathroom = train_df.bathroom.fillna(train_df.bathroom.mean())

train_df.isnull().sum()

ID               0
loc              0
title            0
bedroom          0
bathroom         0
parking_space    0
price            0
dtype: int64

In [35]:
train_df

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
0,3583,Katsina,Semi-detached duplex,2.000000,2.0,1.000000,1149999.565
1,2748,Ondo,Apartment,4.296861,2.0,4.000000,1672416.689
3,2224,Anambra,Detached duplex,5.000000,2.0,4.000000,2410306.756
4,10300,Kogi,Terrace duplex,4.296861,5.0,6.000000,2600700.898
5,1733,Borno,Mansion,4.296861,1.0,3.000000,1341750.867
...,...,...,...,...,...,...,...
13994,10477,Taraba,Detached duplex,8.000000,1.0,6.000000,2837199.086
13995,6175,Edo,Bungalow,4.296861,7.0,3.163883,2367927.861
13996,9704,Kaduna,Apartment,4.296861,7.0,5.000000,2228516.471
13997,11190,Plateau,Bungalow,8.000000,6.0,5.000000,2406812.693


In [36]:
print("missing values:", train_df.isnull().sum())

missing values: ID               0
loc              0
title            0
bedroom          0
bathroom         0
parking_space    0
price            0
dtype: int64


In [37]:
label_encoding = LabelEncoder()

train_df['loc'] = label_encoding.fit_transform(train_df['loc'])
train_df['title'] = label_encoding.fit_transform(train_df['title'])

train_df

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,price
0,3583,19,7,2.000000,2.0,1.000000,1149999.565
1,2748,27,0,4.296861,2.0,4.000000,1672416.689
3,2224,3,3,5.000000,2.0,4.000000,2410306.756
4,10300,21,8,4.296861,5.0,6.000000,2600700.898
5,1733,7,5,4.296861,1.0,3.000000,1341750.867
...,...,...,...,...,...,...,...
13994,10477,33,3,8.000000,1.0,6.000000,2837199.086
13995,6175,11,1,4.296861,7.0,3.163883,2367927.861
13996,9704,17,0,4.296861,7.0,5.000000,2228516.471
13997,11190,30,1,8.000000,6.0,5.000000,2406812.693


In [38]:
test_df['loc'] = label_encoding.fit_transform(test_df['loc'])
test_df['title'] = label_encoding.fit_transform(test_df['title'])

test_df

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space
0,845,18,6,4,1,2
1,1924,1,0,2,2,4
2,10718,1,1,2,7,2
3,12076,23,5,9,5,2
4,12254,14,7,5,6,1
...,...,...,...,...,...,...
5995,1594,12,4,4,5,2
5996,2416,1,8,5,7,1
5997,10195,29,9,4,1,4
5998,9455,4,4,3,7,5


In [39]:
train_df['loc'].unique()

array([19, 27,  3, 21,  7, 22, 28, 17, 26,  5,  0, 31, 33, 10, 20, 13, 11,
       24, 18,  9, 34,  6,  4,  8, 25,  1, 30, 15, 29, 35, 32, 16, 14, 23,
        2, 12])

In [40]:
train_df['title'].unique()

array([7, 0, 3, 8, 5, 1, 6, 9, 4, 2])

In [41]:
test_df['loc'].unique()

array([18,  1, 23, 14, 11, 25,  3,  6, 26, 21, 33, 34, 35,  7, 13, 12,  9,
       28, 29,  4, 22, 19,  8, 27,  2, 24, 10, 30, 17, 32, 20,  5, 16,  0,
       31, 15])

In [42]:
test_df['title'].unique()

array([6, 0, 1, 5, 7, 3, 4, 9, 2, 8])

In [43]:
train_df['bbratio'] = train_df['bathroom']/train_df['bedroom']

In [65]:
data = train_df.drop("price", axis=1).append(test_df)

In [45]:
train_df['title'] = train_df['title'].astype(int)

In [46]:
test_df['bbratio'] = test_df['bathroom']/test_df['bedroom']

In [47]:
data

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,bbratio
0,3583,19,7,2.000000,2.0,1.0,1.000000
1,2748,27,0,4.296861,2.0,4.0,0.465456
3,2224,3,3,5.000000,2.0,4.0,0.400000
4,10300,21,8,4.296861,5.0,6.0,1.163640
5,1733,7,5,4.296861,1.0,3.0,0.232728
...,...,...,...,...,...,...,...
5995,1594,12,4,4.000000,5.0,2.0,
5996,2416,1,8,5.000000,7.0,1.0,
5997,10195,29,9,4.000000,1.0,4.0,
5998,9455,4,4,3.000000,7.0,5.0,


In [48]:
X = data[:train_df.shape[0]]
Y = train_df.price
test_data = data[train_df.shape[0]:]

test_data

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,bbratio
0,845,18,6,4.0,1.0,2.0,
1,1924,1,0,2.0,2.0,4.0,
2,10718,1,1,2.0,7.0,2.0,
3,12076,23,5,9.0,5.0,2.0,
4,12254,14,7,5.0,6.0,1.0,
...,...,...,...,...,...,...,...
5995,1594,12,4,4.0,5.0,2.0,
5996,2416,1,8,5.0,7.0,1.0,
5997,10195,29,9,4.0,1.0,4.0,
5998,9455,4,4,3.0,7.0,5.0,


In [49]:
X

Unnamed: 0,ID,loc,title,bedroom,bathroom,parking_space,bbratio
0,3583,19,7,2.000000,2.0,1.000000,1.000000
1,2748,27,0,4.296861,2.0,4.000000,0.465456
3,2224,3,3,5.000000,2.0,4.000000,0.400000
4,10300,21,8,4.296861,5.0,6.000000,1.163640
5,1733,7,5,4.296861,1.0,3.000000,0.232728
...,...,...,...,...,...,...,...
13994,10477,33,3,8.000000,1.0,6.000000,0.125000
13995,6175,11,1,4.296861,7.0,3.163883,1.629096
13996,9704,17,0,4.296861,7.0,5.000000,1.629096
13997,11190,30,1,8.000000,6.0,5.000000,0.750000


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [51]:
lgb = LGBMRegressor()
lgb.fit(X_train, y_train)
lgb_preds = lgb.predict(X_test)

print(f'mse = {mean_squared_error(y_test, lgb_preds, squared=False)}')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 8424, number of used features: 7
[LightGBM] [Info] Start training from score 2131946.679554
mse = 383177.05698390736


In [52]:
predictions = lgb.predict(test_df)
predictions

array([2110463.48036149, 1030752.35448767, 1200625.03977689, ...,
       2022342.58073244, 1361637.95023712, 3493006.58341886])

In [53]:
fold_pred=[]
oof_pred = []

params = {
    'n_estimators': 500,
    'colsample_bytree': 0.86,
 'learning_rate': 0.032,
 'max_depth': 7,
 'subsample': 0.85}


fold = KFold(n_splits=7, shuffle=True)#15#5#10
i=1
for train_index, test_index in fold.split(X,Y):     
  
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = np.log1p(Y.iloc[train_index]), Y.iloc[test_index]

    model = LGBMRegressor(**params, objective = "rmse")
    model.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)])#erly100

    preds= model.predict(X_test)
    print("err: ",(mean_squared_error(y_test,np.expm1(preds), squared=False)))  #Reverse transformation
    oof_pred.append(mean_squared_error(y_test,np.expm1(preds),squared=False))
    
    p2 = model.predict(test_data[X.columns])
    fold_pred.append(np.expm1(p2))
    

print(np.mean(oof_pred))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 9026, number of used features: 7
[LightGBM] [Info] Start training from score 14.466834
err:  428022.1881133808
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 9026, number of used features: 7
[LightGBM] [Info] Start training from score 14.469397
err:  450729.1915324796
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 9026, number of used features: 7
[LightGBM] [Info] Start training from score 14.468764
err:  481136.78436778253
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 391
[LightGBM] [Info] Number of data points in the train set: 9027, number of used features: 7
[LightGBM] [Info] Start training from

err:  453295.0683653013
425904.86530335515


In [57]:
oof_pred

[428022.1881133808,
 450729.1915324796,
 481136.78436778253,
 345471.335078957,
 400093.0768673809,
 422586.4127982039,
 453295.0683653013]

In [58]:
fold_pred

[array([2286373.84926395, 1064955.00627636, 1251731.86217336, ...,
        2099512.68373304, 1344300.60889146, 3250951.88335936]),
 array([2320473.28739346, 1077086.03246816, 1253550.49975732, ...,
        2093033.70935823, 1457734.12646811, 3284274.37923654]),
 array([2272703.29381462, 1087235.75490477, 1255445.64163077, ...,
        2055312.75226159, 1412834.91129614, 3379849.632681  ]),
 array([2258467.35464342, 1085463.6098681 , 1239644.99938098, ...,
        2155248.14029661, 1470379.99072775, 3386463.98311498]),
 array([2170748.11045867, 1045970.5090689 , 1192918.36689676, ...,
        2031782.66421574, 1432806.51783996, 3352390.5147385 ]),
 array([2319135.49139207, 1093503.3060932 , 1223867.9172493 , ...,
        2053367.63727786, 1403916.01174038, 3250088.27600446]),
 array([2295081.8146327 , 1060587.11859101, 1263151.26501479, ...,
        2063043.80735366, 1449182.85383606, 3408950.50195759])]

In [59]:
sub['price'] =  np.mean(fold_pred, axis = 0)

In [60]:
sub.to_csv('Light gb sun 56.csv', index=False)

In [62]:
sub['price'] = predictions

In [63]:
sub.to_csv('Light gb sun 561.csv', index=False)