In [1]:
import csv
import json
import pandas as pd
import pickle
from sklearn import metrics 
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_val_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from collections import Counter

In [2]:
# reading data from file as a dataframe
def read_data():  
    data_file = open("data_2018.txt","r")
    data_file1 = open("data_2017.txt","r")
    data_file2 = open("data_2016.txt","r")
    file_content = data_file.read()
    file_content1 = data_file1.read()
    file_content2 = data_file2.read()
    json_data = json.loads(file_content)
    json_data1 = json.loads(file_content1)
    json_data2 = json.loads(file_content2)
    json_data+=json_data1
    json_data+=json_data2
    df = pd.DataFrame(json_data)
#     data_file = open("data_2018.txt","r")
#     file_content = data_file.read()
#     json_data = json.loads(file_content)
#     df = pd.DataFrame(json_data)
    return df

In [3]:
# create a dataframe with selected column list,
# and make some filters on dataframe 'price' > 0 and 'TGroup' == specific value passed by user
def select_columns(df,column_list):
    new_df = df[column_list]
    new_df = new_df.loc[(new_df['Price'] > 0) & (df["SStatus"]=="For Sale")]
    return new_df

In [4]:
def fill_missing_values(df,column,value):
    if column == "Floor":
        df.replace('N/A', np.nan,inplace=True)
    else:
        df.replace('', np.nan,inplace=True)
    if value is None:
        value = "None"
    df[column]=df[column].fillna(value)

    return df

In [30]:
# labelized all the columns of dataframe
# def labelized_data(df):
#     for c in df.columns:
#         if df[c].dtype == "object":
#             lbl = preprocessing.LabelEncoder()
#             lbl.fit(list(df[c].values))
#             df[c] = lbl.transform(list(df[c].values))
#     return df

def labelized_data(df):
    from sklearn import preprocessing
    all_label_encoders = {}
    for c in df.columns:
        if df[c].dtype == "object":
            print("c",c)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df[c].values))
            df[c] = lbl.transform(list(df[c].values))
            all_label_encoders[c] = lbl
    pickle.dump(all_label_encoders,open("all_label_encoders.dict","wb"))

    return df

In [13]:
# creates X and Y
def create_x_y(new_df):
    X = new_df.drop("Price",axis=1)
    Y = new_df["Price"]
    return X,Y

In [20]:
# apply fit and calculate prediction and return mae
def check_models(model,X_train, X_test, y_train, y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    mean_absolute_error = metrics.mean_absolute_error(y_test, y_pred)    
    print('mean_absolute_error',mean_absolute_error)
    return mean_absolute_error,y_pred

In [21]:
def calc_models_mae(models,X_train, X_test, y_train, y_test):
    m={}
#     models_df = pd.DataFrame()
    for model in models:
        print("Applying ",model)
        model_m,y_pred = check_models(models.get(model),X_train, X_test, y_train, y_test)
        m[model] = model_m
#         models_df[model+"_y_pred"] = list(y_pred)
#         print('y test',list(y_test))
#         models_df[model+"_y_test"] = list(y_test)
#     models_df.to_csv('models_predictions.csv')
    return m

In [22]:
def get_top_models(models_mae):
    k = Counter(models_mae)
    top_most = k.most_common()[:-3:-1]
    top = []
    for t in top_most:
        top.append(t[0])
    return top

In [23]:
def get_lowest_model(models_mae):
    return min(models_mae, key=models_mae.get)

In [24]:
def get_models_list():
    GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.1,
                                   max_depth=7, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='lad', random_state =48)
    models={"GBoost":GBoost}
#     rfr = RandomForestRegressor(n_estimators=100)

#     model_xgb = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#            colsample_bytree=0.2, gamma=0.0, importance_type='gain',
#            learning_rate=0.1, max_delta_step=0, max_depth=7,
#            min_child_weight=1.5, missing=None, n_estimators=7200, n_jobs=1,
#            nthread=None, objective='reg:linear', random_state=42,
#            reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1, seed=42,
#            silent=1, subsample=0.1)

#     model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
#                                   learning_rate=0.05, n_estimators=720,
#                                   max_bin = 55, bagging_fraction = 0.8,
#                                   bagging_freq = 5, feature_fraction = 0.2319,
#                                   feature_fraction_seed=9, bagging_seed=9,
#                                   min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
    
#     lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
    
#     ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
    
#     KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
#     models = {"GBoost":GBoost,"rfr":rfr,"model_xgb":model_xgb,"model_lgb":model_lgb,"lasso":lasso,"ENet":ENet,"KRR":KRR}
        
    return models

In [25]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [26]:
def get_area_value(row):
    if row['FArea']>row['PArea']:
        return row['FArea']
    elif row['PArea']>row['FArea']:
        return row['PArea']
    else:
        return 0

def merge_area(df):
    df['Area'] = df.apply(get_area_value,axis=1)
    return df

In [32]:
''' 
this is the main function for each TGroup where you have to pass one 'group' from ["C","F","G","H","M","O","P","T","V"]
and a column list for the group given that must include two columns "Price" and "TGroup".
'''
def main(column_list):
    # read data from file as dataframe.
    df = read_data()
    
    # get only those columns mention in column_list.
    df = select_columns(df,column_list)
    
    df = merge_area(df)
    drop_columns = ['FArea','PArea','SStatus']
    df.drop(drop_columns,axis=1,inplace=True)
    for d in drop_columns:
        column_list.remove(d)
        
    # replace all blank cells by None from dataframe.
    missing_values = {"Floor":"No Floor","Views":"no Views","Form":"no Form"}
    for col in column_list:
        df = fill_missing_values(df,col,missing_values.get(col))
        
    # labelizing the column values if it is not int or float.
    df = labelized_data(df)
    
    # creates X and Y dataframe.
    X,Y = create_x_y(df)
    
    # creates X_train, X_test, y_train and y_test.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    # create following models object in a dictionary
    # [GradientBoostingRegressor, RandomForestRegressor, XGBRegressor, LGBMRegressor, Lasso, ElasticNet, KernelRidge] 
    models = get_models_list()
    
    # fit, predict price, and calculate mean absolute error for each model and return as a dictionary
    models_mae = calc_models_mae(models,X_train, X_test, y_train, y_test)
    
#     # get 2 models that have lowest mean absolute error
#     average_models = get_top_models(models_mae)
    
#     # creates object of AveragingModels class by with your 2 lowest value models
#     averaged_models = AveragingModels(models=(models.get(average_models[0]),models.get(average_models[1])))
    
#     # fit, predict price, and calculate mean absolute error for 2 lowest value models
#     print("Average of 2 top models")
#     average_pred,y_pred = check_models(averaged_models,X_train, X_test, y_train, y_test)
#     models_mae['average'] = average_pred
    
#     # getting lowest mean error value model
#     if models_mae.get(average_models[0]) < models_mae.get('average'):
#         lowest_value_model = average_models[0]
#     elif models_mae.get(average_models[0]) > models_mae.get('average'):
#         lowest_value_model = 'average'
        
    # model name
#     model_name = lowest_value_model+".sav"
    model_name = "GBoost_model.sav"
    
    # saving model
    print('model ',model_name," saving...")
    pickle.dump(models.get("GBoost"), open(model_name, 'wb'))
    
    return models_mae
    

In [31]:
column_list = ["Bedrooms","Location","Cars","Floor","Form","Type","Views","FArea","PArea","Price","SStatus"]
output = main(column_list)
print(output)

c Location
c Floor
c Form
c Type
c Views
Applying  GBoost
mean_absolute_error 186108.2959541833
model  GBoost_model.sav  saving...
{'GBoost': 186108.2959541833}


In [29]:
column_list = ["Bedrooms","Location","Cars","Floor","Form","Type","Views","FArea","PArea","Price","SStatus"]
output = main(column_list)

Applying  GBoost
mean_absolute_error 186108.2959541833
model  GBoost_model.sav  saving...


In [26]:
def count_blank_nan(new_df):
    count_nan=new_df.isnull().sum(axis=0)
    empty_count = new_df.replace('',np.nan).isnull().sum(axis=0)
    nan_attributes = count_nan.to_dict()
    empty_count = empty_count.subtract(count_nan, axis=0)
    empty_attributes = empty_count.to_dict()
    return empty_attributes,nan_attributes

In [65]:
def main2(group,file):
    df = read_data()
    drop_column_list = ["Board", "CArea", "DateInserted","SaleWriteup",\
                    "AvailableDate","Yearly","RightToBuild","OnPlan",\
                   "Declaration","CStatus","Keys","Ref","Street","UpdDate",\
                   "InspDate","GREDate","PropertyAmendments",\
                    "PropertyDetails","PlanNo", "SDate", "LeaseWriteup",\
                    "ClientSignature", "LStatus", "LDate", "CStatus", \
                    "Premium", "Valuation", "PriceSqMtr", "Signed","Regulation","TypeOfConstr",\
                   ]
    new_df = df.drop(drop_column_list,axis=1)
    new_df = new_df.loc[(new_df['Price'] > 0) & (new_df['TGroup'] == group) & (new_df['SStatus'] == "For Sale")]
    total = len(new_df.index)
    empty,nan=count_blank_nan(new_df)
    floor_Df = len(new_df.loc[new_df['Floor']=="N/A"].index)
    bedroom_df = len(new_df.loc[new_df['Bedrooms']==0].index)
    cars_df = len(new_df.loc[new_df['Cars']==0].index)
    daily_df = len(new_df.loc[new_df['Daily']==0].index)
    grent_df = len(new_df.loc[new_df['GRent']==0].index)
    gprice_df = len(new_df.loc[new_df['GarPrice']==0].index)
    monthly = len(new_df.loc[new_df['Monthly']==0].index)
    PArea = len(new_df.loc[new_df['PArea']==0].index)
    
    data=[empty,nan,{"Floor":floor_Df},{"Bedrooms":bedroom_df,"Cars":cars_df,"PArea":PArea,\
                                                   "Daily":daily_df,"GRent":grent_df,\
                                                  "GarPrice":gprice_df,"Monthly":monthly},{"Bedrooms":total}]
    df = pd.DataFrame(data,index = ['blank','nan','N/A','0','total'])

    df.to_csv(file)

In [66]:
files=[]
for typeg in ['C','F','G','H','M','O','P','T','V']:
    file = "group_"+typeg+".csv"
    main2(typeg,file)
    files.append(file)
    

In [67]:
csv_merge = open("all_group.csv", 'w')

for file in files:
    csv_in = open(file)
    csv_merge.write(file.split('.')[0])
    for line in csv_in:
        csv_merge.write(line)
    csv_merge.write('\n')
    csv_in.close()
csv_merge.close()

In [None]:
# few examples are here
column_list = ["Form","Bedrooms","Location","TGroup","Floor","FArea","Price"]
group = "F"
output = main(column_list,group)
print(output)

Applying  rfr
mean_absolute_error 58138.740204471236
Applying  model_lgb
mean_absolute_error 74864.29151806787
Applying  GBoost
mean_absolute_error 50068.40186224562
Applying  KRR


  overwrite_a=False)
  if getattr(data, 'base', None) is not None and \


mean_absolute_error 103625.06836916365
Applying  lasso
mean_absolute_error 121955.15116598991
Applying  model_xgb
mean_absolute_error 79203.73418397724
Applying  ENet
mean_absolute_error 121954.24270625929
Average of 2 top models


In [22]:
column_list = ["Price","TGroup","Cars","Form","Garage","Location","PArea","Bedrooms","Floor","Views"]
group = "M"
output = main(column_list,group)
print(output)

Applying  rfr
mean_absolute_error 51414.8287938913
Applying  model_lgb
mean_absolute_error 54050.451515636705
Applying  GBoost
mean_absolute_error 51864.23422085096
Applying  KRR
mean_absolute_error 67840.61450859367
Applying  lasso
mean_absolute_error 70893.55832471915
Applying  model_xgb


  if getattr(data, 'base', None) is not None and \


mean_absolute_error 62431.133138020836
Applying  ENet
mean_absolute_error 70892.97146192957
Average of 2 top models
mean_absolute_error 49793.75857561299
{'ENet': 70892.97146192957, 'rfr': 51414.8287938913, 'model_lgb': 54050.451515636705, 'average': 49793.75857561299, 'GBoost': 51864.23422085096, 'KRR': 67840.61450859367, 'lasso': 70893.55832471915, 'model_xgb': 62431.133138020836}


In [23]:
column_list = ["Price","TGroup","Form","Location","PArea","Floor","Views"]
group = "C"
output = main(column_list,group)
print(output)

Applying  rfr
mean_absolute_error 944507.0976702508
Applying  model_lgb
mean_absolute_error 871709.7004595605
Applying  GBoost
mean_absolute_error 541296.6334807018
Applying  KRR
mean_absolute_error 1049066.3287387125
Applying  lasso
mean_absolute_error 894255.5927289847
Applying  model_xgb


  if getattr(data, 'base', None) is not None and \


mean_absolute_error 1355802.2283266129
Applying  ENet
mean_absolute_error 894232.2679416342
Average of 2 top models
mean_absolute_error 692249.418000178
{'ENet': 894232.2679416342, 'rfr': 944507.0976702508, 'model_lgb': 871709.7004595605, 'average': 692249.418000178, 'GBoost': 541296.6334807018, 'KRR': 1049066.3287387125, 'lasso': 894255.5927289847, 'model_xgb': 1355802.2283266129}
