**Data Loading**

Import libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import warnings

Unzip master data

In [None]:
# NOTE(review): master_data_path is assigned but never read in the visible notebook.
master_data_path=''
# Extract the archive into the current working directory.
# NOTE(review): re-running after a successful extraction may make unzip ask before overwriting — confirm, or pass -o.
!unzip /content/master_data.zip

Load data to pandas df

In [None]:
# read data
# NOTE(review): assumes the archive from the unzip step extracts master_data.csv into /content — confirm.
data_path="/content/master_data.csv"
houston_data = pd.read_csv(data_path)
# Preview the first rows of the loaded frame
houston_data.head()

Get general information and null values count

In [None]:
# General info: number of columns and number of rows
no_columns=len(houston_data.columns)
print('Total columns',":",no_columns)
print('Total data count',len(houston_data))
# Null count and dtype for every column, listed by column position
print("Null values info")
nulls=houston_data.isnull().sum(axis = 0)
# Walk names, null counts and dtypes in lockstep instead of using an integer
# key on the label-indexed dtypes Series: that positional [] fallback is
# deprecated in pandas, and zipping the values also copes with duplicate names.
for pos,(col,null_count,col_dtype) in enumerate(zip(houston_data.columns,nulls,houston_data.dtypes)):
  print(pos,col,":",null_count,'Type :',col_dtype)

**Data Distribution Analysis**

In [None]:
# import matplotlib.pyplot as plt
# Box plot of the 'Vacancy %' column over all rows of the raw frame
houston_data.boxplot(column=['Vacancy %'])

In [None]:
# features = houston_data.drop('', axis=1)
# 'Vacancy %' values; sampled for the histogram in the next cell
labels1 = houston_data['Vacancy %']
# NOTE(review): labels2 is not referenced again in the visible notebook.
labels2 = houston_data['Vacant SF Direct']

In [None]:
# Histogram of a random sample of 'Vacancy %' values (fixed seed for repeatability).
# The sample size is capped at the number of available rows: sample() raises
# ValueError when n exceeds the population and replace is False.
sample_size=min(100000,len(labels1))
sampled_labels=labels1.sample(n=sample_size,random_state=1)
sampled_labels.plot(kind='hist')

In [None]:
# Keep only the property id and vacancy columns, then average vacancy per property id
houston_data_sub=houston_data.loc[:, ['Property ID','Vacancy %']]
Data_based_Id=houston_data_sub.groupby(by=['Property ID']).mean()

In [None]:
# Row counts before and after grouping by 'Property ID'
print('Before grouping',len(houston_data_sub))
print('After grouping',len(Data_based_Id))

In [None]:
# Box plot of the per-property mean 'Vacancy %'
Data_based_Id.boxplot(column=['Vacancy %'])

In [None]:
# Data_based_Id.head()
# Histogram of a random sample of per-property mean vacancy values.
# The sample size is capped at the number of groups: sample() raises
# ValueError when n exceeds the population and replace is False.
sample_size=min(10000,len(Data_based_Id))
sampled_labels=Data_based_Id.sample(n=sample_size,random_state=1)
sampled_labels.plot(kind='hist')

**Data Cleaning and Manipulation**

Build an updated dataframe in which categorical values are converted to numeric one-hot-encoded (OHE) columns

In [None]:
import numpy as np

def convert_name(original_name,name):
  """Build the column label for one one-hot-encoded category.

  Args:
    original_name: Name of the source column being encoded.
    name: Category value. It may be a non-string (for example a number),
      in which case it is converted with ``str`` before concatenation.

  Returns:
    The string ``"<original_name>:<name>"``.
  """
  # str() leaves string categories unchanged and avoids a TypeError when the
  # dummy-column labels are not strings (ints, floats, bools).
  return original_name+":"+str(name)

# One-hot encode 'Secondary Type', prefix every dummy column with the source
# column name, and swap the encoded columns in for the original column.
dummy_secondary_type=pd.get_dummies(houston_data['Secondary Type'])
secondary_type_labels={category: convert_name('Secondary Type',category) for category in dummy_secondary_type.columns}
houston_data_updated=pd.concat(
    [houston_data, dummy_secondary_type.rename(columns=secondary_type_labels)],
    axis=1,
).drop(['Secondary Type'], axis=1)
# Release the temporaries; only houston_data_updated is needed from here on
del dummy_secondary_type, secondary_type_labels


Set columns for dropping and conversion

In [None]:
# columns to drop
# NOTE(review): both lists are empty in this copy of the notebook, so the drop
# and OHE cells that consume them do nothing until names are filled in.
columns_drop = []
# columns for OHE
columns_ohe = []

In [None]:
# columns to convert from Y,N to 1,0
# NOTE(review): empty here, so the Y/N conversion loop is a no-op until column names are added.
columns_01 = []

Drop columns

In [None]:
# Remove the columns listed in columns_drop.
# NOTE(review): the frame is reassigned to itself, so re-running this cell with a
# non-empty columns_drop raises KeyError for the columns that are already gone.
houston_data_updated = houston_data_updated.drop(columns_drop, axis=1)

Convert OHE using loop

In [None]:
# One-hot encode every column listed in columns_ohe that is still present.
# Values are read from houston_data_updated (the frame that is checked and
# modified) rather than from the raw houston_data, and all dummy frames are
# concatenated once instead of copying the whole frame on every iteration.
ohe_frames=[]
ohe_encoded=[]
for col_name in columns_ohe:
  # Skip names that are absent from the frame and repeated entries in columns_ohe
  if col_name in houston_data_updated.columns and col_name not in ohe_encoded:
    dummies=pd.get_dummies(houston_data_updated[col_name])
    ohe_frames.append(dummies.rename(columns={x: convert_name(col_name,x) for x in dummies.columns}))
    ohe_encoded.append(col_name)
    del dummies
if ohe_frames:
  # Encoded columns are appended after the remaining columns, in columns_ohe order
  houston_data_updated=pd.concat([houston_data_updated.drop(ohe_encoded, axis=1)]+ohe_frames, axis=1)
del ohe_frames, ohe_encoded
houston_data_updated.head()

Convert to 0,1

In [None]:
# Map the 'N'/'Y' flags of every column in columns_01 to 0/1
yes_no_codes={'N':0,'Y':1}
for col_name in columns_01:
  houston_data_updated[col_name]=houston_data_updated[col_name].replace(yes_no_codes)
houston_data_updated.head()

Test for columns with object type

In [None]:
# find all object columns
# select_dtypes avoids integer [] lookups on the label-indexed dtypes Series,
# which pandas deprecates (positional access is meant to go through .iloc).
for i in houston_data_updated.select_dtypes(include=['object']).columns:
  print(i)

Split data

In [None]:
# Name of the target column; it must never appear among the features.
label_column='label name'
columns_drop_corr=[]

houston_data_labels = houston_data_updated[label_column]
# Always drop the target together with columns_drop_corr: leaving it in the
# feature matrix leaks the answer into training. dict.fromkeys de-duplicates
# (order-preserving) in case the label is already listed in columns_drop_corr.
feature_columns_drop=list(dict.fromkeys(list(columns_drop_corr)+[label_column]))
houston_data_features = houston_data_updated.drop(feature_columns_drop, axis=1)

# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(houston_data_features, houston_data_labels, test_size=0.4, random_state=42)
# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

Scale Data

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only.
# with_mean=False configures the scaler not to center the data.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)


StandardScaler(copy=True, with_mean=False, with_std=True)

In [None]:
# Apply the scaler fitted on X_train to both splits.
# NOTE(review): the grid search in this notebook is fitted on the unscaled X_train,
# so no visible cell consumes these scaled arrays.
x_train_scaled=scaler.transform(X_train)
# x_val_scaled=scaler.transform(X_val)
x_test_scaled=scaler.transform(X_test)


**Machine Learning - XGBoost**

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

XGBoost Parameters: https://xgboost.readthedocs.io/en/latest/parameter.html

GPU settings: https://xgboost.readthedocs.io/en/latest/gpu/index.html

In [None]:
# Print function
def print_results(results):
    """Print a summary of a fitted search object.

    Writes ``best_params_``, ``best_score_`` and the ``scorer_`` object,
    followed by one line per candidate pairing its ``mean_test_score``
    (rounded to 3 decimals) with its ``params`` from ``cv_results_``.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')
    print('BEST SCORE',results.best_score_)
    cv = results.cv_results_
    mean_scores = cv['mean_test_score']
    print(results.scorer_)
    for mean_score, candidate in zip(mean_scores, cv['params']):
        print(f'{round(mean_score, 3)} for {candidate}')

Scoring: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [None]:
# User defined MSE
from sklearn.metrics import r2_score, mean_squared_error, make_scorer

def MSE(y_true,y_pred):
    """Compute the mean squared error and log it with the GPU utilisation.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The value returned by ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    # GPUtil is imported here because the notebook-level import sits in a much
    # later cell; relying on it raised NameError on a top-to-bottom run. The
    # utilisation table is diagnostic only, so a missing package is skipped.
    try:
        import GPUtil
    except ImportError:
        GPUtil = None
    if GPUtil is not None:
        GPUtil.showUtilization()
    print('MSE: %2.3f' % mse)
    return mse

def two_score(y_true,y_pred):
    """Score function used for model selection; returns ``MSE(y_true, y_pred)`` unchanged."""
    return MSE(y_true,y_pred)

def two_scorer():
    """Wrap ``two_score`` with ``make_scorer``, marking lower values as better."""
    mse_scorer = make_scorer(two_score, greater_is_better=False)
    return mse_scorer

**Define and Train Model**

Gridsearch: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [None]:
# Hyper-parameter grid explored by the search (3 * 3 * 4 * 2 = 72 combinations)
xgb_param_grid = {'learning_rate': [0.1,0.5,1],
                   'max_depth': [15,20,25],
                  'min_child_weight': [0.5,1,1.5,2],
                  'gamma': [0.1,0.2]
                  }

# GPU-backed regressor searched with the custom MSE scorer
model = XGBRegressor(objective='reg:squarederror',tree_method='gpu_hist',gpu_id=0,n_jobs=16)
grid = GridSearchCV(model, xgb_param_grid,scoring=two_scorer())

# Train with unscaled data
grid.fit(X_train,y_train)

# model.fit(X_train, y_train)
# The best estimator is not saved in this cell: model_drive_path is only defined
# in the "Save the model in pkl file" section, after Google Drive is mounted, so
# dumping here raised NameError on a fresh kernel once the whole search had run.
print_results(grid)

| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 98% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 57% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 96% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 92% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 60% | 16% |
MSE: 0.014
| ID | GPU | MEM |
------------------
|  0 | 83% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU  | MEM |
-------------------
|  0 | 100% | 16% |
MSE: 0.014
| ID | GPU | MEM |
-----

**Test Model**

In [None]:
# Predict on the held-out test features with the fitted grid-search object
y_test_pred=grid.predict(X_test)

MSE Test

In [None]:
# Mean squared error on the test split. Arguments follow the documented
# (y_true, y_pred) order of mean_squared_error.
# NOTE: despite its name, test_rms holds the MSE, not a root-mean-square error.
test_rms=mean_squared_error(y_test,y_test_pred)
print('Test MSE Error :',test_rms)

Sample data for prediction and target

In [None]:
# Show the leading prediction/target pairs; stops after the pair at index 100
for shown,(predicted,actual) in enumerate(zip(y_test_pred,y_test)):
  print('prediction:',predicted,'real',actual)
  if shown>=100:
    break


**Feature Importance**

In [None]:
# Pair each importance score with the feature name at the same position,
# then rank the pairs from most to least important.
feat_imp = grid.best_estimator_.feature_importances_
feature_list=[
  (houston_data_features.columns[idx],importance)
  for idx,importance in enumerate(feat_imp)
]
feature_list_sorted=sorted(feature_list, key=lambda pair: pair[1], reverse=True)
# print(feature_list_sorted)
for pos,ranked in enumerate(feature_list_sorted):
  print(pos,":",ranked)

In [None]:
import csv

def move_to_csv(file,x,y):
    """Append the two-column row ``[x, y]`` to the CSV file at ``file``.

    The file is opened in append mode on every call, so existing rows are kept.
    """
    row = [x, y]
    with open(file, mode='a', newline='') as handle:
        csv.writer(handle, delimiter=',').writerow(row)

In [None]:
# store feature list in csv
file_out='Feature_Imp.csv'

# One appended row per (feature, importance) pair, in ranked order.
# NOTE(review): move_to_csv opens the file in append mode, so re-running this cell adds the rows again.
for feature_name,importance in feature_list_sorted:
  move_to_csv(file_out,feature_name,importance)

**Save the model in pkl file**

In [None]:
# connect to drive to save model
from google.colab import drive
# Mount Google Drive at /content/drive so the model can be written under it
drive.mount('/content/drive')

In [None]:
# NOTE(review): model_path and grid_path are not used by any active code in the visible notebook.
model_path='../../../XGB_non_scaled.pkl'
grid_path = '../../../Grid_non_scaled.pkl'
# Folder under the Drive mount point and the file name saved into it
drive_path='/content/drive/My Drive/4CModels/'
model_name='XGB_non_scaled_1.pkl'
# drive_path already ends with '/', so plain concatenation gives the full file path
model_drive_path=drive_path+model_name

In [None]:
# Serialize the grid search's best estimator to the Drive path
joblib.dump(grid.best_estimator_, model_drive_path)

**Load Model**

In [None]:
# Load the persisted best estimator. joblib.load returns the complete fitted
# object, so no XGBRegressor has to be constructed first (the placeholder
# instance that used to be created here was overwritten immediately).
# NOTE: joblib/pickle files can execute arbitrary code on load; only load files you created.
# model_loaded.load_model(model_path)  # load data
model_loaded = joblib.load(model_drive_path)

In [None]:
# Rank the reloaded model's feature importances, most important first
feat_imp = model_loaded.feature_importances_
feature_list=[
  (houston_data_features.columns[idx],importance)
  for idx,importance in enumerate(feat_imp)
]
feature_list_sorted=sorted(feature_list, key=lambda pair: pair[1], reverse=True)
# print(feature_list_sorted)
for pos,ranked in enumerate(feature_list_sorted):
  print(pos,":",ranked)

In [None]:
# Predict on the held-out test features with the reloaded model
y_pred=model_loaded.predict(X_test)

In [None]:
# store prediction in csv
file_out='xgboost_prediction.csv'

# One appended row per (prediction, target) pair.
# NOTE(review): move_to_csv opens the file in append mode, so re-running this cell adds the rows again.
for predicted,actual in zip(y_pred,y_test):
  move_to_csv(file_out,predicted,actual)

**Unofficial code: Additional Code for debug and trial and error**

Dummy code to check column type and category size

In [None]:
# Scratch cell: one-hot encode a single column to inspect its categories.
# NOTE(review): col_name is an empty string here; presumably it must be replaced
# with a real column name before this cell can run — confirm.
col_name=''
dummy_type=pd.get_dummies(houston_data[col_name])
dummy_type=dummy_type.rename(columns={x: convert_name(col_name,x) for x in dummy_type.columns})
dummy_type


Use wandb to track training (not needed)

In [None]:
# Install wandb (version not pinned); the section heading marks it as not needed
!pip install wandb

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/d7/b6/bc2727589a445e5f5fe03a6fb69db66e7175c26021cc581929fbe3f56248/wandb-0.10.9-py2.py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.7MB/s 
[?25hCollecting shortuuid>=0.5.0
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304e7f1b216f0896b26156b78e7c38e1211e9b798b4716c53d/shortuuid-1.0.1-py3-none-any.whl
Collecting sentry-sdk>=0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/86/36/afa3c32f61cb62bef0da7200651d41f36b8b069d9f6254d8df8a20b224b8/sentry_sdk-0.19.2-py2.py3-none-any.whl (127kB)
[K     |████████████████████████████████| 133kB 17.7MB/s 
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/08/b2/ef713e0e67f6e7ec7d59aea3ee78d05b39c15930057e724cc6d362a8c3bb/configparser-5.0.1-py3-none-any.whl
Collecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe

In [None]:
import wandb
# Initialise wandb with default arguments.
# NOTE(review): no other visible cell references wandb.
wandb.init()

Install GPU/system monitoring packages (GPUtil is called by the MSE scorer defined earlier)

In [None]:
# Install monitoring helpers (versions not pinned).
# NOTE(review): only GPUtil is imported in the visible notebook; psutil and humanize are not.
!pip install gputil
!pip install psutil
!pip install humanize

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7411 sha256=d91f1fb445caafbfb4f4318f0bd86778aabd2dadd1f6bc43fd674c5f83e0d540
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0


In [None]:
# NOTE(review): this import comes after the MSE scorer cell that calls
# GPUtil.showUtilization(), which is too late for a top-to-bottom run.
import GPUtil
# Print the GPU utilisation table and keep the call's return value in c
c=GPUtil.showUtilization()
# GPUtil.getGPUs
GPUtil.getAvailable()

| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


[0]