In [1]:
import json
import random
import re

import h2o
import numpy
import pandas
import pymongo
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn.feature_extraction import DictVectorizer

In [2]:
def clean_record(json_record):
    """Parse a JSON document and flatten it into a pandas DataFrame.

    Parameters
    ----------
    json_record : str
        JSON text; it is decoded with ``json.loads``.

    Returns
    -------
    pandas.DataFrame
        The result of ``json_normalize`` applied to the decoded document.
    """
    record = json.loads(json_record)
    # Newer pandas exposes json_normalize at the top level and no longer
    # provides pandas.io.json.json_normalize; fall back to the old location
    # only when the top-level function is missing.
    normalize = getattr(pandas, 'json_normalize', None) or pandas.io.json.json_normalize
    return normalize(record)

def flatten_record(json_record):
    """Parse a JSON document and flatten only its ``facilities`` entry.

    Parameters
    ----------
    json_record : str
        JSON text; the decoded document must contain a ``'facilities'`` key.

    Returns
    -------
    pandas.DataFrame
        The result of ``json_normalize`` applied to ``record['facilities']``.

    Raises
    ------
    KeyError
        If the decoded document has no ``'facilities'`` key.
    """
    record = json.loads(json_record)
    # Newer pandas exposes json_normalize at the top level and no longer
    # provides pandas.io.json.json_normalize; fall back to the old location
    # only when the top-level function is missing.
    normalize = getattr(pandas, 'json_normalize', None) or pandas.io.json.json_normalize
    # The former ``record['number_of_rooms']`` lookup was never used and only
    # made records without that key fail, so it is gone.
    return normalize(record['facilities'])

def json_flatten(json_record, sep='::'):
    """Flatten nested dictionaries into a single-level dictionary.

    Nested keys are joined with ``sep``; every non-dict value (including
    lists) is kept as a leaf. Empty nested dictionaries contribute no
    entries. A non-dict top-level value is returned under the key ``''``.

    Parameters
    ----------
    json_record : dict or any
        The (possibly nested) record to flatten.
    sep : str
        Separator placed between the key segments of a nested path.

    Returns
    -------
    dict
        Mapping from joined key path to leaf value.
    """

    def flatten(record, node=''):
        # Anything that is not a mapping is a leaf stored under the path so far.
        if not isinstance(record, dict):
            return {node: record}
        flat = {}
        for key, value in record.items():
            # Compare by value: ``is not ''`` relied on string interning.
            prefix = node + sep if node != '' else node
            flat.update(flatten(value, prefix + key))
        return flat

    return flatten(json_record)


def remove_empty_nodes(record):
    """Return a copy of ``record`` without empty lists and blank strings.

    An entry is dropped when its value is a list of length zero or a string
    that is empty after ``strip()``. All other values (``None``, ``0``,
    non-empty containers, ...) are kept. ``record`` itself is not modified.

    Parameters
    ----------
    record : dict
        Flat mapping to filter.

    Returns
    -------
    dict
        Shallow copy of ``record`` minus the empty entries.
    """
    new_record = record.copy()
    for key, value in record.items():
        # Value comparisons instead of ``len(...) is 0``: identity checks on
        # integers only work by accident of small-int caching.
        is_empty_list = isinstance(value, list) and len(value) == 0
        is_blank_string = isinstance(value, str) and len(value.strip()) == 0
        if is_empty_list or is_blank_string:
            del new_record[key]
    return new_record

def remove_nodes(record, nodes=[]):
    """Return a shallow copy of ``record`` with the given keys removed.

    Keys listed in ``nodes`` that are not present are ignored. The input
    ``record`` is left untouched, and ``nodes`` is only read, never modified.
    """
    pruned = record.copy()
    for unwanted in nodes:
        # pop with a default is a no-op for keys that are absent
        pruned.pop(unwanted, None)
    return pruned

def list_to_features(list_feature):
    """Turn a list of strings into a ``{stripped value: 1}`` indicator dict.

    Returns ``None`` when ``list_feature`` is not exactly a ``list``.
    """
    if type(list_feature) is not list:
        return None
    indicators = {}
    for value in list_feature:
        indicators[value.strip()] = 1
    return indicators
    
def string_to_features(string_feature, sep=','):
    """Split a delimited string into a ``{token: 1}`` indicator dict.

    Tokens are stripped of surrounding whitespace; tokens that are empty
    after stripping are skipped.

    Parameters
    ----------
    string_feature : str
        Delimited text. Any non-string input yields ``None``.
    sep : str
        Delimiter passed to ``str.split``.

    Returns
    -------
    dict or None
        Indicator dict, or ``None`` when ``string_feature`` is not a string.
    """
    if not isinstance(string_feature, str):
        return None
    tokens = (piece.strip() for piece in string_feature.split(sep))
    # Truthiness check replaces ``len(...) is not 0``, an identity comparison
    # on integers that is not guaranteed to behave like ``!=``.
    return {token: 1 for token in tokens if token}
    
def normalize_features(features):
    """Convert feature names into identifier-like column names.

    Each name is stripped; spaces, ``/`` and ``-`` become ``_``; ``>`` and
    ``<`` are spelled out as ``greater`` / ``lower``; finally every character
    outside ``[a-zA-Z0-9_.]`` is removed. Returns a list in input order.
    """
    # Single-character substitutions, applied in one pass. None of the
    # replacement texts contains a character that is itself substituted, so
    # this matches applying the replacements one after another.
    substitutions = str.maketrans({
        " ": "_",
        "/": "_",
        "-": "_",
        ">": "greater",
        "<": "lower",
    })
    normalized = []
    for feature in features:
        candidate = feature.strip().translate(substitutions)
        normalized.append(re.sub('[^a-zA-Z0-9_.]', '', candidate))
    return normalized

def rooms_features(record):
    """Build one feature dict per room of a property record.

    For every entry of ``record['rooms']`` the ``'facilities'`` string is
    scanned for a size written as ``<number> m²``. The size text is removed
    from the string, the remainder is split into indicator features with
    ``string_to_features``, and a positive size is stored under ``'Size'``.

    Parameters
    ----------
    record : dict
        Property record with a ``'rooms'`` list; each room needs a
        ``'facilities'`` string.

    Returns
    -------
    list of dict
        One feature dict per room, in the order of ``record['rooms']``.
    """
    # Raw strings so that ``\d`` and ``\.`` reach the regex engine instead of
    # being parsed as (invalid) string escape sequences.
    size_pattern = r'\d*\.?\d*[ ]* m²'
    number_pattern = r'\d*\.?\d*'

    new_records = []
    for room in record['rooms']:
        facilities = room['facilities']
        size = 0
        for size_string in re.findall(size_pattern, facilities):
            # The size pattern also matches a bare " m²" (or ". m²"), whose
            # numeric part is not a valid float; skip such matches instead of
            # letting float() raise, and use the first one that parses.
            number = re.match(number_pattern, size_string).group(0)
            try:
                size = float(number)
            except ValueError:
                continue
            break
        facilities = re.sub(size_pattern, '', facilities)
        new_record = string_to_features(facilities)
        if size > 0:
            new_record.update({'Size': size})
        new_records.append(new_record)
    return new_records

def prepare_record(record):
    """Convert a raw property record into a flat feature dictionary.

    Steps, in order:

    1. every value of ``record['facilities']`` is turned into an indicator
       dict with ``list_to_features``;
    2. a non-blank ``'surroundings'`` string is turned into an indicator dict
       with ``string_to_features``; otherwise the key is dropped;
    3. a fixed set of non-feature fields (``_id``, ``rooms``, ``description``,
       ...) is removed;
    4. the record is flattened with ``'.'`` as separator and empty entries
       are removed;
    5. ``number_of_rooms`` is dropped;
    6. keys are rewritten to contain only ``[a-zA-Z0-9_.]``.

    The input ``record`` is not modified.

    Parameters
    ----------
    record : dict
        Raw record; must contain a ``'facilities'`` mapping.

    Returns
    -------
    dict
        Flat mapping from cleaned key to value.
    """
    new_record = record.copy()
    # record.copy() is shallow: assigning into new_record['facilities'] would
    # rewrite the caller's nested dict, so build a fresh mapping instead.
    new_record['facilities'] = {
        group: list_to_features(values)
        for group, values in record['facilities'].items()
    }

    # Missing, non-string or blank surroundings carry no features.
    surroundings = record.get('surroundings')
    if isinstance(surroundings, str) and len(surroundings.strip()) != 0:
        new_record['surroundings'] = string_to_features(surroundings)
    else:
        new_record.pop('surroundings', None)

    new_record = remove_nodes(new_record, ['_id', 'rooms', 'description', 'reviews', 'url', 'updated', 'chain', 'popular_facilities', 'title', 'highlights', 'check_out', 'check_in', 'hotel_id'])
    new_record = json_flatten(new_record, '.')
    new_record = remove_empty_nodes(new_record)
    # number_of_rooms is deliberately excluded from the feature set.
    new_record.pop('number_of_rooms', None)

    def clean_key(key):
        # Same substitutions as normalize_features, but without stripping.
        replaced = key.replace(" ", "_").replace("/", "_").replace("-", "_").replace(">", "greater").replace("<", "lower")
        return re.sub('[^a-zA-Z0-9_.]', '', replaced)

    return {clean_key(k): v for k, v in new_record.items()}

def mongo_collection(host='localhost', port=27017, db='booking', collection='hotel_details'):
    """Create a ``pymongo.MongoClient`` for ``host``/``port`` and return the
    ``collection`` handle of database ``db``."""
    database = pymongo.MongoClient(host, port)[db]
    return database[collection]

def rooms_records(records):
    """Collect room feature dicts and matching labels from property records.

    Every room of every record contributes one feature dict (from
    ``rooms_features``) and one label, the record's ``'type'``.

    Returns a ``(features, labels)`` pair of equally long lists.
    """
    new_records, y = [], []
    for record in records:
        room_rows = rooms_features(record)
        new_records.extend(room_rows)
        # one label per room; 'type' is only looked up when there are rooms
        y.extend(record['type'] for _ in room_rows)
    return new_records, y

def split_records(records, ratio, seed=None):
    """Randomly split ``records`` into a training and a validation list.

    Each record is assigned independently: it goes to the training list when
    a uniform draw from [0, 1) is ``<= ratio``, otherwise to the validation
    list. ``ratio`` is therefore the expected training share, not an exact
    one.

    Parameters
    ----------
    records : iterable
        Records to split; consumed once, in order.
    ratio : float
        Probability of a record landing in the training list.
    seed : optional
        When given, a private ``random.Random(seed)`` is used so the split is
        repeatable. When ``None`` (default) the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    (list, list)
        ``(train, valid)``, each preserving the input order.
    """
    draw = random.random if seed is None else random.Random(seed).random
    train = []
    valid = []
    for record in records:
        if draw() <= ratio:
            train.append(record)
        else:
            valid.append(record)
    return train, valid

In [3]:
# Open the default collection and select every property whose type is
# 'apartments' or 'hotel'. `records` is a cursor, so it can be iterated once.
collection = mongo_collection()
property_type_filter = {'$or': [{'type': 'apartments'}, {'type': 'hotel'}]}
records = collection.find(property_type_filter)

In [4]:
# Extract room features and types from property records:
# one feature dict per room, and the owning property's 'type' as its label.
room_features, y = rooms_records(records)

In [5]:
# Turn the list of per-room feature dicts into a feature matrix; the fitted
# vectorizer is kept because its feature names become the column names below.
vectorizer = DictVectorizer()
X = vectorizer.fit_transform(room_features)

In [6]:
# Newer scikit-learn replaced DictVectorizer.get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides.
feature_names_of = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
# NOTE(review): normalize_features can map distinct names to the same column
# (e.g. "a-b" and "a/b"), and a feature literally named "type" would be
# overwritten by the label column below — confirm neither occurs in the data.
Xy = pandas.DataFrame(X.toarray(), columns=normalize_features(feature_names_of()))
Xy['type'] = y

In [7]:
# Connect to the H2O cluster (the captured output below shows a local server
# at http://localhost:54321).
h2o.init()

Connecting to H2O server at http://localhost:54321... successful!


0,1
H2O cluster uptime:,14 hours 6 mins
H2O cluster version:,3.10.0.3
H2O cluster version age:,18 days
H2O cluster name:,H2O_from_python_I_eu98dh
H2O cluster total nodes:,1
H2O cluster free memory:,3.133 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster is healthy:,True
H2O cluster is locked:,True


In [8]:
# Hand the pandas frame to H2O as a {column name: list of values} mapping,
# stored on the cluster under the key 'Xy'.
columns_as_lists = Xy.to_dict('list')
Xywater = h2o.H2OFrame.from_python(columns_as_lists, destination_frame='Xy')





In [9]:
# 80/20 train/validation split. A fixed seed makes the split (and therefore
# the reported validation metrics) repeatable across runs.
SPLIT_SEED = 42
Xy_train, Xy_valid = Xywater.split_frame(
    ratios=[0.8],
    destination_frames=['Xy_train', 'Xy_valid'],
    seed=SPLIT_SEED,
)

In [10]:
# Predictor columns: every column of the frame except the 'type' label.
xcolumns = list(Xywater.columns)
xcolumns.remove('type')

In [11]:
# Two tree ensembles with the same budget (165 trees, depth up to 50); the
# random forest additionally re-balances the classes during training.
# The estimator classes come from the notebook's import cell.
modelGBM = H2OGradientBoostingEstimator(max_depth=50, ntrees=165)
modelRF = H2ORandomForestEstimator(max_depth=50, ntrees=165, balance_classes=True)

In [12]:
# Both models are fitted on identical frames, predictors and label.
fit_args = dict(training_frame=Xy_train, validation_frame=Xy_valid, x=xcolumns, y='type')
modelGBM.train(**fit_args)
modelRF.train(**fit_args)





In [13]:
# Display the GBM summary: training/validation metrics, scoring history and
# variable importances.
modelGBM

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Method
Model Key:  GBM_model_python_1471642286902_1731

Model Summary: 


0,1,2,3,4,5,6,7,8,9
,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,165.0,165.0,9107986.0,0.0,50.0,23.50909,1.0,12687.0,4351.6665




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.007681162433891717
RMSE: 0.08764224115055318
LogLoss: 0.02982552096743362
Mean Per-Class Error: 0.013732525682334185
AUC: 0.9991806670641685
Gini: 0.998361334128337

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.5209123264068648: 


0,1,2,3,4
,apartments,hotel,Error,Rate
apartments,37526.0,1482.0,0.038,(1482.0/39008.0)
hotel,647.0,169395.0,0.0038,(647.0/170042.0)
Total,38173.0,170877.0,0.0102,(2129.0/209050.0)



Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.5209123,0.9937551,217.0
max f2,0.3224588,0.9960079,260.0
max f0point5,0.7359087,0.9946994,158.0
max accuracy,0.5209123,0.9898158,217.0
max precision,0.9997318,1.0,0.0
max recall,0.0471183,1.0,343.0
max specificity,0.9997318,1.0,0.0
max absolute_mcc,0.5437183,0.9662778,210.0
max min_per_class_accuracy,0.7835759,0.9861054,146.0



Gains/Lift Table: Avg response rate: 81.34 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100120,0.9997655,1.2294021,1.2294021,1.0,1.0,0.0123087,0.0123087,22.9402148,22.9402148
,2,0.02,0.9997562,1.2294021,1.2294021,1.0,1.0,0.0122793,0.0245880,22.9402148,22.9402148
,3,0.0300024,0.9997513,1.2294021,1.2294021,1.0,1.0,0.0122970,0.0368850,22.9402148,22.9402148
,4,0.04,0.9997470,1.2294021,1.2294021,1.0,1.0,0.0122911,0.0491761,22.9402148,22.9402148
,5,0.0500024,0.9997442,1.2294021,1.2294021,1.0,1.0,0.0122970,0.0614730,22.9402148,22.9402148
,6,0.1,0.9997307,1.2294021,1.2294021,1.0,1.0,0.0614672,0.1229402,22.9402148,22.9402148
,7,0.1500024,0.9997136,1.2294021,1.2294021,1.0,1.0,0.0614730,0.1844133,22.9402148,22.9402148
,8,0.2092466,0.9996953,1.2294021,1.2294021,1.0,1.0,0.0728349,0.2572482,22.9402148,22.9402148
,9,0.3000048,0.9996325,1.2294021,1.2294021,1.0,1.0,0.1115783,0.3688265,22.9402148,22.9402148




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.030860216636336917
RMSE: 0.1756707620417721
LogLoss: 0.11082795676136249
Mean Per-Class Error: 0.05581850494400242
AUC: 0.9869279458287374
Gini: 0.9738558916574749

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.4887771904430215: 


0,1,2,3,4
,apartments,hotel,Error,Rate
apartments,8418.0,1234.0,0.1278,(1234.0/9652.0)
hotel,878.0,41655.0,0.0206,(878.0/42533.0)
Total,9296.0,42889.0,0.0405,(2112.0/52185.0)



Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4887772,0.9752757,213.0
max f2,0.1222169,0.9835753,322.0
max f0point5,0.7943023,0.9783724,116.0
max accuracy,0.4958560,0.9595286,211.0
max precision,0.9997064,0.9997504,0.0
max recall,0.0015713,1.0,398.0
max specificity,0.9997064,0.9995856,0.0
max absolute_mcc,0.5733453,0.8646606,187.0
max min_per_class_accuracy,0.8603467,0.9438789,91.0



Gains/Lift Table: Avg response rate: 81.50 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100029,0.9997643,1.2222288,1.2222288,0.9961686,0.9961686,0.0122258,0.0122258,22.2228798,22.2228798
,2,0.0200249,0.9997556,1.2269297,1.2245815,1.0,0.9980861,0.0122963,0.0245221,22.6929678,22.4581487
,3,0.0300086,0.9997508,1.2269297,1.2253627,1.0,0.9987229,0.0122493,0.0367714,22.6929678,22.5362718
,4,0.0400115,0.9997465,1.2269297,1.2257545,1.0,0.9990421,0.0122728,0.0490443,22.6929678,22.5754458
,5,0.0500144,0.9997436,1.2269297,1.2259895,1.0,0.9992337,0.0122728,0.0613171,22.6929678,22.5989502
,6,0.1000096,0.9997298,1.2269297,1.2264595,1.0,0.9996168,0.0613406,0.1226577,22.6929678,22.6459500
,7,0.1500048,0.9997118,1.2269297,1.2266162,1.0,0.9997445,0.0613406,0.1839983,22.6929678,22.6616206
,8,0.2048098,0.9996953,1.2269297,1.2267001,1.0,0.9998129,0.0672419,0.2512402,22.6929678,22.6700088
,9,0.3000287,0.9996264,1.2261889,1.2265379,0.9993963,0.9996807,0.1167564,0.3679966,22.6188928,22.6537863




Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2016-08-20 04:41:31,0.004 sec,0.0,0.3895873,0.4812501,0.5,1.0,0.1865965,0.3882664,0.4788368,0.5,1.0,0.1849574
,2016-08-20 04:41:37,6.292 sec,1.0,0.3576011,0.4103134,0.9851232,1.2294021,0.0546903,0.3586916,0.4131987,0.9566921,1.2146046,0.0696177
,2016-08-20 04:41:44,13.582 sec,2.0,0.3311628,0.3621254,0.9874836,1.2294021,0.0478259,0.3348148,0.3691562,0.9625892,1.1611174,0.0652295
,2016-08-20 04:41:53,22.022 sec,3.0,0.3084052,0.3250297,0.9888026,1.2294021,0.0447022,0.3148243,0.3357410,0.9662810,1.1752200,0.0636198
,2016-08-20 04:42:02,31.728 sec,4.0,0.2884505,0.2947629,0.9898827,1.2294021,0.0418704,0.2977120,0.3087435,0.9690453,1.1846218,0.0610329
---,---,---,---,---,---,---,---,---,---,---,---,---,---
,2016-08-20 04:49:46,8 min 15.582 sec,49.0,0.0938987,0.0394570,0.9988387,1.2294021,0.0107056,0.1790921,0.1132142,0.9858662,1.2269297,0.0422727
,2016-08-20 04:50:41,9 min 10.580 sec,55.0,0.0908795,0.0354610,0.9989769,1.2294021,0.0104568,0.1777161,0.1117559,0.9862450,1.2269297,0.0416786
,2016-08-20 04:51:36,10 min 5.070 sec,62.0,0.0891211,0.0326512,0.9990763,1.2294021,0.0102798,0.1768119,0.1110932,0.9865437,1.2269297,0.0412187



See the whole table with table.as_data_frame()

Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Kitchenware,60812.0468750,1.0,0.4123504
Size,9237.3789062,0.1519005,0.0626362
Telephone,8208.8867188,0.1349878,0.0556623
Washing_Machine,4094.0510254,0.0673230,0.0277607
Private_flat_in_building,3083.5512695,0.0507063,0.0209087
---,---,---,---
Sea___Lake___City_view,0.0,0.0,0.0
Sea___Garden___Mountain___River_view,0.0,0.0,0.0
Lake___Garden___Pool___City___River_view,0.0,0.0,0.0



See the whole table with table.as_data_frame()




In [14]:
# Display the random-forest summary: training/validation metrics, scoring
# history and variable importances.
modelRF

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1471642286902_1836

Model Summary: 


0,1,2,3,4,5,6,7,8,9
,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,165.0,165.0,34644050.0000000,50.0,50.0,50.0,15662.0,17853.0,16517.51




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.019796910108639373
RMSE: 0.14070149291546047
LogLoss: 0.07930913740887502
Mean Per-Class Error: 0.01970583110269375
AUC: 0.9978150522987355
Gini: 0.9956301045974709

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6215167233602563: 


0,1,2,3,4
,apartments,hotel,Error,Rate
apartments,166209.0,3849.0,0.0226,(3849.0/170058.0)
hotel,2853.0,167189.0,0.0168,(2853.0/170042.0)
Total,169062.0,171038.0,0.0197,(6702.0/340100.0)



Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.6215167,0.9803507,150.0
max f2,0.4313252,0.9859024,204.0
max f0point5,0.7951233,0.9833793,101.0
max accuracy,0.6215167,0.9802940,150.0
max precision,0.9999962,1.0,0.0
max recall,0.0000055,1.0,399.0
max specificity,0.9999962,1.0,0.0
max absolute_mcc,0.6215167,0.9606046,150.0
max min_per_class_accuracy,0.6671398,0.9801009,139.0



Gains/Lift Table: Avg response rate: 81.34 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.2502559,1.0,1.2294021,1.2294021,1.0,1.0,0.3076652,0.3076652,22.9402148,22.9402148
,2,0.3060368,0.9999974,1.2294021,1.2294021,1.0,1.0,0.0685772,0.3762423,22.9402148,22.9402148
,3,0.4019230,0.9995359,1.2294021,1.2294021,1.0,1.0,0.1178826,0.4941250,22.9402148,22.9402148
,4,0.5000478,0.9981641,1.2294021,1.2294021,1.0,1.0,0.1206349,0.6147599,22.9402148,22.9402148
,5,0.6014733,0.9943334,1.2294021,1.2294021,1.0,1.0,0.1246927,0.7394526,22.9402148,22.9402148
,6,0.7,0.9769320,1.2289843,1.2293433,0.9996601,0.9999522,0.1210877,0.8605403,22.8984329,22.9343339
,7,0.8000191,0.8408288,1.2111160,1.2270645,0.9851260,0.9980986,0.1211348,0.9816751,21.1116048,22.7064546
,8,0.9,0.0239222,0.1832839,1.1111111,0.1490838,0.9037817,0.0183249,1.0,-81.6716086,11.1111111
,9,1.0,0.0,0.0,1.0,0.0,0.8134035,0.0,1.0,-100.0,0.0




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.04138276862047496
RMSE: 0.20342755128171544
LogLoss: 0.13860834798427904
Mean Per-Class Error: 0.052382926305660105
AUC: 0.9864218482206483
Gini: 0.9728436964412965

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7333237759913985: 


0,1,2,3,4
,apartments,hotel,Error,Rate
apartments,8564.0,1088.0,0.1127,(1088.0/9652.0)
hotel,1094.0,41439.0,0.0257,(1094.0/42533.0)
Total,9658.0,42527.0,0.0418,(2182.0/52185.0)



Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.7333238,0.9743475,143.0
max f2,0.5277046,0.9818211,212.0
max f0point5,0.8683258,0.9792463,93.0
max accuracy,0.7333238,0.9581872,143.0
max precision,0.9990455,0.9995571,2.0
max recall,0.0094738,1.0,395.0
max specificity,0.9999900,0.9991712,0.0
max absolute_mcc,0.7753258,0.8633203,127.0
max min_per_class_accuracy,0.8631340,0.9468413,95.0



Gains/Lift Table: Avg response rate: 81.50 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.2145252,1.0,1.2264913,1.2264913,0.9996427,0.9996427,0.2631133,0.2631133,22.6491293,22.6491293
,2,0.3006994,0.9999852,1.2269297,1.2266169,1.0,0.9997451,0.1057297,0.3688430,22.6929678,22.6616925
,3,0.4198141,0.9986033,1.2257454,1.2263696,0.9990347,0.9995435,0.1460043,0.5148473,22.5745383,22.6369641
,4,0.5000096,0.9961024,1.2254638,1.2262244,0.9988053,0.9994251,0.0982766,0.6131239,22.5463812,22.6224357
,5,0.6,0.9883527,1.2182297,1.2248920,0.9929092,0.9983392,0.1218113,0.7349352,21.8229717,22.4892045
,6,0.6999904,0.9673143,1.2027109,1.2217236,0.9802606,0.9957568,0.1202596,0.8551948,20.2710867,22.1723565
,7,0.8,0.7969170,1.0856412,1.2047116,0.8848438,0.9818913,0.1085745,0.9637693,8.5641168,20.4711636
,8,0.8999904,0.3014064,0.3268364,1.1071782,0.2663856,0.9023975,0.0326805,0.9964498,-67.3163616,10.7178248
,9,1.0,0.0,0.0354984,1.0,0.0289327,0.8150426,0.0035502,1.0,-96.4501556,0.0




Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2016-08-20 04:53:30,0.004 sec,0.0,,,,,,,,,,
,2016-08-20 04:53:38,8.044 sec,1.0,0.2096616,0.9455449,0.9701237,1.2287579,0.0437049,0.2649828,2.1166203,0.9037952,1.1869118,0.0741209
,2016-08-20 04:53:45,14.782 sec,2.0,0.2031399,0.7906457,0.9746515,1.2293940,0.0409426,0.2319134,0.8801287,0.9506857,1.2103230,0.0658810
,2016-08-20 04:53:52,21.509 sec,3.0,0.1957680,0.6625614,0.9785741,1.2294021,0.0378278,0.2219255,0.5486012,0.9645461,1.2167311,0.0585609
,2016-08-20 04:53:58,28.384 sec,4.0,0.1896382,0.5737970,0.9812866,1.2294021,0.0358387,0.2166006,0.3988951,0.9715216,1.2198267,0.0538277
---,---,---,---,---,---,---,---,---,---,---,---,---,---
,2016-08-20 05:07:46,14 min 16.219 sec,120.0,0.1417400,0.0800138,0.9977592,1.2294021,0.0197883,0.2033361,0.1393232,0.9864720,1.2263708,0.0418319
,2016-08-20 05:09:13,15 min 43.254 sec,132.0,0.1413923,0.0797201,0.9977777,1.2294021,0.0197060,0.2032811,0.1393118,0.9864803,1.2263218,0.0419661
,2016-08-20 05:10:49,17 min 18.714 sec,145.0,0.1410808,0.0793648,0.9978020,1.2294021,0.0197118,0.2033875,0.1384586,0.9864889,1.2265105,0.0421769



See the whole table with table.as_data_frame()

Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Kitchenware,983785.25,1.0,0.1061926
Stovetop,752343.4375000,0.7647436,0.0812101
Size,606186.9375000,0.6161781,0.0654336
Kitchen,491616.5,0.4997193,0.0530665
Dining_area,460084.4687500,0.4676676,0.0496629
---,---,---,---
Sea___Pool___Mountain___Landmark___City_view,0.0,0.0,0.0
Lake___Mountain___City___River_view,0.0,0.0,0.0
Sea___Lake___Pool___City___River_view,0.0,0.0,0.0



See the whole table with table.as_data_frame()




In [23]:
# Re-run the query to get a fresh cursor over all 'apartments' and 'hotel'
# properties for the record-level split below.
collection = mongo_collection()
property_type_filter = {'$or': [{'type': 'apartments'}, {'type': 'hotel'}]}
records = collection.find(property_type_filter)

In [24]:
# Split at the property level (before rooms are expanded) with an expected
# 80% of the properties in the training set; iterating `records` is what
# contacts MongoDB, hence the connection error captured below.
train_records, valid_records = split_records(records, 0.8)

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it

In [20]:
# Expand each property into per-room feature dicts and labels, separately for
# the training and the validation properties.
train_features, train_y = rooms_records(train_records)
valid_features, valid_y = rooms_records(valid_records)

In [21]:
# Fit the vectorizer on the training rooms only; the validation rooms are
# later transformed with this same fitted vectorizer.
train_vectorizer = DictVectorizer()
X_train = train_vectorizer.fit_transform(train_features)

ValueError: Sample sequence X is empty.

In [None]:
# Newer scikit-learn replaced DictVectorizer.get_feature_names() with
# get_feature_names_out(); use whichever the installed version provides.
feature_names_of = getattr(train_vectorizer, 'get_feature_names_out', None) or train_vectorizer.get_feature_names
# Both frames share the training vocabulary, so compute the column names once.
feature_columns = normalize_features(feature_names_of())

train = pandas.DataFrame(X_train.toarray(), columns=feature_columns)
train['type'] = train_y

# Validation rows are encoded with the vectorizer fitted on the training rows.
X_valid = train_vectorizer.transform(valid_features)
valid = pandas.DataFrame(X_valid.toarray(), columns=feature_columns)
valid['type'] = valid_y

In [None]:
# Upload both pandas frames to H2O as {column name: list of values} mappings,
# under the keys 'train' and 'valid'.
train_columns = train.to_dict('list')
train_water = h2o.H2OFrame.from_python(train_columns, destination_frame='train')
valid_columns = valid.to_dict('list')
valid_water = h2o.H2OFrame.from_python(valid_columns, destination_frame='valid')

In [None]:
# Fresh estimators with the same hyper-parameters as modelGBM / modelRF above,
# to be trained on the property-level split.
gbm = H2OGradientBoostingEstimator(max_depth=50, ntrees=165)
rf = H2ORandomForestEstimator(max_depth=50, ntrees=165, balance_classes=True)

In [None]:
# Predictor columns: every column of the training frame except 'type'.
xc = list(train_water.columns)
xc.remove('type')

In [None]:
# Both models are fitted on identical frames, predictors and label.
fit_args_split = dict(training_frame=train_water, validation_frame=valid_water, x=xc, y='type')
gbm.train(**fit_args_split)
rf.train(**fit_args_split)

In [None]:
# Display the GBM trained on the property-level split.
gbm

In [None]:
# Display the random forest trained on the property-level split.
rf

In [26]:
# Number of documents matched by the query behind `records`.
# NOTE(review): Cursor.count() is reported as deprecated in newer PyMongo
# releases — confirm the installed version, or count through the collection.
records.count()

ServerSelectionTimeoutError: localhost:27017: [WinError 10061] No connection could be made because the target machine actively refused it