In [33]:
from dask_ml.preprocessing import OneHotEncoder, PolynomialFeatures, OrdinalEncoder
from dask_ml.model_selection import train_test_split
from dask_ml.linear_model import LogisticRegression

import pandas as pd

In [41]:
import dask.dataframe as dd
import numpy as np


X = dd.read_csv("train_features.csv", header=0).set_index('id')
y = dd.read_csv("train_labels.csv", header=0).set_index('id').categorize()
X['status_group'] = y["status_group"]

X["date_recorded"] = X["date_recorded"].astype('M8[ns]').astype('int64')

X = X.drop(["district_code", "region", "subvillage", "region_code", "lga", "ward"], axis=1)

X[[c for c in X.columns if X[c].dtype == 'object']] = X[[c for c in X.columns if X[c].dtype == 'object']].astype(str).fillna("NAN")

X['longitude'] = X['longitude'].apply(lambda x: np.nan if x == 0.0 else x)
X['longitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['latitude'] = X['latitude'].apply(lambda x: np.nan if x == -2.000000e-08 else x)
X['latitude_bool'] = (X["longitude"] == np.nan).astype(int)

X['num_private'] = X['num_private'].apply(lambda x: np.nan if x == 0.0 else x)
X["num_private_bool"] = (X["num_private"] == np.nan).astype(int)

X['construction_year'] = X['construction_year'].apply(lambda x: np.nan if x == 0.0 else x)
X["construction_year_bool"] = (X["construction_year"] == np.nan).astype(int)

X['population'] = X['population'].apply(lambda x: np.nan if x == 0.08 else x)
X["population_bool"] = (X["population"] == np.nan).astype(int)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


In [42]:
print(X.columns)

Index(['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'population', 'public_meeting', 'recorded_by', 'scheme_management',
       'scheme_name', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'status_group', 'longitude_bool', 'latitude_bool', 'num_private_bool',
       'construction_year_bool', 'population_bool'],
      dtype='object')


In [43]:
X.head(50)

Unnamed: 0_level_0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group,longitude_bool,latitude_bool,num_private_bool,construction_year_bool,population_bool
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1352764800000000000,Tasaf,0,TASAF,33.125828,-5.118154,Mratibu,,Lake Tanganyika,...,shallow well,groundwater,hand pump,hand pump,non functional,0,0,0,0,0
1,0.0,1299283200000000000,Shipo,1978,SHIPO,34.770717,-9.395642,none,,Rufiji,...,shallow well,groundwater,hand pump,hand pump,functional,0,0,0,0,0
2,0.0,1301184000000000000,Lvia,0,LVIA,36.115056,-6.279268,Bombani,,Wami / Ruvu,...,borehole,groundwater,communal standpipe multiple,communal standpipe,functional,0,0,0,0,0
3,10.0,1370217600000000000,Germany Republi,1639,CES,37.147432,-3.187555,Area 7 Namba 5,,Pangani,...,spring,groundwater,communal standpipe,communal standpipe,functional,0,0,0,0,0
4,0.0,1300752000000000000,Cmsr,0,CMSR,36.164893,-6.099289,Ezeleda,,Wami / Ruvu,...,shallow well,groundwater,hand pump,hand pump,non functional,0,0,0,0,0
5,50.0,1298678400000000000,Private,28,Private,39.286124,-6.972403,Kwa Namaj,,Wami / Ruvu,...,borehole,groundwater,communal standpipe multiple,communal standpipe,functional,0,0,0,0,0
6,0.0,1350691200000000000,Government Of Tanzania,0,Government,33.22988,-3.852983,Mission,,Internal,...,shallow well,groundwater,other,other,non functional,0,0,0,0,0
7,0.0,1300838400000000000,Water,0,Gover,36.313619,-6.719257,Itawi,,Rufiji,...,spring,groundwater,improved spring,improved spring,functional,0,0,0,0,0
8,0.0,1302134400000000000,Water,0,Commu,35.939445,-6.014358,Kwa Chiswagala,,Wami / Ruvu,...,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,0,0,0,0,0
9,0.0,1310428800000000000,Ded,0,DED,31.693371,-2.530703,Shule Ya Msingi Nyamirembe A,,Lake Victoria,...,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional,0,0,0,0,0


In [45]:
X_train, X_test = X.random_split([0.8, 0.2])
y_train = X_train["status_group"]
y_test = X_test["status_group"]
X_train = X_train.drop(["status_group"], axis=1)
X_test = X_test.drop(["status_group"], axis=1)

numcols = [c for c in X_train.columns if X_train[c].dtype != 'object' and c not in ["latitude", "longitude"] and c.split("_")[-1] != "bool"]
invcols = [c for c in X_train.columns if X_train[c].dtype != 'object' and (X_train[c] == 0).sum() == 0]
cols = [c for c in X_train.columns if X_train[c].dtype == 'object' and len(X_train[c].astype('category').cat.as_known()) < 250]

X_test['latitude'] = X_test['latitude'].fillna(latitude_gm)
X_test['latitude'] = X_test['latitude'].fillna(latitude_m)

num_private_m = X_train['num_private'].mean()
X_train['num_private'] = X_train['num_private'].fillna(num_private_m)
X_test['num_private'] = X_test['num_private'].fillna(num_private_m)

construction_year_m = X_train['construction_year'].mean()
X_train['construction_year'] = X_train['construction_year'].fillna(construction_year_m)
X_test['construction_year'] = X_test['construction_year'].fillna(construction_year_m)

population_m = X_train['population'].mean()
X_train['population'] = X_train['population'].fillna(population_m)
X_test['population'] = X_test['population'].fillna(population_m)

dropcols = [c for c in X_train if X_train[c].dtype == 'object']
        
o = OneHotEncoder(sparse=False, handle_unknown='ignore').fit(X_train[cols])
X_train = pd.concat([X_train.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_train[cols]), columns=o.get_feature_names(cols), index=X_train.index)], axis=1)
X_test = pd.concat([X_test.drop(columns=dropcols, axis=1), pd.DataFrame(o.transform(X_test[cols]), columns=o.get_feature_names(cols), index=X_test.index)], axis=1)

for c in invcols:
    X_train["{}(inv)".format(c)] = 1.0 / X_train[c]
    X_test["{}(inv)".format(c)] = 1.0 / X_test[c]
    
"""k = KBinsDiscretizer(n_bins=10, encode="onehot-dense", strategy="uniform").fit(X_train[numcols])
kcols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train[numcols]), columns=kcols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test[numcols]), columns=kcols, index=X_test.index)], axis=1)

k_geo = KBinsDiscretizer(n_bins=100, encode="onehot-dense", strategy="uniform").fit(X_train["longitude", "latitude"])
k_geocols = ["{}[{}]".format(c,i) for ind,c in enumerate(numcols) for i in range(k.n_bins_[ind])]
X_train = pd.concat([X_train, pd.DataFrame(k.transform(X_train["longitude", "latitude"]), columns=k_geocols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(k.transform(X_test["longitude", "latitude"]), columns=k_geocols, index=X_test.index)], axis=1)
"""

X = X.drop(["longitude", "latitude"], axis=1)

poly = PolynomialFeatures(degree=2).fit(X_train)
X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

clf = LogisticRegression(verbose=1, n_jobs=-1, random_state=42).fit(X_train, y_tr_c)
print(clf.score(X_test, y_te_c))

NameError: name 'latitude_gm' is not defined