In [1]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

pd.set_option('max_columns', None)

## Common Code

In [2]:
X = pd.read_csv('../../data/training_set_values.csv')
y = pd.read_csv('../../data/training_set_labels.csv')

In [3]:
X_test = pd.read_csv('../../data/test_set_values.csv')
y_test = pd.read_csv('../../data/SubmissionFormat.csv')

In [4]:
y_test.head()

Unnamed: 0,id,status_group
0,50785,predicted label
1,51630,predicted label
2,17168,predicted label
3,45559,predicted label
4,49871,predicted label


In [5]:
def data_preprocess(df):
    return df

In [6]:
def generate_next_submission_fileid():
    files_found = []
    for file in os.listdir("../../data"):
        if file.startswith("water_pump_submission"):
            files_found.append(file[22:24])
    return f'{int(sorted(files_found).pop()) + 1 :02}'

In [7]:
def create_submission_file(pipeline, filename_comment):
    next_file_id = generate_next_submission_fileid()
    X_test_processed = data_preprocess(X_test)
    y_submit_pred = pipeline.predict(X_test_processed)
    y_test['status_group'] = y_submit_pred
    filename = f'../../data/water_pump_submission_{next_file_id}_{filename_comment}.csv'
    y_test.to_csv(filename, index = False)
    
    return y_submit_pred, filename

In [8]:
df = X.merge(y, left_on = 'id', right_on = 'id')
df.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


### Common Pipeline prep

In [9]:
num_cols = X.select_dtypes('number').columns.tolist()
num_cols.pop(0)
num_cols

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year']

In [10]:
cat_cols_raw = X.select_dtypes('object').columns.tolist()
cat_cols = [a for a in cat_cols_raw if len(X[a].unique()) <= 25]
display(cat_cols)
cat_cols_2 = cat_cols[:1]
display(cat_cols_2)

['basin',
 'region',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

['basin']

In [11]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [12]:
y_train.isnull().sum()

id              0
status_group    0
dtype: int64

In [18]:
model = RandomForestClassifier(random_state=42)

In [19]:
num_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median')),
                     ('scaler', StandardScaler())])

cat_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),
                     ('encoder', OneHotEncoder(handle_unknown = 'ignore', sparse = False))])

### Full Pipe

In [21]:
preprocessor = ColumnTransformer(transformers = [('cat', cat_pipe, cat_cols_2),
                                                 ('num', num_pipe, num_cols)],
                                remainder = 'drop')

In [22]:
pipe = Pipeline([('preprocessor', preprocessor),
                 ('model', model)])

In [23]:
pipe.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'int'

### Full Pipe troubleshooting

In [24]:
categorical_cols = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(cat_cols_2)
columns = np.append(categorical_cols, num_cols)

In [25]:
df_test = pd.DataFrame(preprocessor.transform(X_train),columns=columns)
display(df_test)
df_test.isnull().sum()

Unnamed: 0,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.084999,2.053863,0.204135,0.501977,-0.03807,0.322017,-0.480201,-0.041306,0.730627
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.100621,-0.965049,0.216991,-0.006643,-0.03807,-0.813147,0.037334,-0.379739,-1.370863
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.100621,-0.965049,-0.243403,-1.147020,-0.03807,-0.188807,0.037334,-0.379739,-1.370863
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.100621,-0.965049,-0.004716,-1.061741,-0.03807,-0.188807,0.140841,-0.379739,-1.370863
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.006889,0.511216,0.449866,-0.114078,-0.03807,-0.586114,-0.480201,-0.125914,0.729576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47515,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.211821,-0.493164,0.348070,-1.042904,-0.03807,-0.586114,-0.169680,0.159638,0.727472
47516,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.211821,1.550230,0.067327,-1.380887,-0.03807,-0.245565,-0.066173,-0.305707,0.720109
47517,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.100621,-0.983809,0.746842,0.096546,-0.03807,-0.642872,-0.066173,1.735464,0.737990
47518,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.100621,-0.965049,0.034454,0.882577,-0.03807,0.094984,0.037334,-0.379739,-1.370863


basin_Internal                   0
basin_Lake Nyasa                 0
basin_Lake Rukwa                 0
basin_Lake Tanganyika            0
basin_Lake Victoria              0
basin_Pangani                    0
basin_Rufiji                     0
basin_Ruvuma / Southern Coast    0
basin_Wami / Ruvu                0
amount_tsh                       0
gps_height                       0
longitude                        0
latitude                         0
num_private                      0
region_code                      0
district_code                    0
population                       0
construction_year                0
dtype: int64

### Num Pipe only

In [26]:
preprocessor_num = ColumnTransformer(transformers = [('num', num_pipe, num_cols)],
                                     remainder = 'drop')

In [27]:
preprocessor_num.fit(X_train);

In [28]:
num_columns = preprocessor_num.named_transformers_['num']['scaler'].mean_
num_columns

array([ 3.22047573e+02,  6.68745370e+02,  3.40913165e+01, -5.70500228e+00,
        5.04566498e-01,  1.53265152e+01,  5.63930976e+00,  1.79528283e+02,
        1.30335320e+03])

In [29]:
df_num = pd.DataFrame(preprocessor_num.transform(X_train), columns=num_cols)
df_num

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
0,-0.084999,2.053863,0.204135,0.501977,-0.03807,0.322017,-0.480201,-0.041306,0.730627
1,-0.100621,-0.965049,0.216991,-0.006643,-0.03807,-0.813147,0.037334,-0.379739,-1.370863
2,-0.100621,-0.965049,-0.243403,-1.147020,-0.03807,-0.188807,0.037334,-0.379739,-1.370863
3,-0.100621,-0.965049,-0.004716,-1.061741,-0.03807,-0.188807,0.140841,-0.379739,-1.370863
4,-0.006889,0.511216,0.449866,-0.114078,-0.03807,-0.586114,-0.480201,-0.125914,0.729576
...,...,...,...,...,...,...,...,...,...
47515,0.211821,-0.493164,0.348070,-1.042904,-0.03807,-0.586114,-0.169680,0.159638,0.727472
47516,0.211821,1.550230,0.067327,-1.380887,-0.03807,-0.245565,-0.066173,-0.305707,0.720109
47517,-0.100621,-0.983809,0.746842,0.096546,-0.03807,-0.642872,-0.066173,1.735464,0.737990
47518,-0.100621,-0.965049,0.034454,0.882577,-0.03807,0.094984,0.037334,-0.379739,-1.370863


In [30]:
df_num.isnull().sum()

amount_tsh           0
gps_height           0
longitude            0
latitude             0
num_private          0
region_code          0
district_code        0
population           0
construction_year    0
dtype: int64

In [31]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47520 entries, 0 to 47519
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   amount_tsh         47520 non-null  float64
 1   gps_height         47520 non-null  float64
 2   longitude          47520 non-null  float64
 3   latitude           47520 non-null  float64
 4   num_private        47520 non-null  float64
 5   region_code        47520 non-null  float64
 6   district_code      47520 non-null  float64
 7   population         47520 non-null  float64
 8   construction_year  47520 non-null  float64
dtypes: float64(9)
memory usage: 3.3 MB


No nulls, going ahead with num model only

In [32]:
pipe_num = Pipeline([('preprocessor_num', preprocessor_num),
                 ('model', model)])

In [33]:
pipe_num.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'int'

### Cat pipe only

In [34]:
preprocessor_cat = ColumnTransformer(transformers = [('cat', cat_pipe, cat_cols_2)],
                                     remainder = 'drop')

In [35]:
preprocessor_cat.fit(X_train);

In [36]:
categorical = preprocessor_cat.named_transformers_['cat']['encoder'].get_feature_names(cat_cols_2)
categorical

array(['basin_Internal', 'basin_Lake Nyasa', 'basin_Lake Rukwa',
       'basin_Lake Tanganyika', 'basin_Lake Victoria', 'basin_Pangani',
       'basin_Rufiji', 'basin_Ruvuma / Southern Coast',
       'basin_Wami / Ruvu'], dtype=object)

In [37]:
preprocessor_cat.fit(X_train)
df_cat = pd.DataFrame(preprocessor_cat.transform(X_train), columns=categorical)
df_cat

Unnamed: 0,basin_Internal,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
47515,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
47516,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47517,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
47518,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [38]:
df_cat.isnull().sum()

basin_Internal                   0
basin_Lake Nyasa                 0
basin_Lake Rukwa                 0
basin_Lake Tanganyika            0
basin_Lake Victoria              0
basin_Pangani                    0
basin_Rufiji                     0
basin_Ruvuma / Southern Coast    0
basin_Wami / Ruvu                0
dtype: int64

No nulls, procedding with cat only model

In [39]:
pipe_cat = Pipeline([('preprocessor_cat', preprocessor_cat),
                 ('model', model)])

In [40]:
pipe_cat.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'int'