# Introduction

## Final Project Submission

***
- Student Name: Adam Marianacci
- Student Pace: Flex
- Scheduled project review date/time: TBD
- Instructor Name: Mark Barbour

# Business Understanding

It is my job to help the WWFA (Water Wells For Africa) locate wells that need to be repaired in Tanzania.

# Data Understanding

# Data Preperation

In [15]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels as sm
import sklearn.preprocessing as preprocessing
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [17]:
# Importing the dataframes
df_x = pd.read_csv('data/training_set_values.csv')
df_y = pd.read_csv('data/training_set_labels.csv')

In [18]:
# Combining the 2 dataframes into 1 new dataframe
Waterwells_df = pd.concat([df_y, df_x], axis=1)

In [19]:
# Previewing the dataframe
Waterwells_df.head()

Unnamed: 0,id,status_group,id.1,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [20]:
# Dropping the id columns and other irrelevant columns from the dataframe
Waterwells_df = Waterwells_df.drop(['id', 'id', 'public_meeting', 'permit'], axis=1, errors='ignore')

In [21]:
# Examining the dimensions of the dataframe
Waterwells_df.shape

(59400, 38)

In [22]:
# Checking for missing values and learning about the datatypes of the columns
Waterwells_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 38 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   status_group           59400 non-null  object 
 1   amount_tsh             59400 non-null  float64
 2   date_recorded          59400 non-null  object 
 3   funder                 55765 non-null  object 
 4   gps_height             59400 non-null  int64  
 5   installer              55745 non-null  object 
 6   longitude              59400 non-null  float64
 7   latitude               59400 non-null  float64
 8   wpt_name               59400 non-null  object 
 9   num_private            59400 non-null  int64  
 10  basin                  59400 non-null  object 
 11  subvillage             59029 non-null  object 
 12  region                 59400 non-null  object 
 13  region_code            59400 non-null  int64  
 14  district_code          59400 non-null  int64  
 15  lg

In [23]:
# Defining X and y variables
y = Waterwells_df["status_group"]
X = Waterwells_df.drop("status_group", axis=1)

In [24]:
# Performing a train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

"Funder", "Installer", "subvillage", "scheme_management" and "scheme_name" all contain missing values and are all categorical.

In [25]:
# Looking at the number of missing values in each column
X_train.isna().sum()

amount_tsh                   0
date_recorded                0
funder                    2691
gps_height                   0
installer                 2700
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 282
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
recorded_by                  0
scheme_management         2922
scheme_name              21132
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_group                0
quantity                     0
quantity_group               0
source  

In [26]:
# Filling in missing values as "missing" to preserve data
X_train_fill_na = X_train.copy()
X_train_fill_na.fillna({"funder":"funder_missing", "installer":"installer_missing", "subvillage":"subvillage_missing", "public_meeting":"public_meeting_missing", "scheme_management":"scheme_management_missing", "scheme_name":"scheme_name_missing", "permit":"permit_missing"}, inplace=True)
X_train_fill_na.isna().sum()

amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
num_private              0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
recorded_by              0
scheme_management        0
scheme_name              0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_type              0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
d

In [52]:
X_train_fill_na.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44550 entries, 24947 to 56422
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             44550 non-null  float64
 1   date_recorded          44550 non-null  object 
 2   funder                 44550 non-null  object 
 3   gps_height             44550 non-null  int64  
 4   installer              44550 non-null  object 
 5   longitude              44550 non-null  float64
 6   latitude               44550 non-null  float64
 7   wpt_name               44550 non-null  object 
 8   num_private            44550 non-null  int64  
 9   basin                  44550 non-null  object 
 10  subvillage             44550 non-null  object 
 11  region                 44550 non-null  object 
 12  region_code            44550 non-null  int64  
 13  district_code          44550 non-null  int64  
 14  lga                    44550 non-null  object 
 15

In [53]:
categorical_features = Waterwells_df.select_dtypes(exclude=["int64", "float64"]).copy()
X_train_categorical = X_train_fill_na[categorical_features].copy()


ValueError: Boolean array expected for the condition, not object

In [51]:
ohe = OneHotEncoder(handle_unknown="ignore")

one_hot.fit(X_train_categorical)
X_train_ohe = pd.DataFrame(
    one_hot.transform(X_train_categorical),
    # index is important to ensure we can concatenate with other columns
    index=X_train_categorical.index,
# we are dummying multiple columns at once, so stack the names
    columns=np.hstack(ohe.categorical_features)
)
X_train_ohe

AttributeError: 'OneHotEncoder' object has no attribute 'categorical_features'

In [44]:
ohe.X_train_categorical

AttributeError: 'OneHotEncoder' object has no attribute 'X_train_categorical'

In [None]:
# Creating a heatmap from the initial dataframe
fig, ax = plt.subplots(figsize=(10,10))
cor = Waterwells_df.corr()
sns.heatmap(cor,cmap="Blues",annot=True)

# Modeling

# Evaluation

# Conclusion

# Recommendations

# Limitations

# Next Steps