# Introduction

## Final Project Submission

***
- Student Name: Adam Marianacci
- Student Pace: Flex
- Scheduled project review date/time: TBD
- Instructor Name: Mark Barbour

# Business Understanding

It is my job to help the WWFA (Water Wells For Africa) locate wells that need to be repaired in Tanzania.

# Data Understanding

# Data Preperation

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels as sm
import sklearn.preprocessing as preprocessing
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Importing the dataframes
df_x = pd.read_csv('data/training_set_values.csv')
df_y = pd.read_csv('data/training_set_labels.csv')

In [4]:
# Combining the 2 dataframes into 1 new dataframe
Waterwells_df = pd.concat([df_y, df_x], axis=1)

In [5]:
# Previewing the dataframe
Waterwells_df.head()

Unnamed: 0,id,status_group,id.1,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,functional,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,functional,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,functional,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,non functional,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,functional,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


Dropping columns that are not directly related to the business problem and also have high cardinality, making them difficult to one hot encode.

In [6]:
# Dropping the id columns and other irrelevant columns from the dataframe
columns_to_drop = [
    'id', 'scheme_management', 'region', 'public_meeting', 'extraction_type', 'management', 
    'source_type', 'extraction_type_group', 'permit', 'funder',
    'date_recorded', 'installer', 'ward', 'scheme_name', 'wpt_name', 'lga', 'subvillage'
]

Waterwells_df = Waterwells_df.drop(columns_to_drop, axis=1, errors='ignore')


In [7]:
# Examining the dimensions of the dataframe
Waterwells_df.shape

(59400, 24)

In [8]:
# Checking for missing values and learning about the datatypes of the columns
Waterwells_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59400 entries, 0 to 59399
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   status_group           59400 non-null  object 
 1   amount_tsh             59400 non-null  float64
 2   gps_height             59400 non-null  int64  
 3   longitude              59400 non-null  float64
 4   latitude               59400 non-null  float64
 5   num_private            59400 non-null  int64  
 6   basin                  59400 non-null  object 
 7   region_code            59400 non-null  int64  
 8   district_code          59400 non-null  int64  
 9   population             59400 non-null  int64  
 10  recorded_by            59400 non-null  object 
 11  construction_year      59400 non-null  int64  
 12  extraction_type_class  59400 non-null  object 
 13  management_group       59400 non-null  object 
 14  payment                59400 non-null  object 
 15  pa

In [9]:
# Create a new column 'repair_status' by merging the two categories
Waterwells_df['repair_status'] = Waterwells_df['status_group'].replace(
    {'functional': 'does_not_need_repair', 'non functional': 'Needs Repair', 
     'functional but needs repair': 'Needs Repair'})

# Drop the original 'status_group' column
Waterwells_df.drop('status_group', axis=1, inplace=True)

#Display the updated DataFrame
Waterwells_df.head()



Unnamed: 0,amount_tsh,gps_height,longitude,latitude,num_private,basin,region_code,district_code,population,recorded_by,construction_year,extraction_type_class,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_class,waterpoint_type,waterpoint_type_group,repair_status
0,6000.0,1390,34.938093,-9.856322,0,Lake Nyasa,11,5,109,GeoData Consultants Ltd,1999,gravity,user-group,pay annually,annually,soft,good,enough,enough,spring,groundwater,communal standpipe,communal standpipe,does_not_need_repair
1,0.0,1399,34.698766,-2.147466,0,Lake Victoria,20,2,280,GeoData Consultants Ltd,2010,gravity,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,surface,communal standpipe,communal standpipe,does_not_need_repair
2,25.0,686,37.460664,-3.821329,0,Pangani,21,4,250,GeoData Consultants Ltd,2009,gravity,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,surface,communal standpipe multiple,communal standpipe,does_not_need_repair
3,0.0,263,38.486161,-11.155298,0,Ruvuma / Southern Coast,90,63,58,GeoData Consultants Ltd,1986,submersible,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,groundwater,communal standpipe multiple,communal standpipe,Needs Repair
4,0.0,0,31.130847,-1.825359,0,Lake Victoria,18,1,0,GeoData Consultants Ltd,0,gravity,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,surface,communal standpipe,communal standpipe,does_not_need_repair


In [10]:
# Defining X and y variables
y = Waterwells_df["repair_status"]
X = Waterwells_df.drop("repair_status", axis=1)

In [11]:
#Need to redefine y into a binary target array 'needs repair', 'does not need repair'

In [12]:
# Performing a train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

"scheme_management" contains missing values and is categorical.

In [13]:
# Looking at the number of missing values in each column
X_train.isna().sum()

amount_tsh               0
gps_height               0
longitude                0
latitude                 0
num_private              0
basin                    0
region_code              0
district_code            0
population               0
recorded_by              0
construction_year        0
extraction_type_class    0
management_group         0
payment                  0
payment_type             0
water_quality            0
quality_group            0
quantity                 0
quantity_group           0
source                   0
source_class             0
waterpoint_type          0
waterpoint_type_group    0
dtype: int64

In [14]:
#Defining categorical df
X_train_categorical = X_train.select_dtypes(include='object').copy()
X_train_categorical.head()


Unnamed: 0,basin,recorded_by,extraction_type_class,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_class,waterpoint_type,waterpoint_type_group
3607,Internal,GeoData Consultants Ltd,gravity,user-group,pay per bucket,per bucket,soft,good,insufficient,insufficient,spring,groundwater,communal standpipe,communal standpipe
50870,Internal,GeoData Consultants Ltd,handpump,user-group,never pay,never pay,soft,good,enough,enough,shallow well,groundwater,hand pump,hand pump
20413,Lake Rukwa,GeoData Consultants Ltd,other,user-group,never pay,never pay,soft,good,enough,enough,shallow well,groundwater,other,other
52806,Rufiji,GeoData Consultants Ltd,gravity,user-group,pay monthly,monthly,soft,good,insufficient,insufficient,river,surface,communal standpipe,communal standpipe
50091,Wami / Ruvu,GeoData Consultants Ltd,other,user-group,pay when scheme fails,on failure,salty,salty,enough,enough,shallow well,groundwater,other,other


In [15]:
#Defining numerical df
X_train_numerical = X_train.select_dtypes(include=["int64", "float64"]).copy()

In [16]:
Waterwells_df['repair_status']

0           does_not_need_repair
1           does_not_need_repair
2           does_not_need_repair
3                   Needs Repair
4           does_not_need_repair
5           does_not_need_repair
6                   Needs Repair
7                   Needs Repair
8                   Needs Repair
9           does_not_need_repair
10          does_not_need_repair
11          does_not_need_repair
12          does_not_need_repair
13          does_not_need_repair
14          does_not_need_repair
15          does_not_need_repair
16                  Needs Repair
17                  Needs Repair
18       functional needs repair
19          does_not_need_repair
20          does_not_need_repair
21          does_not_need_repair
22       functional needs repair
23          does_not_need_repair
24          does_not_need_repair
25       functional needs repair
26          does_not_need_repair
27          does_not_need_repair
28                  Needs Repair
29          does_not_need_repair
30        

In [17]:
ohe = OneHotEncoder(handle_unknown="ignore")

ohe.fit(X_train_categorical)

X_train_ohe = pd.DataFrame(
    ohe.transform(X_train_categorical),
    index=X_train_categorical.index,
    columns=np.hstack(ohe.categories_)
)
X_train_ohe

ValueError: Shape of passed values is (47520, 1), indices imply (47520, 86)

In [None]:
ohe = OneHotEncoder(handle_unknown="ignore")

ohe.fit(categorical_features)
X_train_ohe = pd.DataFrame(
    ohe.transform(categorical_features),
    # index is important to ensure we can concatenate with other columns
    index=categorical_features.index,
# we are dummying multiple columns at once, so stack the names
    columns=np.hstack(ohe.categorical_)
)
X_train_ohe

In [None]:
categorical_features.head()

In [None]:
ohe.X_train_categorical

In [None]:
# Creating a heatmap from the initial dataframe
fig, ax = plt.subplots(figsize=(10,10))
cor = Waterwells_df.corr()
sns.heatmap(cor,cmap="Blues",annot=True)

# Modeling

# Evaluation

# Conclusion

# Recommendations

# Limitations

# Next Steps