In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from scipy.spatial.distance import euclidean
SEED = 10

In [2]:
# load the data from two files
dfX = pd.read_csv('../data/training_set_values.csv')
dfy = pd.read_csv('../data/training_set_labels.csv')
# concatenate the files
df = pd.concat([dfX, dfy.status_group], axis = 1)

# drop duplicates, singling out the id column
df.drop(df[df.duplicated(subset=df.columns.difference(['id']))].index, inplace=True)
# drop columns with missing values
df.dropna(axis='columns', inplace=True)
# drop id column and columns with problematic zero values
df.drop(columns=['id', 'num_private', 'construction_year', 'population'], inplace=True)
# convert region_code to string object
df.region_code = df.region_code.astype('string')
# convert district_code to string object
df.district_code = df.district_code.astype('string')
# drop columns related to lat/long and elevation (which might still prove to be useful)
df.drop(columns=['longitude', 'latitude', 'gps_height'], inplace=True)
# show row and column counts
df.drop(columns=['date_recorded'], inplace=True)
# show row and column counts


df.shape

(59364, 26)

In [3]:
labels = df.status_group
df.drop('status_group', axis=1, inplace=True)

In [4]:
# encode the status_group as 1s ('functional') and 0s ('non functional' or 'functional needs repair')
labels = labels.apply(lambda x: 1 if x == 'functional' else 0)
# # convert date_recorded to datetime object
# df.date_recorded = pd.to_datetime(df.date_recorded, format = "%Y-%m-%d")

In [5]:
# drop all categorical columns with more than 10 unique values
df.drop(columns = list(df.select_dtypes(include=['object']).loc[:, df.nunique() > 10].columns), inplace=True)
# show rows and columns
df.shape

(59364, 18)

In [6]:
# one-hot encode the categorical columns
one_hot_df = pd.get_dummies(df)
# show row and column counts
one_hot_df.shape

(59364, 96)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(one_hot_df, labels, test_size=0.25, random_state=42)

In [9]:
# calculate the mean and standard deviation for each feature within each class for the training set
train = pd.concat([X_train, y_train], axis=1)
aggs = train.groupby('status_group').agg(['mean', 'std'])
aggs

Unnamed: 0_level_0,amount_tsh,amount_tsh,basin_Internal,basin_Internal,basin_Lake Nyasa,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Tanganyika,...,waterpoint_type_group_communal standpipe,waterpoint_type_group_communal standpipe,waterpoint_type_group_dam,waterpoint_type_group_dam,waterpoint_type_group_hand pump,waterpoint_type_group_hand pump,waterpoint_type_group_improved spring,waterpoint_type_group_improved spring,waterpoint_type_group_other,waterpoint_type_group_other
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,...,mean,std,mean,std,mean,std,mean,std,mean,std
status_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,144.774412,1103.875974,0.121936,0.327219,0.064734,0.246061,0.053116,0.22427,0.121985,0.327276,...,0.540809,0.498344,0.0,0.0,0.243773,0.429368,0.007975,0.088947,0.206409,0.404738
1,455.539562,3483.675635,0.138998,0.345951,0.103102,0.304099,0.031187,0.173826,0.096493,0.295272,...,0.617828,0.485928,0.000207,0.01437,0.336197,0.472417,0.017101,0.129651,0.026313,0.160066


In [10]:
# Your code here
from scipy import stats
def p_x_given_class(obs_row, feature, class_):
    mu = aggs[feature]['mean'][class_]
    std = aggs[feature]['std'][class_]
    
    # Observation
    obs = obs_row[feature] 
    
    p_x_given_y = stats.norm.pdf(obs, loc=mu, scale=std)
    return p_x_given_y
p_x_given_class(X_train.iloc[0], X.columns[0], 0)

NameError: name 'X' is not defined