##Imports

In [1]:
!pip install pandas_profiling
!pip install category_encoders



In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import category_encoders as ce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [3]:
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/test_features.csv
!wget https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/sample_submission.csv

--2019-07-23 19:10:52--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_features.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20054664 (19M) [text/plain]
Saving to: ‘train_features.csv.2’


2019-07-23 19:10:53 (117 MB/s) - ‘train_features.csv.2’ saved [20054664/20054664]

--2019-07-23 19:10:53--  https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/train_labels.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1148327 (1.1M) [text/plain]
Savin

In [0]:
import pandas_profiling as pp

##Fetch

In [0]:
#load dataframes
X = pd.read_csv('train_features.csv')
Y = pd.read_csv('train_labels.csv')

#Drop id's
Y = Y.status_group

In [0]:
#pp.ProfileReport(X)

##Baseline

In [7]:
#Split data
X_train, X_val, y_train, y_val = train_test_split(X, Y,random_state = 42,stratify = Y)
    
#Baseline
y_train.value_counts(normalize = True)

functional                 0.543075
non functional             0.384242
functional needs repair    0.072682
Name: status_group, dtype: float64

In [8]:
#get nums from df
numericals = X.select_dtypes('number').columns.to_list()
print(numericals)

['id', 'amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 'population', 'construction_year']


In [9]:
model = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 350)
model.fit(X_train[numericals],y_train)
model.score(X_val[numericals], y_val)



0.5484848484848485

In [10]:
#plot coefficent values for each label
fig = plt.figure(figsize = (30,5), )

ax = fig.add_subplot(131)
coeffs = pd.Series(model.coef_[0], numericals)
ax = coeffs.sort_values().plot.barh()
ax.set_title(model.classes_[0], color = 'g')

ax1 = fig.add_subplot(132)
coeffs = pd.Series(model.coef_[1], numericals)
ax1 = coeffs.sort_values().plot.barh()
ax1.set_title(model.classes_[1], color = 'y')

ax2 = fig.add_subplot(133)
coeffs = pd.Series(model.coef_[2], numericals)
ax2 = coeffs.sort_values().plot.barh()
ax2.set_title(model.classes_[2], color = 'r')


Text(0.5, 1.0, 'non functional')

It looks like longitude and region code have some of the largest effects on the model based on the coefficients

##Cleaning


In [11]:
#Check if there are any coordinates in data that are outside of tanzania

#28 is west most and 42 is east most boundary  
print(X[(X.longitude <28) | (X.longitude > 42)].shape[0])

#0 is top most and -12 is lowest boundary
print(X[(X.latitude > 0) | (X.latitude < -12)].shape[0]) 

1812
0


In [0]:
def get_elevations(x):
#store each region and gps height in dict
  elevations = {}
  for region in x.region.unique():
    elevations[region] = x[x.region == region]['gps_height'].median()

#These looks to be the problem areas. The median values for these regions should not be zeo
  zero_regions = ['Dodoma', 'Kagera', 'Mbeya', 'Mwanza', 'Shinyanga', 'Tabora']

#I looked up the elevations and imputed the data
  elevations['Dodoma'] = 1118
  elevations['Kagera'] = 1500
  elevations['Mbeya'] = 1700
  elevations['Mwanza'] = 1140
  elevations['Shinyanga'] = 1128
  elevations['Tabora'] = 1191

#Use new values to clean zeroes in df based on imputed values
  for key in elevations:
    if key in zero_regions:
      x.loc[x.region == key, 'gps_height'] = elevations[key]
  
  return x

In [0]:
def get_dates(x):
  #reassgin as datetime object
  x.date_recorded = pd.to_datetime(x.date_recorded,
                                   infer_datetime_format= True)
  
  #split values and add to df
  x['year'] = x.date_recorded.dt.year
  x['month'] = x.date_recorded.dt.month
  x['day_of_week'] = x.date_recorded.dt.dayofweek
  
  #drop dates(not working with standard scaler?)
  x = x.drop('date_recorded', axis = 1)
  
  return x

In [0]:
def clean(df):
  x = df
  
#Clean zeroes, I will infer zero equate to missing data that cant be imputed
  zeroes = ['construction_year', 'longitude', 'population']
  for col in zeroes:
    x[col] = x[col].replace(0, np.nan)  

#eliminate null island values
  x['latitude'] = x['latitude'].replace(-2e-08, np.nan)
  x['longitude'] = x['longitude'].replace(-2e-08, np.nan)

#For consistency, replace latitude values with NaN where longitude contains NaN
  x['latitude'] = np.where(x.longitude.isnull(), np.NaN, x.latitude)
  
#reassign date values
  x = get_dates(x)
  
#impute gps data
  x = get_elevations(x)

#drop items both high card and duplicates
  dropping = ['quantity_group', 'recorded_by','subvillage', 'wpt_name',
              'management_group']  
  x = x.drop(dropping, axis = 1)

#reassign bool values
  x['permit'] = np.where(x.permit == True , 1,0)
  x['public_meeting'] = np.where(x.public_meeting == True , 1,0)

  return x


In [0]:
#clean, encode, and scale data
def transform(df):
  X = clean(df)
  
    #encode objects
  for col in X.select_dtypes('object').columns.tolist():
    #One hot encode low card data
    if len(X[col].unique()) <=10:
      encode = ce.OneHotEncoder()
      X = X.join(encode.fit_transform(X[col].astype('str')))
      X = X.drop(col, axis = 1)
    #Label encode high card data  
    else:
      le = LabelEncoder()
      X[col] = le.fit_transform(X[col].astype('str')) 

  #Scale data
  X = StandardScaler().fit_transform(X)
  
  return X

In [0]:
def dropNA(X, **kwargs):
  #if a target vector is supplied, store a copy in Y
  if 'y' in kwargs:
    Y = kwargs['y'].copy()
    
    #Combine data for a consistent drop
    X = X.join(Y)

    #drop NANs
    X = X.dropna()

    #Split data
    Y = X.pop('status_group')
  
    return X,Y
  else:
    X = X.dropna()
    
    return X

##Preprocessing & Preliminary Models

In [0]:
def split_data(X,Y):  
  #Split data
  X_train, X_val, y_train, y_val = train_test_split(X, Y,
                                              random_state = 42,stratify = Y)
  
  return [X_train, X_val, y_train, y_val]

In [0]:
#Pipeline to encode, scale, split 
#if no kwargs are supplied, will only transform data
#y = target vector - is optional but required to run split function
#dropNA = True - dropNA for df or both df and y
#split_data = True - perform train test split and return list of split data

def pipeline(df, **kwargs):
  target_supplied = False
  
  #copy and transform data
  x = pd.DataFrame(transform(df.copy()))
  
  #check kwargs data for functions inputs
  function = {'dropNA': False, 'split_data': False}
  for key in function:
    if key in kwargs and kwargs[key]:
      function[key] = True
  
  #if a target vector is supplied, store a copy in Y
  if 'y' in kwargs:
    Y = kwargs['y'].copy()
    target_supplied = True
    
  #if dropNA is requested, run function dropNA with target vector
  if function['dropNA'] and target_supplied:
    x,Y = dropNA(x, y = Y)
  #if dropNA is requested with no target data, just update X
  elif function['dropNA'] and not target_supplied:
    x = dropNA(x)
    
  #if split data is requested, run function split_data(requires target to be true)
  if function['split_data'] and target_supplied:
    return split_data(x,Y)
  
  if target_supplied:
    return x,Y
  else:
    return x

In [0]:
#test and score each model
def model_score(model, data):
  X_train = data[0] 
  X_val = data[1]
  y_train = data[2]
  y_val = data[3]
  
  model.fit(X_train,y_train)
  y_pred = model.predict(X_val)
  
  return accuracy_score(y_val, y_pred)

In [41]:
#run dfs through pipeline
data = pipeline(X, y = Y, dropNA = True, split_data = True)

#instantiate models
dt = DecisionTreeClassifier()
gbm = GradientBoostingClassifier()
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()

#get and print scores
print(f'Tree - {model_score(dt, data)}')
print(f'Boost - {model_score(gbm, data)}')
print(f'Neighbor - {model_score(knn, data)}')
print(f'Forest - {model_score(rfc, data)}')

Tree - 0.7436803770351328
Boost - 0.7674592973436161
Neighbor - 0.7737789203084833




Forest - 0.797343616109683


Its looks like Random Forest will give us a good score right from the start. 

##Feature Engineering

##Submission


In [0]:
#fetch test data for submission
test = pd.read_csv('test_features.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [0]:
#clean and preprocess data
test = pipeline(test, dropNA = True)
X_pip, Y = pipeline(X, y = Y, dropNA = True)

In [0]:
forest = RandomForestClassifier(n_estimators=100)

forest.fit(X_pip, Y)
y_pred = forest.predict(test)

In [0]:
from google.colab import files

submission = sample_submission.copy()
submission['status_group'] = y_pred
submission.to_csv('submission-01.csv', index=False)


files.download('submission-01.csv')