# Machine Learning with Yellowbrick Visualizations
Author: Zach Fernandes
Date: 6/11/18
Desc: This notebook does some more machine learning with the med student data after reclassifying it: the target now includes low-pass grades in addition to fails.

## Part One: File Setup

### Import Data and Plotting Packages

In [1]:
import numpy    as np
import pandas   as pd
import re
import os
import sys
import seaborn as sns 
import yellowbrick as yb
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings

### Import Custom Modules

In [2]:
# Put the sibling custom_modules directory at the front of the import path
# so the project's helper module can be imported from this notebook.
wd = os.getcwd()
mod_dir = '{}/../custom_modules/'.format(wd)
sys.path.insert(0, mod_dir)
import cleaning_helpers as helpers

### Change those annoying Pandas Defaults

In [3]:
# Raise pandas' display limits so long/wide frames are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
# Suppress warning output for the rest of the session.
warnings.filterwarnings('ignore')

## Part Two: Import and Transform Data

### Read in the Data

In [4]:
# Name and directory of the input CSV; the directory is resolved relative
# to the working directory captured in `wd`.
file_name = 'master.csv'
path = '{}/../../../med_school_data/output/'.format(wd)

def load_data(csv, path):
    """
    Read a CSV file into a pandas data frame.

    The file location is built by plain string concatenation of ``path``
    and ``csv`` (in that order), so ``path`` must already end with a
    path separator.
    """
    return pd.read_csv(path + csv)

# Load the master CSV into `data`, then preview its first rows
# (the bare expression is what the notebook displays).
data = load_data(file_name, path)
data.head()

Unnamed: 0,student_id,m1_fall,mcat_zscore,mcat_total_attempts,bachelor_1,bachelor_2,bachelor_3,master_1,master_2,master_3,associate_1,associate_2,biochem_likelyhood,gem_indicator,race,race_indic,gender,m1f_limbs,m1f_metabolism_nutrition_and_endo,m1f_molecular_and_cell_physiology,m1f_molecular_and_human_genetics,m1s_cardio_pulmonary,m1s_ebm_and_population_health,m1s_gastrointestinal,m1s_head_neck_and_special_senses,m1s_medical_neuroscience,m1s_patients_populations_and_policy,m1s_physical_diagnosis_i,m1s_renal_and_electrolytes,m1s_sexual_dev__and_reproduction,m2s_clinical_skills_primer,m2s_evidence_based_medicine_ii,m2s_health_care_ethics,m2s_human_sexuality,m2s_lab_medicine_pblm_solving_case,m2s_microbiology_and_immunology,m2s_pathology,m2s_pharmacology,m2s_physical_diagnosis_ii,m2s_psychiatry,step1_raw_score,step1_z_score,step1_pass_indicator,repeat_indic,dropout_indic,step1_total_attempts,target_indicator,double_bachelor,master_degree,double_master,associate_degree,science_undergrad,science_master
0,30705306,200930,2.505529,1.0,Science,,,Science,,,,,4.0,0.0,White,5.0,M,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,,1.0,4.0,4.0,3.0,2.0,4.0,3.0,4.0,4.0,4.0,4.0,1.0,3.0,269.0,2.17888,1.0,0.0,0.0,1.0,0,0,1,0,0,1,1
1,22721624,200930,-1.407328,2.0,Science,,,Science,,,Science,,1.0,1.0,Black or African American,3.0,F,2.0,3.0,2.0,2.0,4.0,2.0,4.0,2.0,2.0,,1.0,4.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,210.0,-0.836599,1.0,0.0,0.0,1.0,0,0,1,0,1,1,1
2,35604291,200930,0.745614,1.0,Arts,,,,,,,,3.0,0.0,White,5.0,M,2.0,2.0,3.0,2.0,2.0,4.0,2.0,2.0,3.0,,1.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0,2.0,241.0,0.747805,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0
3,23250759,200930,0.149933,1.0,Science,,,,,,,,4.0,0.0,White,5.0,M,2.0,2.0,2.0,2.0,3.0,3.0,3.0,4.0,3.0,,1.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0,224.0,-0.121062,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0
4,23269092,200930,-0.973841,1.0,Science,,,,,,,,4.0,1.0,Asian,2.0,F,2.0,2.0,2.0,2.0,,2.0,,2.0,2.0,,1.0,,2.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,197.0,-1.501027,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0


### Write Transformers

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

class EncodeCategorical(BaseEstimator, TransformerMixin):
    """
    Label-encodes a specified list of columns, or all columns if None.

    One ``LabelEncoder`` is fitted per column in ``fit`` and stored in
    ``self.encoders`` (a dict keyed by column name); ``transform`` applies
    those fitted encoders to a copy of the data frame it is given.
    """

    def __init__(self, columns=None):
        """
        columns: iterable of column names to encode, or None to encode
        every column of the data frame passed to ``fit``.
        """
        # Keep None as the "all columns" marker; iterating it here used to
        # raise TypeError and made the documented default unusable.
        self.columns = None if columns is None else list(columns)
        self.encoders = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to encode.

        ``target`` is accepted for pipeline compatibility and is not used.
        Returns ``self``.
        """
        # Resolve the column set locally instead of overwriting
        # self.columns, so an encoder created with columns=None follows the
        # columns of whichever frame it is (re)fitted on.
        columns = data.columns if self.columns is None else self.columns

        # Fit a label encoder for each column in the data frame
        self.encoders = {
            column: LabelEncoder().fit(data[column])
            for column in columns
        }
        return self

    def transform(self, data):
        """
        Uses the fitted encoders to transform a data frame.

        Returns a copy of ``data`` in which every fitted column is replaced
        by its encoded values; ``data`` itself is not modified. ``fit``
        must have been called first.
        """
        output = data.copy()
        for column, encoder in self.encoders.items():
            output[column] = encoder.transform(data[column])

        return output

In [18]:
# Preview the first rows of `data` again.
data.head()


Unnamed: 0,student_id,m1_fall,mcat_zscore,mcat_total_attempts,bachelor_1,bachelor_2,bachelor_3,master_1,master_2,master_3,associate_1,associate_2,biochem_likelyhood,gem_indicator,race,race_indic,gender,m1f_limbs,m1f_metabolism_nutrition_and_endo,m1f_molecular_and_cell_physiology,m1f_molecular_and_human_genetics,m1s_cardio_pulmonary,m1s_ebm_and_population_health,m1s_gastrointestinal,m1s_head_neck_and_special_senses,m1s_medical_neuroscience,m1s_patients_populations_and_policy,m1s_physical_diagnosis_i,m1s_renal_and_electrolytes,m1s_sexual_dev__and_reproduction,m2s_clinical_skills_primer,m2s_evidence_based_medicine_ii,m2s_health_care_ethics,m2s_human_sexuality,m2s_lab_medicine_pblm_solving_case,m2s_microbiology_and_immunology,m2s_pathology,m2s_pharmacology,m2s_physical_diagnosis_ii,m2s_psychiatry,step1_raw_score,step1_z_score,step1_pass_indicator,repeat_indic,dropout_indic,step1_total_attempts,target_indicator,double_bachelor,master_degree,double_master,associate_degree,science_undergrad,science_master
0,30705306,200930,2.505529,1.0,Science,,,Science,,,,,4.0,0.0,White,5.0,M,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,,1.0,4.0,4.0,3.0,2.0,4.0,3.0,4.0,4.0,4.0,4.0,1.0,3.0,269.0,2.17888,1.0,0.0,0.0,1.0,0,0,1,0,0,1,1
1,22721624,200930,-1.407328,2.0,Science,,,Science,,,Science,,1.0,1.0,Black or African American,3.0,F,2.0,3.0,2.0,2.0,4.0,2.0,4.0,2.0,2.0,,1.0,4.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,210.0,-0.836599,1.0,0.0,0.0,1.0,0,0,1,0,1,1,1
2,35604291,200930,0.745614,1.0,Arts,,,,,,,,3.0,0.0,White,5.0,M,2.0,2.0,3.0,2.0,2.0,4.0,2.0,2.0,3.0,,1.0,2.0,3.0,3.0,3.0,3.0,4.0,4.0,2.0,2.0,2.0,1.0,2.0,241.0,0.747805,1.0,0.0,0.0,1.0,0,0,0,0,0,0,0
3,23250759,200930,0.149933,1.0,Science,,,,,,,,4.0,0.0,White,5.0,M,2.0,2.0,2.0,2.0,3.0,3.0,3.0,4.0,3.0,,1.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,1.0,2.0,224.0,-0.121062,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0
4,23269092,200930,-0.973841,1.0,Science,,,,,,,,4.0,1.0,Asian,2.0,F,2.0,2.0,2.0,2.0,,2.0,,2.0,2.0,,1.0,,2.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0,197.0,-1.501027,1.0,0.0,0.0,1.0,0,0,0,0,0,1,0


In [34]:
# Fit a LabelEncoder on the mcat_zscore column and encode it in one step.
# NOTE(review): mcat_zscore looks like a continuous score in the preview
# above, so the integer codes produced here replace the actual values;
# presumably this is only an experiment - confirm before reusing it.
test = LabelEncoder().fit_transform(data['mcat_zscore'])

In [35]:
# Display the encoded array.
test

array([187,  38, 125, ..., 284, 375, 450])

53

53