In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from itertools import product
import math
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as accuracy
import graphviz 
from datetime import datetime, timedelta

# personal modules
import ml_pipeline_lch as ml
import ml_explore as exp
import ml_modeling as md


sns.set(style = "ticks", color_codes = True)
%matplotlib inline 

### Read and Pre-Process Data

#### Read Data

In [2]:
outcomes_df = ml.retrieve_data(filename = 'data/outcomes.csv', headers = True, set_ind = 0)

In [None]:
projects_df = ml.retrieve_data(filename = 'data/projects.csv', headers = True, set_ind = 0)

In [None]:
project_info = pd.merge(projects_df, outcomes_df, how = 'outer', right_index = True, left_index = True)

In [None]:
project_info.head()

In [None]:
project_info.date_posted.dtype

In [None]:
project_info['date_posted'] = ml.convert_dates(project_info['date_posted'])

In [None]:
project_info['year'] = project_info['date_posted'].apply(lambda x: x.year)

In [None]:
# Restrict the analysis to projects posted in these years. The boolean mask
# selects ROWS of the frame; rows whose year is missing do not match and are
# dropped as well.
YEARS_OF_INTEREST = [2011, 2012, 2013]
project_info = project_info[project_info['year'].isin(YEARS_OF_INTEREST)]

In [None]:
pd.Series(project_info.columns)

In [None]:
project_info.describe().round(3).transpose()

In [None]:
# create quick reference for accessing columns
col_ref = ml.create_col_ref(project_info)
col_ref

In [None]:
# check column types
project_info.dtypes

In [None]:
maxes = ml.view_max_mins(project_info, max = True)
maxes.tail().round(2)

In [None]:
likely_outliers_upper = ml.view_likely_outliers(project_info)
likely_outliers_upper.tail()

In [None]:
mins = ml.view_max_mins(project_info, max = False)
mins

In [None]:
likely_outliers_lower = ml.view_likely_outliers(project_info, max = False)
likely_outliers_lower

In [None]:
# Work on an explicit, independent copy so the cleaning steps applied to
# manip_df never touch project_info, which stays available for comparison.
manip_df = project_info.copy()

In [None]:
# Clean implausible values in the two price columns and in students_reached.
# NOTE(review): the exact rule applied for min_val / lwr_threshold lives in
# ml.remove_over_under_threshold -- confirm there. Both price columns share
# the same settings; students_reached uses a different lower threshold.
price_threshold_args = dict(min_val=0, max_val=False, lwr_threshold=0.001, upr_threshold=False)
ml.remove_over_under_threshold(manip_df, col='total_price_excluding_optional_support', **price_threshold_args)
ml.remove_over_under_threshold(manip_df, col='total_price_including_optional_support', **price_threshold_args)
ml.remove_over_under_threshold(
    manip_df,
    col='students_reached',
    min_val=0,
    max_val=False,
    lwr_threshold=0.005,
    upr_threshold=False,
)


In [None]:
# check that expected values were removed
likely_outliers_lower_post = ml.view_likely_outliers(manip_df, max = False)
likely_outliers_lower_post

In [None]:
ml.print_null_freq(manip_df)

In [None]:
def record_nulls(df):
    """Add 0/1 "<column>_was_null" indicator columns to ``df`` in place.

    An indicator is added only for columns that actually contain at least one
    null value, so no all-zero indicator columns are created and genuine
    feature columns are never removed.

    Calling the function again on the same frame is safe: indicator columns
    hold no nulls themselves, so they do not spawn further indicators, and
    existing indicators are simply recomputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place.

    Returns
    -------
    None
    """
    # Snapshot the labels first: new columns are appended while looping.
    for col in list(df.columns):
        null_mask = df[col].isnull()
        if null_mask.any():
            # str() keeps this working for non-string column labels.
            df[str(col) + "_was_null"] = null_mask.astype(int)

In [None]:
record_nulls(manip_df)

In [None]:
manip_df.head()

In [None]:
# Outcome flags for which a missing value most likely means zero.
ZERO_FILL_COLS = [
    "at_least_1_green_donation",
    "at_least_1_teacher_referred_donor",
    "donation_from_thoughtful_donor",
    "fully_funded",
    "great_chat",
    "is_exciting",
    "one_non_teacher_referred_donor_giving_100_plus",
    "three_or_more_non_teacher_referred_donors",
]

# Fill the nulls in each of those columns using the helper's "zeros" method.
for flag_col in ZERO_FILL_COLS:
    ml.basic_fill_vals(manip_df, flag_col, method="zeros")



In [None]:
ml.print_null_freq(manip_df)

In [None]:
exp.view_dist(manip_df, geo_columns= True,
             fig_size = (20, 20),
             labels = ["Feature Distributions", "Feature", "Frequency"])


In [None]:
exp.view_dist(project_info, geo_columns= True,
             fig_size = (20, 20),
             labels = ["Feature Distributions", "Feature", "Frequency"])
