In [None]:
import os
import time

import git
import matplotlib as plt
import matplotlib.pyplot as pyplot
import numpy as np
import pandas as pd
import seaborn as sns

def get_git_root(path):
    """Return the working directory of the git repository that contains `path`.

    The repository is opened with ``search_parent_directories=True``, so
    `path` may be any directory inside the working tree.
    """
    return git.Repo(path, search_parent_directories=True).working_dir

In [None]:
# Resolve every input path relative to the repository root so the notebook
# does not depend on the directory the kernel was started in.
top_level_git_dir = get_git_root(os.getcwd())
raw_data_dir = os.path.join(top_level_git_dir, "data", "raw")

train_csv_path = os.path.join(raw_data_dir, "train_values.csv")
test_csv_path = os.path.join(raw_data_dir, "test_values.csv")
train_labels_csv_path = os.path.join(raw_data_dir, "train_labels.csv")
submission_format_csv_path = os.path.join(raw_data_dir, "submission_format.csv")

# Train/test values are indexed by row_id; labels and the submission
# template are indexed by process_id.
train_df = pd.read_csv(train_csv_path, index_col="row_id")
train_labels_df = pd.read_csv(train_labels_csv_path, index_col="process_id")
test_df = pd.read_csv(test_csv_path, index_col="row_id")
# The loaded frame gets its own name. It used to be assigned back to
# `submission_format_csv_path`, leaving a DataFrame behind a "path" variable.
submission_format_df = pd.read_csv(submission_format_csv_path, index_col="process_id")


In [None]:
# Per-process response: the sum, over the rows flagged target_time_period,
# of max(0, return_flow) * return_turbidity.
# One grouped sum replaces re-filtering the whole frame once per process id.
target_rows = train_df[train_df.target_time_period == True].assign(
    final_phase_turbidity=lambda rows: np.maximum(0, rows.return_flow) * rows.return_turbidity
)
response_by_process = (
    target_rows
    .groupby("process_id", sort=False)["final_phase_turbidity"]
    .sum()
    # Processes with no target-period rows still get an entry (an empty sum,
    # i.e. 0), and keys stay in order of first appearance in train_df.
    .reindex(train_df.process_id.unique(), fill_value=0.0)
)
response_dict = response_by_process.to_dict()

In [None]:
# Preview a few entries instead of dumping one line per process into the output.
pd.Series(response_dict, name="final_phase_turbidity").head(10)

In [None]:
def prep_full_df(df):
    """Return a prepared copy of the raw values frame; `df` itself is not modified.

    The returned frame has:
      * ``timestamp`` parsed with ``pd.to_datetime``;
      * ``turbidity_in_liters`` = max(0, return_flow) * return_turbidity;
      * ``process_phase`` = "<process_id>_<phase>";
      * every row whose ``phase`` equals "final_rinse" removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns timestamp, return_flow, return_turbidity,
        process_id and phase.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by
        ``turbidity_in_liters`` and ``process_phase``.
    """
    # assign() builds a new frame, so the caller's `timestamp` column keeps
    # its original dtype (it used to be overwritten in place).
    prepped = df.assign(
        timestamp=pd.to_datetime(df["timestamp"]),
        turbidity_in_liters=np.maximum(0, df.return_flow) * df.return_turbidity,
        process_phase=df.process_id.astype(str) + "_" + df.phase.astype(str),
    )

    return prepped[prepped.phase != "final_rinse"]

In [None]:
def prep_metadata(df):
    """Build one row of process-level metadata per process_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns process_id, pipeline and phase.

    Returns
    -------
    pandas.DataFrame
        Indexed by process_id, with one boolean ``pipeline_<value>`` indicator
        per pipeline value, a ``pipeline_L12`` column that is always present,
        and ``num_phases`` (number of distinct phases of the process).
    """
    meta_df = df[["process_id", "pipeline"]].drop_duplicates().set_index("process_id")
    meta_df = pd.get_dummies(meta_df)

    # get_dummies names the indicators "pipeline_<value>", so the check has to
    # use the prefixed name. Testing for the bare 'L12' was always true and
    # overwrote a real L12 indicator with False.
    # NOTE(review): presumably L12 is absent from some inputs (e.g. the test
    # set) and the column is added so feature matrices line up - confirm.
    if "pipeline_L12" not in meta_df.columns:
        meta_df["pipeline_L12"] = False

    for col in meta_df.columns:
        if "pipeline" in col:
            meta_df[col] = meta_df[col].astype(bool)

    # Aligned on the process_id index.
    meta_df["num_phases"] = df.groupby("process_id")["phase"].nunique()
    return meta_df


In [None]:
# Default columns summarised per process by prep_time_series_features.
# 'process_id' is the grouping key; the rest are aggregated.
ts_cols = [
    'process_id',
    'timestamp',
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_level_pre_rinse',
    'tank_level_caustic',
    'tank_level_acid',
    'tank_level_clean_water',
    'tank_temperature_pre_rinse',
    'tank_temperature_caustic',
    'tank_temperature_acid',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    "turbidity_in_liters"
]

def prep_time_series_features(df, columns = None):
    """Aggregate the time-series columns of `df` into one row per process_id.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``process_id``, ``timestamp`` and every selected column.
    columns : list of str, optional
        Columns to aggregate. Defaults to the module-level ``ts_cols``.
        ``process_id`` is added when missing because it is the grouping key.
        (This argument used to be accepted but ignored.)

    Returns
    -------
    pandas.DataFrame
        Indexed by process_id: ``process_duration`` (latest minus earliest
        timestamp) followed by min / max / mean / std / last_five_mean of each
        selected column. Aggregated columns are labelled with
        ``(column, statistic)`` tuples.
    """
    if columns is None:
        columns = ts_cols
    selected_cols = list(columns)
    if 'process_id' not in selected_cols:
        selected_cols.insert(0, 'process_id')

    # Sort by time within each process so "last five" means the five latest rows.
    df = df.sort_values(by=["process_id", "timestamp"], ascending=True)
    timestamps_by_process = df.groupby('process_id')["timestamp"]
    process_duration = (timestamps_by_process.max() - timestamps_by_process.min()).rename('process_duration')

    ts_df = df[selected_cols].set_index('process_id')

    # define fxn before calling in .agg to make col name more descriptive (in place of <lambda>)
    def last_five_mean(x):
        return x.tail(5).mean()

    ts_features_agg_df = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', last_five_mean])

    ts_features_df = pd.concat([process_duration, ts_features_agg_df], axis = 1)
    return ts_features_df

In [None]:
def clean_feature_df(df):
    """Flatten the column labels of `df` to single-level names, in place.

    Tuple labels such as ``("supply_flow", "min")`` become
    ``"supply_flow_min"`` (all parts joined with "_"). Every other label is
    kept unchanged; non-string labels used to be dropped from the new name
    list, which made the column assignment fail with a length mismatch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are rewritten.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for call chaining.
    """
    df.columns = [
        "_".join(str(part) for part in col) if isinstance(col, tuple) else col
        for col in df.columns
    ]

    return df

In [None]:
def create_feature_matrix(df):
    """Build the per-process feature matrix from a raw values frame.

    Runs `prep_full_df`, derives the metadata and time-series feature frames
    from its result, concatenates them column-wise, and returns the result
    after `clean_feature_df` has flattened the column names.
    """
    prepped = prep_full_df(df)
    feature_parts = [
        prep_metadata(prepped),
        prep_time_series_features(prepped),
    ]
    return clean_feature_df(pd.concat(feature_parts, axis=1))

In [None]:
# Build the per-process feature matrix from the training values.
train_features_df = create_feature_matrix(train_df)

In [None]:
# Keep only the labels whose index value also appears in the feature matrix.
# TODO: figure out why 16 indices dropped out of train_features_df
indices_to_keep = list(set(train_labels_df.index) & set(train_features_df.index))
train_labels_df = train_labels_df.loc[train_labels_df.index.isin(indices_to_keep)]

In [None]:
# Join the labels onto the feature rows by index, then preview the first rows.
train_features_w_response = train_features_df.join(train_labels_df)
train_features_w_response.head()

In [None]:
# The figure gets its own name. Assigning it to `plt` replaced the matplotlib
# alias imported at the top of the notebook, which later cells still call as
# `plt.pyplot...`.
pipeline_l3_violin_fig = sns.violinplot(
    x='pipeline_L3',
    y='final_rinse_total_turbidity_liter',
    data=train_features_w_response,
).get_figure()
pipeline_l3_violin_fig.savefig("output.png")


In [None]:
# Box plot of the response for each num_phases value.
ax = sns.boxplot(x="num_phases", y="final_rinse_total_turbidity_liter", data=train_features_w_response)

In [None]:
# Count the rows of the feature frame for each num_phases value.
a = pd.crosstab(index=train_features_w_response["num_phases"], columns="count")
a

In [None]:
# Scatter of supply_pressure_max against the response, drawn with pandas plotting.
pressure_scatter_ax = train_features_w_response.plot.scatter(
    x='supply_pressure_max',
    y='final_rinse_total_turbidity_liter',
)
pressure_scatter_ax.set(
    title='Title',
    xlabel='supply_pressure_max',
    ylabel='final_rinse_total_turbidity_liter',
)

In [None]:
# x and y are passed by keyword (as get_continuous_plot does), and the grid
# gets its own name instead of rebinding `plt`, the notebook's matplotlib alias.
# NOTE(review): `size` was renamed to `height` in seaborn 0.9 - confirm which
# spelling the installed seaborn version accepts.
pressure_scatter_grid = sns.lmplot(
    x="supply_pressure_max",
    y="final_rinse_total_turbidity_liter",
    fit_reg=False,
    size=8,
    data=train_features_w_response,
)
ax = pressure_scatter_grid.axes.flatten()
title = ax[0].set_title("Title")


In [None]:
# Distribution plot of turbidity_in_liters_mean; keeps a handle on the figure.
sns_plot = sns.distplot(train_features_w_response["turbidity_in_liters_mean"]).get_figure()


In [None]:
# Print every column name next to its dtype.
for column, column_dtype in train_features_w_response.dtypes.items():
    print(column, column_dtype)

In [None]:
# Cast every column whose name contains "pipeline" to bool.
pipeline_columns = [name for name in train_features_w_response.columns if "pipeline" in name]
for pipeline_column in pipeline_columns:
    train_features_w_response[pipeline_column] = train_features_w_response[pipeline_column].astype(bool)

In [None]:
def get_categorical_plot(df, col_name, col_type, response_var):
    """Save a violin plot of `response_var` grouped by the values of `col_name`.

    The image is written to
    ``<top_level_git_dir>/src/visualizations/<col_name>/violin_plot.png``;
    the folder is created when it does not exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col_name` and `response_var`.
    col_name : str
        Column shown on the x axis.
    col_type : dtype or str
        dtype of `col_name`. For "bool" the title reports how many rows are
        False and how many are True; otherwise the title is the column name.
    response_var : str
        Column shown on the y axis.

    Returns
    -------
    None
    """
    col_plot_output_folder = os.path.join(top_level_git_dir, "src", "visualizations", col_name)
    os.makedirs(col_plot_output_folder, exist_ok=True)

    if col_type == "bool":
        values = df[col_name]
        # Counted directly so a column with only one of the two values cannot
        # raise a KeyError.
        false_ct = int((values == False).sum())
        true_ct = int((values == True).sum())
        # The old format string put col_name in the "False" slot, false_ct in
        # the "True" slot, and never used true_ct.
        plot_title = "{} (False: {} / True: {})".format(col_name, false_ct, true_ct)
    else:
        plot_title = col_name

    violin_ax = sns.violinplot(x=col_name, y=response_var, data=df)
    violin_ax.set_title(plot_title)
    violin_fig = violin_ax.get_figure()
    violin_fig.savefig(os.path.join(col_plot_output_folder, "violin_plot.png"))
    # Uses `pyplot` rather than `plt.pyplot`, because other cells rebind `plt`.
    # Closing this exact figure keeps a loop over many columns from piling up
    # open figures.
    pyplot.close(violin_fig)
    return

def get_continuous_plot(df, col_name, response_var):
    """Save a scatter plot and a distribution plot for the column `col_name`.

    Both images are written to
    ``<top_level_git_dir>/src/visualizations/<col_name>/`` (created when
    missing):

      * ``scatter_plot.png`` - `col_name` against `response_var`, no
        regression line;
      * ``density_plot.png`` - distribution of `col_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `col_name` and `response_var`.
    col_name : str
        Column to plot.
    response_var : str
        Column used as the y axis of the scatter plot.

    Returns
    -------
    None
    """
    col_plot_output_folder = os.path.join(top_level_git_dir, "src", "visualizations", col_name)
    os.makedirs(col_plot_output_folder, exist_ok=True)

    # NOTE(review): `size` was renamed to `height` in seaborn 0.9 - confirm
    # which spelling the installed seaborn version accepts.
    scatter_grid = sns.lmplot(x=col_name, y=response_var, fit_reg=False, size=8, data=df)
    scatter_ax = scatter_grid.axes.flatten()[0]
    scatter_ax.set_title(col_name)
    scatter_grid.savefig(os.path.join(col_plot_output_folder, "scatter_plot.png"))
    # Uses `pyplot` rather than `plt.pyplot`, because other cells rebind `plt`.
    # The scatter figure is closed first so the distribution plot below is
    # drawn on a fresh figure.
    pyplot.close(scatter_ax.get_figure())

    density_fig = sns.distplot(df[col_name]).get_figure()
    density_fig.savefig(os.path.join(col_plot_output_folder, "density_plot.png"))
    pyplot.close(density_fig)

    return

def create_eda_plots(df, response_var = 'final_rinse_total_turbidity_liter'):
    """Write the EDA plots for each column of `df`, chosen by dtype.

    "num_phases" is skipped. bool and int64 columns go to
    `get_categorical_plot`, float64 columns go to `get_continuous_plot`, and
    columns of any other dtype get no plot. Each processed column name is
    printed as "-- <name>".
    """
    dtype_by_col = {
        col: df[col].dtype
        for col in df.columns
        if col != "num_phases"
    }

    for col, col_type in dtype_by_col.items():
        print("-- {}".format(col))
        is_categorical = col_type in ["bool", "int64"]
        if is_categorical:
            get_categorical_plot(df, col, col_type, response_var)
        elif col_type in ["float64"]:
            get_continuous_plot(df, col, response_var)
    
    

In [None]:
# Generate and save the EDA plots for the joined feature/response frame.
create_eda_plots(train_features_w_response)

In [None]:
# Scratch check on a single column: draw its distribution, then close the
# current figure.
col_name = "supply_flow_min"
a = sns.distplot(train_features_w_response[col_name])
# `pyplot` (matplotlib.pyplot) must already be imported when this cell runs.
pyplot.close()

In [None]:
# Histogram of the column named by `col_name`, drawn with matplotlib directly.
# `pyplot` comes from the import cell at the top of the notebook. `fig` is now
# the Figure itself; it used to hold the tuple returned by `pyplot.hist`.
fig, hist_ax = pyplot.subplots()
hist_ax.hist(train_features_w_response[col_name], color='blue', edgecolor='black', bins=10, label=col_name)
hist_ax.set_title(col_name)
fig.savefig("pyplot.png")

In [None]:
# Largest value of the column named by `col_name`.
train_features_w_response[col_name].max()

In [None]:
# Smallest value of the column named by `col_name`.
train_features_w_response[col_name].min()