In [2]:
import pipeline as ppl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [3]:
from datetime import datetime

In [4]:
# Load the raw RCRA evaluations extract.
# dtype=str pins every column to text: all eight columns are coded/text fields,
# and EVALUATION_IDENTIFIER holds values such as '001' and '1' that must stay
# distinct. Leaving this to type inference risks mixed-type columns, on which the
# later `.str.strip()` calls would yield NaN for any non-string entries.
evaluations = pd.read_csv("../Data/RCRA_EVALUATIONS.csv", dtype=str)

In [5]:
# Build the per-column metadata table and the numeric summary via the project helpers,
# then print a short plain-text report of the frame's size and contents.
meta = ppl.generate_metadata(evaluations)
summary = ppl.generate_summary(evaluations)

n_observations = len(evaluations)
n_variables = len(meta)  # the metadata table has one row per column

print("evaluations", "\n", "There are", n_observations, "observations of", n_variables, "variables:\n")
print(meta, "\n")
print("Summary of numeric variables\n", summary)

evaluations 
 There are 991196 observations of 8 variables:

                 colname           type  pct_null
0              ID_NUMBER  <class 'str'>       0.0
1      ACTIVITY_LOCATION  <class 'str'>       0.0
2  EVALUATION_IDENTIFIER  <class 'str'>       0.0
3        EVALUATION_TYPE  <class 'str'>       0.0
4        EVALUATION_DESC  <class 'str'>       0.0
5      EVALUATION_AGENCY  <class 'str'>       0.0
6  EVALUATION_START_DATE  <class 'str'>       0.0
7        FOUND_VIOLATION  <class 'str'>       0.0 

Summary of numeric variables
 None


In [6]:
# Frequency of each ACTIVITY_LOCATION code, most common first.
# NOTE(review): the two-letter codes look like US state/territory abbreviations — confirm against the data dictionary.
evaluations['ACTIVITY_LOCATION'].value_counts()

NJ    72342
FL    64264
NC    55070
CO    52943
PA    47917
KY    46963
OH    40566
NY    39556
MI    36534
TN    34782
GA    32370
CA    30782
IL    29827
IN    25694
AL    24155
MA    22560
SC    18382
WA    18341
MO    18289
KS    17868
LA    17814
TX    17799
IA    16630
WV    16156
VA    16004
WI    15310
NV    14001
MN    13196
MT    12422
CT    11275
OR     9568
MD     9498
DC     9492
PR     9319
MS     6772
WY     5789
RI     5766
OK     5724
AR     5576
UT     5215
NM     4691
AZ     4628
NE     4494
ID     4207
VT     3853
DE     2932
ND     2492
HI     2207
SD     2094
AK     2016
NH     1955
ME     1893
GU      560
VI      404
NN      115
MP       80
TT       31
AS       13
Name: ACTIVITY_LOCATION, dtype: int64

In [7]:
# Frequency of each EVALUATION_IDENTIFIER value (high-cardinality: 1272 distinct values in the recorded run).
# NOTE(review): both zero-padded ('001') and unpadded ('1') forms occur as separate values —
# confirm whether they denote the same identifier before using this column as a feature.
evaluations['EVALUATION_IDENTIFIER'].value_counts()

001    455707
000    136739
002     52842
003     25755
CEN     20156
004     17074
CEI     16580
005     12837
NRR     12074
SFQ     11353
006     10178
MD1      9027
007      8508
601      7329
008      7155
009      6079
010      5478
1        5371
100      5156
011      4699
FRR      4415
GM1      4319
012      4120
013      3643
CNV      3427
014      3337
2        3268
200      3039
CV3      3018
015      2992
        ...  
$09         1
722         1
72A         1
$23         1
758         1
748         1
PK8         1
50B         1
PK6         1
SBY         1
762         1
PKD         1
746         1
$26         1
71B         1
PK9         1
W00         1
$35         1
CAO         1
96C         1
AT          1
PKQ         1
BSF         1
723         1
04J         1
Y28         1
87          1
06A         1
759         1
$16         1
Name: EVALUATION_IDENTIFIER, Length: 1272, dtype: int64

In [8]:
# Frequency of each three-letter EVALUATION_TYPE code, most common first.
evaluations['EVALUATION_TYPE'].value_counts()

CEI    531722
FCI    115662
NRR    106236
CSE     57794
FRR     41473
FSD     34196
CAV     29405
SNY     18543
SNN     17079
FUI     15623
CDI      8696
GME      6182
OAM      5031
CAC      3487
NIR        67
Name: EVALUATION_TYPE, dtype: int64

In [9]:
# Frequency of each free-text EVALUATION_DESC value.
# Note: "FACILITY SELF DISCLOSURE" and "Facility Self Disclosure" are counted separately
# because value_counts is case-sensitive; together (25647 + 8549) they match the FSD type count.
evaluations['EVALUATION_DESC'].value_counts()

COMPLIANCE EVALUATION INSPECTION ON-SITE    531722
FOCUSED COMPLIANCE INSPECTION               115662
NON-FINANCIAL RECORD REVIEW                 106236
COMPLIANCE SCHEDULE EVALUATION               57794
FINANCIAL RECORD REVIEW                      41473
COMPLIANCE ASSISTANCE VISIT                  29405
FACILITY SELF DISCLOSURE                     25647
SIGNIFICANT NON-COMPLIER                     18543
NOT A SIGNIFICANT NON-COMPLIER               17079
FOLLOW-UP INSPECTION                         15623
CASE DEVELOPMENT INSPECTION                   8696
Facility Self Disclosure                      8549
GROUNDWATER MONITORING EVALUATION             6182
OPERATION AND MAINTENANCE INSPECTION          5031
CORRECTIVE ACTION COMPLIANCE EVALUATION       3487
NO 3007 INFORMATION REQUEST RECEIVED            67
Name: EVALUATION_DESC, dtype: int64

In [None]:
# Frequency of each single-letter EVALUATION_AGENCY code.
# NOTE(review): the meaning of the codes is not defined in this notebook — see the data dictionary.
evaluations['EVALUATION_AGENCY'].value_counts()

S      903249
E       55272
C       14725
B       11863
X        5540
L         401
T         136
N          10
Name: EVALUATION_AGENCY, dtype: int64

In [None]:
# Peek at the raw date strings; they are stored as text (dtype object) in MM/DD/YYYY form.
evaluations['EVALUATION_START_DATE'].head()

0    02/01/1990
1    09/28/1990
2    03/06/1991
3    03/06/1991
4    09/30/1991
Name: EVALUATION_START_DATE, dtype: object

In [None]:
# Trim leading/trailing whitespace from the coded text columns, overwriting each column in place.
for column in ('FOUND_VIOLATION', 'ACTIVITY_LOCATION', 'EVALUATION_IDENTIFIER',
               'EVALUATION_TYPE', 'EVALUATION_AGENCY'):
    evaluations[column] = evaluations[column].str.strip()

In [None]:
# Binary target: 1 when FOUND_VIOLATION is exactly "Y", 0 for every other value (including missing).
evaluations['VIOLATION_IND'] = (evaluations['FOUND_VIOLATION'] == "Y").astype(int)

In [None]:
# Parse the evaluation start date into a datetime column.
# The raw values are MM/DD/YYYY strings, so the format is given explicitly: this removes
# any month/day ambiguity and avoids slow per-element format guessing on ~1M rows.
# Whitespace is stripped first so padded values are not needlessly coerced to NaT;
# anything that still fails to match the format becomes NaT (errors='coerce').
evaluations['DATE'] = pd.to_datetime(
    evaluations['EVALUATION_START_DATE'].str.strip(),
    format='%m/%d/%Y',
    errors='coerce',
)

In [None]:
# Keep only evaluations dated on or after 1 Jan 2000; rows with an unparsed (NaT) date drop out
# because NaT never satisfies the comparison.
on_or_after_2000 = evaluations['DATE'] >= pd.Timestamp('2000-01-01')
evaluations_date = evaluations.loc[on_or_after_2000]

In [None]:
# Preview the first rows of the date-filtered frame to sanity-check the filter and the derived columns.
evaluations_date.head()

In [None]:
# Categorical predictors used for the exploratory tree; listed once and reused below.
categorical_features = ['ACTIVITY_LOCATION', 'EVALUATION_IDENTIFIER', 'EVALUATION_TYPE', 'EVALUATION_AGENCY']

# Raw categorical columns, their dummy-encoded form (via the project helper), and the binary target.
x = evaluations_date[categorical_features]
x_dummy = ppl.dummify_categorical(x, list(categorical_features))
y = evaluations_date['VIOLATION_IND']

In [None]:
# Fit a decision tree at each max_depth from 1 to 5 on the dummy-encoded features and
# print the features with non-zero importance, most important first.
# (The bare `x_dummy.columns` expression that used to sit in the loop was a no-op and is removed.)
for depth in range(1, 6):
    dtree = ppl.train_decision_tree(x_dummy, y, max_depth=depth)

    # One row per dummy column, paired with the fitted tree's importance for that column.
    feature_importance = pd.DataFrame(
        data={'Features': x_dummy.columns, "Importance": dtree.feature_importances_}
    ).sort_values(by=['Importance'], ascending=False)

    # Drop features the tree never split on.
    feature_importance = feature_importance[feature_importance['Importance'] > 0.0]

    print("Feature importance for decision tree with max_depth", depth, ":\n")
    print(feature_importance)
    print("--------------------------------------------------------")

In [None]:
# Temporal train/test split via the project helper: the arguments are the feature columns,
# the target column, the date column, and the (start, end) date pairs for each split.
split_features = ['ACTIVITY_LOCATION', 'EVALUATION_IDENTIFIER', 'EVALUATION_TYPE', 'EVALUATION_AGENCY']
train_window = ('01/01/2000', '12/31/2015')
test_window = ('01/01/2016', '12/31/2018')

x_train, x_test, y_train, y_test = ppl.create_date_splits(
    evaluations_date, split_features, 'VIOLATION_IND', 'DATE', train_window, test_window
)

In [None]:
# Dummy-encode the categorical predictors for both splits.
dummy_columns = ['ACTIVITY_LOCATION', 'EVALUATION_IDENTIFIER', 'EVALUATION_TYPE', 'EVALUATION_AGENCY']
x_train = ppl.dummify_categorical(x_train, dummy_columns)
x_test = ppl.dummify_categorical(x_test, dummy_columns)

# The two splits are encoded independently, so a category that occurs in only one of them
# (likely for EVALUATION_IDENTIFIER, which has many rare levels) yields mismatched columns.
# Align the test matrix to the training columns: categories unseen in training are dropped,
# categories absent from the test period are added as all-zero columns, and order matches.
# NOTE(review): assumes ppl.dummify_categorical returns a pandas DataFrame — confirm.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

In [37]:
# Run the decision-tree parameter sweep and show the ten configurations with the highest AUC.
# NOTE(review): the recorded run of this cell failed with
# "module 'pipeline' has no attribute 'decision_tree_loop'" — confirm the helper exists in
# pipeline.py (and that the kernel has the current version loaded) before relying on it.
train_period = '01/01/2000-12/31/2015'
test_period = '01/01/2016-12/31/2018'
criteria = ['gini', 'entropy']
splitters = ['best', 'random']
depth_grid = [1, 2, 3, 5, 10, None]
# Meaning of this grid depends on decision_tree_loop's signature, which is not visible here.
float_grid = [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5]

dtree_summary_1 = ppl.decision_tree_loop(
    x_train, y_train, x_test, y_test,
    train_period, test_period,
    criteria, splitters, depth_grid, float_grid,
)

# Rank every configuration by AUC (best first) and keep the top ten rows.
all_dtree = dtree_summary_1.sort_values('auc', ascending=False)
top_dtree = all_dtree.iloc[:10]
print(top_dtree)

AttributeError: module 'pipeline' has no attribute 'decision_tree_loop'