In [2]:
import pandas as pd
import numpy as np
import pyodbc

# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline # needed in order to include SMOTE preprocessing step(won't work with regular pipeline)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, PowerTransformer, RobustScaler, KBinsDiscretizer
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, classification_report, precision_recall_curve, roc_auc_score, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

from sklearn.decomposition import PCA
#from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
#from sklearn.naive_bayes import MultinomialNB, GaussianNB


In [3]:
connection = pyodbc.connect('DSN=Teradata_Prod')

sql = """select
    A.ord_fr_supl_id as supl_id,
	A.purch_doc_nbr,
    A.purch_doc_line_nbr,
    A.purch_org_cd, 
    A.supl_mtrl_nbr, 
	P.Last30dayQty,
	--A.supplier_tender_date,
	A.stats_dlvr_dt,
	A.dlvr_dt,
    coalesce(A.purch_doc_dlvr_chg_dt, to_date('2099-12-31')) as purch_doc_dlvr_chg_dt,
    A.dlvr_dt-A.stats_dlvr_dt as dlvr_date_diff,
    A.purch_line_net_usd_amt, 
    A.mtrl_grp_cd,
    A.plnt_nm, 
    A.plnt_id,
    A.strg_loc_id,
    A.mtrl_id,
	A.purch_doc_cre_uid,
	A.purch_grp_cd,
	purch_line_last_rcpt_dt,
	A.purch_line_rcv_sts_cd,
    B.itm_dsc,
	A.shp_cond_cd,
	Cast(A.purch_line_cre_ts as Date) POCreate,
	--A.dlvr_dt - a.prcss_dt as LeadTime,
	Cast(A.purch_line_frst_authr_ts as Date) as purch_line_frst_authr_dt,
	to_char(A.dlvr_dt, 'Month') as MonthDue,
	'Week '||to_char(A.dlvr_dt, 'iw') as WeekDue,
	A.tchls_ind,
	A.cfm_ctrl_cd,
	CASE WHEN C.ChangeCnt > 0 THEN C.ChangeCnt ELSE 0 END as ChangeCnt,
    
	CASE WHEN e.EventCnt > 0 THEN e.EventCnt ELSE 0 End as EventCnt,
    A.purch_doc_line_qty,
    A.purch_line_aprv_tpt_day_cnt,
    case when coalesce(A.purch_doc_dlvr_chg_dt, to_date('2099-12-31')) = '2099-12-31' then 'N' else 'Y' end as dlvr_chg_ind,
    to_char(A.purch_line_cre_dt, 'iw') POCreateWW,
    to_char(A.purch_line_cre_dt, 'Month') as POCreateMM,
    A.high_lvl_src_org_cd,
    A.frst_seq_gl_acct_char_nbr,
    A.purch_line_gds_rcpt_late_ind as IsLate,
    S.ctry_nm as supl_ctry_nm,
	PH.inco_term_cd,
	PH.inco_term_dsc_2 as inco_term_dsc,
	CASE WHEN Carrier.Carrier IS NULL THEN 'UNKNOWN' ELSE Carrier.Carrier END as Carrier
from Procurement_Analysis.v_fact_purch_ord_line A
left join Procurement_Analysis.v_dim_itm B
on B.itm_id=A.mtrl_id

left join Factory_Materials_Analysis.v_dim_fctry_item_strg_loc F
on A.mtrl_id = F.itm_id and A.strg_loc_id = F.strg_loc_cd --and A.ord_fr_supl_id = F.curr_supl_id
left join (
select A.ord_fr_supl_id, A.supl_mtrl_nbr, B.purch_line_frst_authr_dt, sum(purch_doc_line_qty) as Last30dayQty from Procurement_Analysis.v_fact_purch_ord_line A
join (select ord_fr_supl_id, supl_mtrl_nbr,  purch_line_frst_authr_dt 
from Procurement_Analysis.v_fact_purch_ord_line
where purch_line_frst_authr_dt >= trunc(add_months(current_date,-24), 'MM') AND Extract (YEAR From purch_line_frst_authr_dt) NOT IN (9999)) B
on  A.purch_line_frst_authr_dt between B.purch_line_frst_authr_dt - interval '30' day AND B.purch_line_frst_authr_dt
and A.ord_fr_supl_id = B.ord_fr_supl_id
and A.supl_mtrl_nbr = B.supl_mtrl_nbr
where A.purch_doc_nbr like '7%'
and A.strg_loc_id In ('24','26','33','107','167','189','193','194','188','210')
and A.purch_doc_line_del_ind not in ('Y')
and A.purch_line_rcv_sts_cd not in ('NON RECEIV')
group by A.ord_fr_supl_id,  A.supl_mtrl_nbr, B.purch_line_frst_authr_dt) P
on A.ord_fr_supl_id = P.ord_fr_supl_id
and A.purch_line_frst_authr_dt = P.purch_line_frst_authr_dt
and A.supl_mtrl_nbr = P.supl_mtrl_nbr
left join (
	select
	sr.shp_ref_nbr,
	count(e.evnt_cd) as EventCnt,
	min(CAST(utc_bol_evnt_dt AS TIMESTAMP(0)) + (utc_bol_evnt_tm - TIME '00:00:00' HOUR TO SECOND)) as MinEventDate

	from shipment.v_bol_ref_TRK sr
	left join shipment.v_bol_hdr_TRK sh
	on sr.bol_id=sh.bol_id
	left join  shipment.v_bol_evnt_TRK e
	on e.bol_id = sh.bol_id
	where e.evnt_cd in ('HIC','ADQ','CAN')
	and sr.shp_ref_nbr like '7%'
	and length(sr.shp_ref_nbr)=10
	group by sr.shp_ref_nbr
) e
on e.shp_ref_nbr = A.purch_doc_nbr

left join (
	select
	C.purch_doc_nbr PONbr,
	C.purch_doc_line_nbr POLineNbr,
	count(*) ChangeCnt
	from Procurement_Analysis.v_fact_pr_po_chg_mntr C
	 left outer Join   (
	 select *
	 from Procurement_Analysis.v_lkup_pr_po_chg_type
	 where  rpt_type_cd IN ('PO')) T
	ON (C.chg_nm_id=T.chg_nm_id)
	JOIN (
	SELECT * FROM Procurement_Analysis.v_fact_purch_ord_line
	   where  purch_line_frst_authr_dt >= trunc(add_months(current_date,-24), 'MM') AND Extract (YEAR From purch_line_frst_authr_dt) NOT IN (9999)) PO
	ON (PO.purch_doc_nbr = C.purch_doc_nbr and PO.purch_doc_line_nbr = C.purch_doc_line_nbr
	--and C.usr_nm NOT IN ('XI_CAPITAL','WF-BATCH','WF_BATCH','XI_BPMS')
	)
	where C.rpt_type_cd in ('PO') and C.purch_doc_line_nbr<>'*'
	group by C.purch_doc_nbr,C.purch_doc_line_nbr
) C
on (A.purch_doc_nbr = C.PONbr and A.purch_doc_line_nbr = C.POLineNbr)
left join Procurement_Analysis.v_dim_supl_geo as S
on (S.supl_id = A.ord_fr_supl_id)

left join Procurement_Analysis.v_fact_purch_ord_hdr as PH
on (A.purch_doc_nbr = PH.purch_doc_nbr)
left join
(
select s.shp_ref_nbr,TRIM(TRAILING ',' FROM (XMLAGG(trim(s.carr_cd) || ','
        ORDER BY s.carr_cd desc
   ) (VARCHAR(500)))) Carrier
from (
select distinct sr.shp_ref_nbr,sh.carr_cd
from shipment.v_bol_ref_TRK sr
left join shipment.v_bol_hdr_TRK sh
on sr.bol_id=sh.bol_id
left join  shipment.v_bol_evnt_TRK e
on e.bol_id = sh.bol_id
where sr.shp_ref_nbr in
(
select distinct A.purch_doc_nbr
from Procurement_Analysis.v_fact_purch_ord_line A
where A.purch_line_frst_authr_dt >= trunc(add_months(current_date,-24), 'MM') AND Extract (YEAR From A.purch_line_frst_authr_dt) NOT IN (9999)
and A.purch_doc_nbr like '7%'
and A.strg_loc_id In ('24','26','33','107','167','189','193','194','188','210')
and A.purch_doc_line_del_ind not in ('Y')
and A.purch_line_rcv_sts_cd not in ('NON RECEIV')
)
) s
group by s.shp_ref_nbr
) Carrier
on (Carrier.shp_ref_nbr = A.purch_doc_nbr)
where A.purch_line_frst_authr_dt >= trunc(add_months(current_date,-24), 'MM') AND Extract (YEAR From A.purch_line_frst_authr_dt) NOT IN (9999)
and A.purch_doc_nbr like '7%'
and A.strg_loc_id In ('24','26','33','107','167','189','193','194','188','210')
and A.purch_doc_line_del_ind not in ('Y')
and A.purch_line_rcv_sts_cd not in ('NON RECEIV')
and A.purch_line_rcv_sts_cd = 'complete'"""
data = pd.read_sql(sql,connection)

connection.close()

In [4]:
connection = pyodbc.connect('DSN=TDPRD-APPL')

sql1 = """select itm_id, strg_loc_cd, lead_tm_day_cnt,rplnsh_plcy_nm, rplnsh_mthd_nm, rpr_type_nm, crtcl_itm_ind, cnsgn_ind
from Factory_Materials_Analysis.v_dim_fctry_item_strg_loc 
where strg_loc_cd In ('24','26','33','107','167','189','193','194','188','210')"""

data1 = pd.read_sql(sql1,connection)

connection.close()

In [5]:
df = pd.merge(data, data1,  how='inner', left_on=['mtrl_id','strg_loc_id'], right_on = ['itm_id','strg_loc_cd'])

In [6]:
df['IsLate_num'] = np.where(df['IsLate']=='N',0,1)

In [9]:
df = df[(df.Carrier) != 'UNKNOWN']

In [10]:
#Log Transform
df['purch_line_aprv_tpt_day_cnt_log'] = np.log((1+ df['purch_line_aprv_tpt_day_cnt']))
df['Last30dayQty_log'] = np.log((1+ df['Last30dayQty']))
df['purch_line_net_usd_amt_log'] = np.log((1+ df['purch_line_net_usd_amt']))
df['purch_doc_line_qty_log'] = np.log((1+ df['purch_doc_line_qty']))

In [11]:
#BOXCOX Transform
from scipy import stats

qty = np.array(df['Last30dayQty'])
qty_clean = qty[~np.isnan(qty)]
l, opt_lambda = stats.boxcox(qty_clean)
print('Optimal lambda value:', opt_lambda)

df['Last30dayQty_boxcox'] = stats.boxcox(
                                            df['Last30dayQty'], 
                                              lmbda=opt_lambda)

Optimal lambda value: -0.16325146077601782


In [None]:
df['lead_tm_day_cnt'].plot(kind='box')

In [None]:
#df_hist = df_training['PO_Count']

df['lead_tm_day_cnt'].hist(bins=10)


In [12]:
quantile_list = [0, .25, .5, .75, 1.]
quantiles = df['lead_tm_day_cnt'].quantile(quantile_list)

In [13]:
quantile_labels = ['0-25Q', '25-50Q', '50-75Q', '75-100Q']
df['lead_tm_day_cnt_QR'] = pd.qcut(
                                            df['lead_tm_day_cnt'], 
                                            q=quantile_list)
df['lead_tm_day_cnt_QL'] = pd.qcut(
                                            df['lead_tm_day_cnt'], 
                                            q=quantile_list,       
                                            labels=quantile_labels)


In [None]:
df.describe()

#Binning
bin_ranges = [0, 250, 500, 750, 1000, 1250, 1500, 1750, 2000]
bin_names = [1, 2, 3, 4, 5, 6,7,8]

df['Last30dayQty_bin'] = pd.cut(
                                           np.array(
                                              df['Last30dayQty']), 
                                              bins=bin_ranges,            
                                              labels=bin_names)

In [14]:
df.isnull().sum()

supl_id                            0
purch_doc_nbr                      0
purch_doc_line_nbr                 0
purch_org_cd                       0
supl_mtrl_nbr                      0
Last30dayQty                       0
stats_dlvr_dt                      0
dlvr_dt                            0
purch_doc_dlvr_chg_dt              0
dlvr_date_diff                     0
purch_line_net_usd_amt             0
mtrl_grp_cd                        0
plnt_nm                            0
plnt_id                            0
strg_loc_id                        0
mtrl_id                            0
purch_doc_cre_uid                  0
purch_grp_cd                       0
purch_line_last_rcpt_dt            0
purch_line_rcv_sts_cd              0
itm_dsc                            0
shp_cond_cd                        0
POCreate                           0
purch_line_frst_authr_dt           0
MonthDue                           0
WeekDue                            0
tchls_ind                          0
c

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12354 entries, 466 to 176955
Data columns (total 58 columns):
supl_id                            12354 non-null object
purch_doc_nbr                      12354 non-null object
purch_doc_line_nbr                 12354 non-null object
purch_org_cd                       12354 non-null object
supl_mtrl_nbr                      12354 non-null object
Last30dayQty                       12354 non-null float64
stats_dlvr_dt                      12354 non-null object
dlvr_dt                            12354 non-null object
purch_doc_dlvr_chg_dt              12354 non-null object
dlvr_date_diff                     12354 non-null int64
purch_line_net_usd_amt             12354 non-null float64
mtrl_grp_cd                        12354 non-null object
plnt_nm                            12354 non-null object
plnt_id                            12354 non-null object
strg_loc_id                        12354 non-null object
mtrl_id                         

In [35]:
Features = ['supl_id','supl_mtrl_nbr','purch_grp_cd','purch_line_net_usd_amt_log','lead_tm_day_cnt_QL',
             'ChangeCnt', 'strg_loc_id', 'dlvr_chg_ind','Last30dayQty_log', 
'POCreateWW', 'WeekDue','Carrier','purch_line_aprv_tpt_day_cnt_log', 'IsLate_num']
#'purch_org_cd', 'cfm_ctrl_cd',
df_training = df[Features]

'crtcl_itm_ind','rplnsh_plcy_nm', 'rplnsh_mthd_nm', 'rpr_type_nm',

In [36]:

numeric_features = df_training.select_dtypes(include=np.number).drop(['IsLate_num'], axis=1).columns#.tolist()

categorical_features = df_training.select_dtypes(include=['object']).columns


In [18]:
# Use this to determine strength of correlations between numeric features and the dependent variable.

corr = df_training[numeric_features.tolist()+['IsLate_num']]
corr['IsLate_num']=(corr['IsLate_num']==1).astype(int)

correl = corr.corr()

correl.style.background_gradient(cmap='coolwarm')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,purch_line_net_usd_amt_log,ChangeCnt,Last30dayQty_log,purch_line_aprv_tpt_day_cnt_log,IsLate_num
purch_line_net_usd_amt_log,1.0,-0.106568,0.107269,0.0092842,0.0309371
ChangeCnt,-0.106568,1.0,-0.0548088,0.264783,0.185585
Last30dayQty_log,0.107269,-0.0548088,1.0,-0.0672014,-0.124963
purch_line_aprv_tpt_day_cnt_log,0.0092842,0.264783,-0.0672014,1.0,0.23337
IsLate_num,0.0309371,0.185585,-0.124963,0.23337,1.0


In [20]:
df_training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12354 entries, 466 to 176955
Data columns (total 20 columns):
supl_id                            12354 non-null object
supl_mtrl_nbr                      12354 non-null object
purch_org_cd                       12354 non-null object
purch_line_net_usd_amt_log         12354 non-null float64
lead_tm_day_cnt_QL                 12354 non-null category
ChangeCnt                          12354 non-null int64
strg_loc_id                        12354 non-null object
cnsgn_ind                          12354 non-null object
dlvr_chg_ind                       12354 non-null object
Last30dayQty_log                   12354 non-null float64
supl_ctry_nm                       12354 non-null object
purch_grp_cd                       12354 non-null object
cfm_ctrl_cd                        12354 non-null object
POCreateWW                         12354 non-null object
WeekDue                            12354 non-null object
Carrier                       

In [37]:
X = df_training.drop(['IsLate_num'], axis=1)
y = df_training[['IsLate_num']]

In [38]:

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('pca', PCA(n_components = 2, whiten=True, svd_solver = 'arpack')),
    #('scaler', RobustScaler(with_scaling=True))
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [39]:
# use the ColumnTransformer to apply the transformations to the correct columns in the dataframe


from sklearn.compose import ColumnTransformer
#from future_encoders import ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(data[numeric_features])
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
X_1 = data.drop(data[numeric_features], axis=1)
data = pd.concat([principalDf,X_1],axis=1,join_axes=[X_1.index])


In [None]:
#'LeadTime','purch_line_net_usd_amt', 'ChangeCnt','purch_doc_line_qty','dlvr_date_diff', 'purch_line_aprv_tpt_day_cnt',

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=12)

In [41]:
rfc=RandomForestClassifier(n_jobs=-1,verbose=2,criterion='gini',max_features='auto',n_estimators=300)

In [42]:
param_grid = {
    'n_estimators' : [300],
    'max_features' : ['auto']
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

In [43]:
    skf = StratifiedKFold(n_splits=3)
    grid_search = GridSearchCV(rfc,
                               param_grid, 
                               scoring=scorers,
                               refit='recall_score',
                           cv=skf, n_jobs=-1)
 

In [44]:
rf_pl = Pipeline([
    ('preprocessor', preprocessor),
    #('upsampler', SMOTE(k_neighbors=2)),
    ('sampler', RandomOverSampler(random_state=12)),
    ('model', grid_search)
    ])



In [45]:
rf_pl.fit(X_train, y_train.values.ravel())

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 300building tree 2 of 300

building tree 3 of 300building tree 4 of 300building tree 5 of 300
building tree 6 of 300

building tree 7 of 300building tree 8 of 300


building tree 9 of 300
building tree 10 of 300
building tree 11 of 300
building tree 12 of 300
building tree 13 of 300building tree 14 of 300

building tree 15 of 300
building tree 16 of 300
building tree 17 of 300
building tree 18 of 300
building tree 19 of 300
building tree 20 of 300
building tree 21 of 300
building tree 22 of 300
building tree 23 of 300
building tree 24 of 300
building tree 25 of 300
building tree 26 of 300
building tree 27 of 300
building tree 28 of 300
building tree 29 of 300building tree 30 of 300

building tree 31 of 300building tree 32 of 300

building tree 33 of 300building tree 34 of 300

building tree 35 of 300

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.7s



building tree 36 of 300
building tree 37 of 300
building tree 38 of 300
building tree 39 of 300
building tree 40 of 300
building tree 41 of 300
building tree 42 of 300
building tree 43 of 300
building tree 44 of 300
building tree 45 of 300
building tree 46 of 300
building tree 47 of 300
building tree 48 of 300
building tree 49 of 300
building tree 50 of 300
building tree 51 of 300
building tree 52 of 300
building tree 53 of 300
building tree 54 of 300
building tree 55 of 300
building tree 56 of 300
building tree 57 of 300
building tree 58 of 300
building tree 59 of 300
building tree 60 of 300
building tree 61 of 300
building tree 62 of 300
building tree 63 of 300
building tree 64 of 300
building tree 65 of 300
building tree 66 of 300
building tree 67 of 300
building tree 68 of 300
building tree 69 of 300
building tree 70 of 300
building tree 71 of 300
building tree 72 of 300
building tree 73 of 300
building tree 74 of 300
building tree 75 of 300
building tree 76 of 300
building tree 7

[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    3.5s



building tree 165 of 300
building tree 166 of 300
building tree 167 of 300
building tree 168 of 300
building tree 169 of 300
building tree 170 of 300
building tree 171 of 300
building tree 172 of 300
building tree 173 of 300
building tree 174 of 300
building tree 175 of 300
building tree 176 of 300
building tree 177 of 300
building tree 178 of 300
building tree 179 of 300
building tree 180 of 300
building tree 181 of 300
building tree 182 of 300
building tree 183 of 300
building tree 184 of 300
building tree 185 of 300
building tree 186 of 300
building tree 187 of 300building tree 188 of 300

building tree 189 of 300
building tree 190 of 300
building tree 191 of 300
building tree 192 of 300
building tree 193 of 300
building tree 194 of 300
building tree 195 of 300
building tree 196 of 300
building tree 197 of 300
building tree 198 of 300
building tree 199 of 300
building tree 200 of 300
building tree 201 of 300
building tree 202 of 300
building tree 203 of 300
building tree 204 of 300

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.1s finished


Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [46]:

rf_predictions= rf_pl.predict(X_test)


print(confusion_matrix(y_test, rf_predictions))
#print(roc_curve(y_test, rf_predictions))

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.0s finished


[[2397  215]
 [ 287  808]]


In [47]:
y_scores=rf_pl.predict_proba(X_test)[:, 1]

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.0s finished


In [48]:
def adjusted_classes(y_scores, t):
    """
    This function adjusts class predictions based on the prediction threshold (t).
    Will only work for binary classification problems.
    """
    return [1 if y >= t else 0 for y in y_scores]

In [49]:
y_pred_adj = adjusted_classes(y_scores, t=0.44)
print(confusion_matrix(y_test, y_pred_adj))
print(classification_report(y_test, y_pred_adj))

[[2302  310]
 [ 242  853]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2612
           1       0.73      0.78      0.76      1095

    accuracy                           0.85      3707
   macro avg       0.82      0.83      0.82      3707
weighted avg       0.85      0.85      0.85      3707



In [None]:
rf_predictProb=rf_pl.predict_proba(X_test)

In [None]:
df1 = pd.DataFrame(X_test)
df1['Target'] = y_test
df1['prob'] = rf_predictProb[:,0]  
df1['predicted'] = rf_predictions
df1['purch_doc_nbr'] = data.purch_doc_nbr
df1['purch_doc_line_nbr'] = data.purch_doc_line_nbr
df1['purch_line_frst_authr_dt'] = data.purch_line_frst_authr_dt
df1['dlvr_dt'] = data.dlvr_dt



In [None]:
X_test.to_csv(r'C:\Users\prathaki\Documents\SCDA\MyWork\AT IDM OTD\atspares_idm_otd_results.csv', index=False)

rf_pl = Pipeline([
    ('preprocessor', preprocessor),
    #('upsampler', SMOTE(k_neighbors=2)),
    #('sampler', RandomOverSampler(random_state=12)),
    ('model',GaussianNB(priors=[0.8,0.2]))
    ])

rf_pl.fit(X_train.to_xarray(), y_train.values.ravel())

from sklearn.ensemble import RandomForestClassifier

rf_pl = Pipeline([
    ('preprocessor', preprocessor),
    #('upsampler', SMOTE(k_neighbors=2)),
    ('sampler', RandomOverSampler(random_state=12)),
    ('RandomForestClassifier', RandomForestClassifier(#class_weight='balanced_subsample',
                                                      verbose=2,
                                                      oob_score = True,
                                                      criterion = 'entropy',
                                                      n_estimators=32,
                                                      n_jobs=-1))
])

rf_pl.fit(X_train, y_train.values.ravel())

In [None]:
import tpot

tpot_config = {
    'sklearn.ensemble.RandomForestClassifier': {
        'criterion':['gini'],
        'oob_score':[True],
        'n_estimators':[32]
    },
}

tpot_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('upsampler', RandomOverSampler(random_state=8)),
    ('TPOTClassifier', tpot.TPOTClassifier(#config_dict='TPOT sparse',
                                           scoring='recall_weighted',
                                           n_jobs=-1,
                                           verbosity=2,
                                           cv=3,
                                           generations=50, # 100 is default
                                           population_size=200, # 100 is default
                                           config_dict=tpot_config
                                          ))
])
    
tpot_pipeline.fit(X_train, y_train.values.ravel())
tpot_predictions= tpot_pipeline.predict(X_test)

print(classification_report(y_test, tpot_predictions))
print(confusion_matrix(y_test, tpot_predictions))

In [None]:
# Use this to compare each feature against the dependent variable, IsLate to visually determine its importance. It shows the proportion of Late to OnTime orders.

props = df_training.groupby("Carrier")['IsLate'].value_counts(normalize=True).unstack()
props.plot(kind='bar', stacked='True', figsize=(18,5))

In [None]:
# Use this to compare each feature against the dependent variable, IsLate to visually determine its importance. It shows the proportion of Late to OnTime orders.

props = df_training.groupby("supl_ctry_nm")['IsLate'].value_counts(normalize=True).unstack()
props.plot(kind='bar', stacked='True', figsize=(18,5))

In [None]:
#df_predictions[(df_predictions['Prediction']=='Late') & (df_predictions.purch_line_rcv_sts_cd.str.strip() =='OPEN')]

In [33]:
from sklearn.feature_selection import RFE
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder
df_encoded = df_training[categorical_features].apply(LabelEncoder().fit_transform)
y_encoded = df_training[['IsLate_num']].apply(LabelEncoder().fit_transform)



# Feature extraction
model = RandomForestClassifier(n_jobs=-1)
#model = LogisticRegression(n_jobs=-1)
rfe = RFE(model, 1)
#fit = rfe.fit(X[numeric_features], y)
fit = rfe.fit(df_encoded, y_encoded)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (categorical_features[fit.support_]).tolist())
print("All Features: %s" % (categorical_features.tolist()))
print("All Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

In [34]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
#Suppose, we select 5 features with top 5 Fisher scores
selector = SelectKBest(chi2, k = 'all')
#selector= SelectKBest(score_func=mutual_info_classif, k='all')
#New dataframe with the selected features for later use in the classifier. fit() method works too,
# if you want only the feature names and their corresponding scores
X_new = selector.fit_transform(df_encoded, y_encoded)
names = df_encoded.columns.values[selector.get_support()]
scores = selector.scores_[selector.get_support()]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'chi2_Scores'])
#Sort the dataframe for better visualization
ns_df_sorted = ns_df.sort_values(['chi2_Scores', 'Feat_names'], ascending = [False, True])
print(ns_df_sorted.round(1))

                   Feat_names  chi2_Scores
1               supl_mtrl_nbr       8867.1
0                     supl_id       4101.3
7                purch_grp_cd        863.7
11                    Carrier        689.4
5                dlvr_chg_ind        339.4
10                    WeekDue        206.9
3                 strg_loc_id        195.1
9                  POCreateWW        114.9
2                purch_org_cd         15.7
8                 cfm_ctrl_cd         15.3
13               inco_term_cd          2.2
12  frst_seq_gl_acct_char_nbr          2.1
4                   cnsgn_ind          0.5
6                supl_ctry_nm          0.0


p_values = pd.Series(ns_df['chi2_Scores'],index = X.columns)
p_values.sort_values(ascending = False , inplace = True)
p_values.plot.bar()

In [None]:

# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from ipykernel import kernelapp as app

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

RFclf=RandomForestClassifier(n_estimators=100, max_depth=50, class_weight = 'balanced_subsample', bootstrap=False, criterion='gini', max_features='auto', min_impurity_decrease=0.001)

#Train the model using the training sets 
RFclf.fit(X_train,y_train)



In [None]:
y_pred=RFclf.predict(X_test)

In [None]:
print("Accuracy:",metrics.classification_report(y_test, y_pred))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

rf_pipeline = Pipeline([
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, max_depth=50, class_weight = 'balanced_subsample', bootstrap=False, criterion='gini', max_features='auto', min_impurity_decrease=0.001))
])

rf_pipeline.fit(X_train, y_train.values.ravel())
rf_predictions= rf_pipeline.predict(X_test)

print(classification_report(y_test, rf_predictions))
print(confusion_matrix(y_test, rf_predictions))
