In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scsavailability as scs
    
import statsmodels.api as sm
from scipy import stats

from scsavailability import features as feat, model as md, plotting as pt, score as sc

In [34]:
# Folder (relative to this notebook) that holds the raw CSV extracts.
path = "../raw_data/"

In [35]:
# Read the three raw extracts (active totes, availability, faults) from `path`.
at_raw = pd.read_csv(f'{path}active_totes_20201210.csv')
av = pd.read_csv(
    f'{path}Availability_with_Grey&Blue_1811-0912.csv',
    # Column names are supplied explicitly for this extract.
    names=[
        "timestamp",
        "Pick Station",
        "Availability",
        "Blue Tote Loss",
        "Grey Tote Loss",
    ],
)
scs_raw = pd.read_csv(f'{path}Faults20_11-10_12.csv')

In [36]:
# at_raw = pd.read_csv(path+'active_totes_20201123.csv')
# av = pd.read_csv(path+'Availability_with_Grey&Blue_1811-2511.csv')
# #fa = pd.read_csv(path + 'scs_tote_matched.csv')
# scs_raw = pd.read_csv('../raw_data/SCS alerts Nov.csv')

In [37]:
# # HOTFIX - remove odd day
# at_raw = at_raw[~at_raw['DAY'].isin([9])]

In [38]:
# Run feat.add_code on the raw faults, then feat.add_tote_colour on its result.
# add_tote_colour returns two values: the faults frame and `unmapped`.
fa_coded = feat.add_code(scs_raw)
fa, unmapped = feat.add_tote_colour(fa_coded)

Running: Function "load_tote_lookup" (id=2520145595120) was called 4 times


In [39]:
# Pre-process the raw active-totes extract; `at` is what the model cells consume.
at = feat.pre_process_AT(at_raw)

Running: Function "pre_process_AT" (id=2520145595984) was called 3 times


In [40]:
# Pre-process the availability extract.
# NOTE(review): `av` is rebound from itself, so this cell is not idempotent --
# re-running it without re-running the CSV load cell passes already-processed
# data back into pre_process_av.
av = feat.pre_process_av(av)

Running: Function "pre_process_av" (id=2520145596272) was called 2 times


In [41]:
# Pre-process the faults frame. With remove_same_location_faults=True the helper
# reports "duplicated location faults removed - max duration kept" (see cell output).
# NOTE(review): `fa` is rebound from itself, so re-running this cell alone
# re-processes an already processed frame.
fa = feat.preprocess_faults(fa,remove_same_location_faults = True)

Running: Function "preprocess_faults" (id=2520145594544) was called 2 times
Running: Function "load_module_lookup" (id=2520145593248) was called 2 times
Running: Function "load_ID_lookup" (id=2520145593968) was called 2 times
duplicated location faults removed - max duration kept
HOTFIX: Quadrant only faults, PTT Asset Code update


In [42]:
# Build the faults frame used for modelling. shift=0 means no time shift is
# applied (the helper reports "Time shifted by 0Minutes").
# NOTE(review): presumably this also floors timestamps to the aggregation
# interval -- confirm against feat.floor_shift_time_fa.
fa_floor = feat.floor_shift_time_fa(fa, shift=0)

Time shifted by 0Minutes


In [43]:
def run_m(module, fa_floor, av, at, min_fault_count=0, p_threshold=0.10):
    """
    Summary
    -------
    Runs a module-level linear model and outputs the "significant" features
    (asset codes) that have a negative coefficient.
    1. Select module-related faults
    2. Filter out rare asset codes
    3. Fit model
    4. Remove features with high p-values
    5. Re-fit model
    6. Report negative coefficients

    Parameters
    ----------
    module: int
        module number
    fa_floor: pandas DataFrame
        formatted faults data
    av: pandas DataFrame
        formatted availability data
    at: pandas DataFrame
        formatted active totes data
    min_fault_count: int, default 0
        an asset code is kept only when it has strictly more than this many
        fault rows for the module (the default reproduces the previous
        hard-coded limit)
    p_threshold: float, default 0.10
        features whose p-value in the first OLS fit is below this value are
        kept for the re-fit (the default reproduces the previous hard-coded
        cut-off)

    Returns
    -------
    df_r: pandas DataFrame
        one row per feature with a negative coefficient in the re-fitted
        model; columns are the reset index (feature name), 'Coefficient'
        and 'Module'

    Example
    --------
    df_r = run_m(2, fa_floor, av, at)
    """

    mod = [str(module)]
    module_faults = feat.get_data_faults(fa_floor, modules=mod)

    # Count faults per asset code once and keep the codes above the limit.
    code_counts = module_faults['Asset Code'].value_counts()
    frequent_codes = code_counts[code_counts > min_fault_count].index
    fa_sel = module_faults[module_faults['Asset Code'].isin(frequent_codes)]

    fa_agg = feat.faults_aggregate(fa_sel, fault_agg_level='Asset Code', agg_type='count')

    av_sel, at_sel = feat.av_at_select(
        av, at, remove_high_AT=True, availability_select_options={"Module": mod}
    )

    av_agg = feat.aggregate_availability(av_sel, agg_level='Module')
    at_agg = feat.aggregate_totes(at_sel, agg_level='Module')

    df = feat.merge_av_fa_at(av_agg, at_df=at_agg, fa_df=fa_agg, agg_level='None')
    X, y = md.gen_feat_var(df)
    X = X.drop('Module', axis=1)
    # Flip to availability
    y = 1 - y

    # The scikit-learn style fit and cross-validation are run for their
    # reported diagnostics only; their return values do not feed df_r.
    X_train, X_test, y_train, y_test = md.split(X, y)
    Linear_mdl, _, _, _ = md.run_LR_model(X_train, X_test, y_train, y_test)
    md.cross_validate_r2(Linear_mdl, X, y, n_folds=5, shuffle=True, random_state=101)

    # First OLS fit: used only to pick the features below the p-value cut-off.
    first_fit = sm.OLS(y, X).fit()
    keep_features = first_fit.pvalues[first_fit.pvalues < p_threshold].index

    # Second OLS fit on the retained features; report the negative coefficients.
    results = sm.OLS(y, X[keep_features]).fit()
    negs = results.params[results.params < 0]
    df_r = pd.DataFrame(negs, columns=['Coefficient']).reset_index()
    df_r['Module'] = module
    return df_r

In [44]:
# Trial run for module 1; `a` holds the coefficient frame returned by run_m.
a = run_m(1, fa_floor, av, at)

Running: Function "load_PTT_lookup" (id=2520145594832) was called 3 times
Running: Function "aggregate_availability" (id=2520145594976) was called 2 times

Cross Validation Scores LinearRegression(): 
 
          R2 Scores
1    -3.260474e-01
2    -1.139965e+00
3    -8.046769e+20
4     1.345740e-01
5    -1.041974e+00
Mean -1.609354e+20
STD   3.218708e+20


In [45]:
# Fit the module model for each requested module and stack the results.
RESULT_COLUMNS = ['index', 'Coefficient', 'Module']
module_results = []
for i in range(1, 2):  # currently only module 1
    try:
        module_results.append(run_m(i, fa_floor, av, at))
    except Exception as err:
        # Name the module that failed and the reason, then carry on.
        print(f'Missing Module: {i} ({type(err).__name__}: {err})')

# Concatenate once instead of growing the frame inside the loop
# (DataFrame.append was removed in pandas 2.0).
if module_results:
    full_df = pd.concat(module_results, ignore_index=True)
else:
    # No module produced a result: keep the expected (empty) schema.
    full_df = pd.DataFrame(columns=RESULT_COLUMNS)

Running: Function "load_PTT_lookup" (id=2520145594832) was called 4 times
Running: Function "aggregate_availability" (id=2520145594976) was called 3 times

Cross Validation Scores LinearRegression(): 
 
          R2 Scores
1    -3.260474e-01
2    -1.139965e+00
3    -8.046769e+20
4     1.345740e-01
5    -1.041974e+00
Mean -1.609354e+20
STD   3.218708e+20


In [46]:
# Display the collected negative coefficients (columns: index, Coefficient, Module).
full_df

Unnamed: 0,index,Coefficient,Module
0,C0502STA030,-0.105686,1
1,C1603RDC143,-0.097833,1
2,C2303ACH218,-0.06341,1


### Take last day of faults

In [None]:
# Number of distinct fault IDs per module (first rows only).
(
    fa.groupby('MODULE')['Fault ID']
    .nunique()
    .reset_index()
    .head()
)

In [None]:
# Keep only the faults from the latest calendar date in the data.
# Comparing full dates (not day-of-month) matters when the data spans more than
# one month: `.dt.day.max()` would pick e.g. the 30th of an earlier month.
fault_dates = fa['timestamp'].dt.normalize()
fa_recent = fa[fault_dates == fault_dates.max()]

In [None]:
# Rows of fa_recent whose Asset Code appears in full_df['index']
# (the features returned by run_m).
flagged_mask = fa_recent['Asset Code'].isin(full_df['index'])
fa_recent[flagged_mask]

In [None]:
# How often each Fault ID occurs among the fa_recent rows whose Asset Code
# appears in full_df['index'].
flagged_mask = fa_recent['Asset Code'].isin(full_df['index'])
flagged_fault_ids = fa_recent[flagged_mask]['Fault ID']
flagged_fault_ids.value_counts()

In [None]:
# Histogram of 'Duration' for the fa_recent rows whose Asset Code appears in
# full_df['index'].
flagged_recent = fa_recent[fa_recent['Asset Code'].isin(full_df['index'])]
ax = flagged_recent['Duration'].hist()
# Title and axis labels let the figure stand alone; the trailing ';' hides the repr.
ax.set(title='Fault durations for flagged asset codes', xlabel='Duration', ylabel='Number of faults');