In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import scsavailability as scs
    
import statsmodels.api as sm
from scipy import stats

from scsavailability import features as feat, model as md, plotting as pt, score as sc

In [2]:
path = 'C:/Users/Jamie.williams/OneDrive - Newton Europe Ltd/Castle Donnington/Data/'

In [3]:
at = pd.read_csv(path+'active_totes_20201210.csv')
av = pd.read_csv(path+'Availability_with_Grey&Blue_1811-0912.csv',names = ["timestamp","Pick Station","Availability","Blue Tote Loss","Grey Tote Loss"])
#fa = pd.read_csv(path + 'scs_tote_matched.csv')
scs_raw = pd.read_csv(path + 'Faults20_11-10_12.csv')

In [4]:
fa = feat.add_code(scs_raw)
fa, unmapped = feat.add_tote_colour(fa)

Running: Function "load_tote_lookup" (id=1898467720928) was called 1 times


2021-01-06 18:07:17,129 - NumExpr defaulting to 8 threads.


In [5]:
at = feat.pre_process_AT(at_raw)

NameError: name 'at_raw' is not defined

In [None]:
av = feat.pre_process_av(av)

In [None]:
fa = feat.preprocess_faults(fa,remove_same_location_faults = True)

In [None]:
fa_floor = feat.floor_shift_time_fa(fa, shift=20)
fa_sel = feat.fault_select(fa_floor, fault_select_options = {'Tote Colour':['Both','Blue','Grey']},duration_thres = 1)

In [None]:
def run_m(module, fa_floor, av, at):
    """
    Summary
    -------
    Runs model-level linear model and outputs the "significant" asset codes and their coefficients.
    1. Select Module-related faults
    2. Filter out rare asset codes
    3. Fit Model
    4. Remove high p-values
    5. Re-fit model
    6. Report negative coefficients
    ----------
    module: int
        module number
    fa_floor: pandas DataFrame
        formatted faults data
    av: pandas DataFrame
        formatted availability data
    at: pandas DataFrame
        formatted active totes data
    Returns
    -------
    df_r: pandas DataFrame
        significant coefficients
    Example
    --------
    df_r = run_m(2, fa_floor, av, at)
    """
    
    mod = [str(module)]
    
    
    fa_sel_1 = feat.get_data_faults(fa_floor, modules = mod)
    limit = 250
    most_common = list(((fa_sel_1['Asset Code'].value_counts() > limit)[fa_sel_1['Asset Code'].value_counts() > limit]).index)
    fa_sel = fa_sel_1[fa_sel_1['Asset Code'].isin(most_common)]


    fa_agg = feat.faults_aggregate(fa_sel, fault_agg_level= 'Asset Code', agg_type = 'count')

    av_sel, at_sel = feat.av_at_select(av, at, remove_high_AT = True, availability_select_options = {"Module" : mod})

    av_agg = feat.aggregate_availability(av_sel, agg_level = 'Module')
    at_agg = feat.aggregate_totes(at_sel, agg_level = 'Module')

    df = feat.merge_av_fa_at(av_agg ,at_df=at_agg, fa_df = fa_agg, agg_level = 'None')
    X,y = md.gen_feat_var(df, features = ['Totes','Faults'])
    #X = X.drop('Module', axis=1)
    # Flip to availability
    y=1-y
  

    X_train, X_test, y_train, y_test = md.split(X,y)
    Linear_mdl,predictions_LM,Coeff,fit_metrics = md.run_LR_model(X_train, X_test, y_train, y_test)
    cv_R2 = md.cross_validate_r2(Linear_mdl, X, y, n_folds = 5, shuffle = True, random_state = 101)    
    
    model = sm.OLS(y,X)
    results = model.fit()
    keep_features = results.pvalues[results.pvalues < 0.10].index
    model = sm.OLS(y,X[keep_features])
    results = model.fit()
    negs = results.params[results.params < 0]
    df_r = pd.DataFrame(negs, columns=['Coefficient']).reset_index()
    df_r['Module'] = module
    return df_r

In [None]:
a = run_m(1, fa_sel, av, at)

In [None]:
full_df = pd.DataFrame(columns=['index','Coefficient', 'Module'])
for i in range(1, 21):
    try:
        a = run_m(i, fa_sel, av, at)
        full_df = full_df.append(a)
        print(i)
    except:
        print(f'Missing Module: {0}'.format(i))

In [None]:
full_df

### Take last day of faults

In [None]:
fa.groupby('MODULE')['Fault ID'].nunique().reset_index().head()

In [None]:
fa_recent = fa[(fa['timestamp'].dt.day == fa['timestamp'].dt.day.max())]

In [None]:
fa_recent[fa_recent['Asset Code'].isin(full_df['index'])]

In [None]:
fa_recent[fa_recent['Asset Code'].isin(full_df['index'])]['Fault ID'].value_counts()

In [None]:
fa_recent[fa_recent['Asset Code'].isin(full_df['index'])]['Duration'].hist()