# In [None]:
# import all the libraries used
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# the custom scaler class
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The listed ``columns`` are passed through a ``StandardScaler``; every
    other column is returned unchanged. The output keeps the column order
    and the row index of the input frame.

    Parameters
    ----------
    columns : list
        Labels of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the wrapped ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # StandardScaler's options are passed by keyword; positional use is
        # rejected by current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        # Keep every constructor argument as an attribute of the same name so
        # that BaseEstimator.get_params() / clone() / repr() can find them.
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean/np.var on a DataFrame forward
        # axis=None to pandas, and the meaning of that differs across pandas
        # versions. ddof=0 matches np.var's population variance.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for API compatibility and are unused.
        """
        original_order = X.columns
        # Reuse X's index; otherwise the scaled frame gets a fresh RangeIndex
        # and the concat below would misalign rows for any non-default index.
        scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        untouched = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([untouched, scaled], axis=1)[original_order]
    
class absenteeism_model():
    """Load a pickled model and scaler and apply them to new absenteeism data.

    Attributes
    ----------
    reg : object
        The unpickled model; must provide ``predict`` and ``predict_proba``.
    scalar : object
        The unpickled scaler; must provide ``transform``.
    data : object or None
        Scaled features produced by ``load_and_clean_data``; ``None`` until
        that method has been called.
    preprocessed_data : pandas.DataFrame or None
        Cleaned, unscaled features (plus prediction columns once
        ``predicted_outputs`` has run).
    df_with_pred : pandas.DataFrame or None
        Untouched copy of the raw CSV contents.
    """

    def __init__(self, model_file='model', scaler_file='scalar'):
        """Read the previously saved model and scaler.

        Parameters
        ----------
        model_file : str
            Path of the pickled model. Defaults to ``'model'``.
        scaler_file : str
            Path of the pickled scaler. Defaults to ``'scalar'``.
        """
        # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
        # file. Only load model/scaler files that come from a trusted source.
        with open(model_file, 'rb') as model_fh, open(scaler_file, 'rb') as scaler_fh:
            self.reg = pickle.load(model_fh)
            self.scalar = pickle.load(scaler_fh)
        self.data = None
        self.preprocessed_data = None
        self.df_with_pred = None

    def load_and_clean_data(self, data_file):
        """Read a CSV file and build the feature matrix for the model.

        Stores the raw frame in ``df_with_pred``, the cleaned features in
        ``preprocessed_data`` and the scaled features in ``data``.

        Parameters
        ----------
        data_file : str or file-like
            Anything accepted by ``pandas.read_csv``.
        """
        raw = pd.read_csv(data_file)
        self.df_with_pred = raw.copy()

        # Collapse the individual reason codes into four indicator columns
        # (codes 1-14, 15-17, 18-21 and 22+). Testing the code ranges directly
        # does not depend on which codes happen to be present in this file,
        # unlike get_dummies(drop_first=True), which drops whatever code is
        # smallest in the data at hand.
        reason = raw['Reason for Absence']
        reason_flags = pd.DataFrame(
            {
                'Reason_1': reason.between(1, 14).astype(int),
                'Reason_2': reason.between(15, 17).astype(int),
                'Reason_3': reason.between(18, 21).astype(int),
                'Reason_4': (reason >= 22).astype(int),
            },
            index=raw.index,
        )

        passthrough = ['Transportation Expense', 'Age', 'Body Mass Index',
                       'Education', 'Children', 'Pets']
        features = pd.concat([reason_flags, raw[passthrough]], axis=1)

        # Education becomes binary: level 1 -> 0, levels 2-4 -> 1.
        features['Education'] = features['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # Only the month of the date is kept as a feature.
        dates = pd.to_datetime(raw['Date'], format='%d/%m/%Y')
        features['Month'] = dates.dt.month

        # Missing values (including unmapped education levels) become 0.
        features = features.fillna(value=0)

        self.preprocessed_data = features.copy()
        self.data = self.scalar.transform(features)

    # a function which outputs the probability of a data point to be 1
    def pred_prob(self):
        """Return ``reg.predict_proba`` for the loaded data, or ``None`` if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict_proba(self.data)

    # a function which outputs 0 or 1 based on our model
    def pred_output_category(self):
        """Return ``reg.predict`` for the loaded data, or ``None`` if no data is loaded."""
        if self.data is None:
            return None
        return self.reg.predict(self.data)

    # predicts the outputs and the probabilities and
    # add columns with these values at the end of the data
    def predicted_outputs(self):
        """Append 'Probability' and 'Prediction' columns to ``preprocessed_data``.

        Returns the updated ``preprocessed_data`` frame, or ``None`` if no
        data is loaded.
        """
        if self.data is None:
            return None
        # Column 1 of predict_proba is stored as the probability.
        self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
        self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
        return self.preprocessed_data