In [1]:


import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    The work is delegated to an internal ``StandardScaler`` (``self.scaler``)
    that is fitted on, and applied to, only the columns listed in ``columns``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        # Filled in by fit(); kept as None until then.
        self.mean_ = self.var_ = None

    def fit(self, X, y=None):
        """
        Learn the scaling statistics from the selected columns of X.

        Parameters:
        X (pd.DataFrame): Frame containing at least the columns in ``self.columns``.
        y (optional): Ignored.

        Returns:
        self: This instance, now holding ``mean_`` and ``var_``.
        """
        selected = X[self.columns]
        self.scaler.fit(selected)
        # Mirror the inner scaler's statistics on this wrapper as arrays.
        self.mean_, self.var_ = (
            np.array(stat) for stat in (self.scaler.mean_, self.scaler.var_)
        )
        return self

    def transform(self, X, copy=None):
        """
        Scale the selected columns and return X with its original column order.

        Parameters:
        X (pd.DataFrame): Frame to transform.
        copy (optional): Ignored.

        Returns:
        pd.DataFrame: X with ``self.columns`` replaced by their scaled values.
        """
        original_order = X.columns
        scaled_values = self.scaler.transform(X[self.columns])
        scaled_part = pd.DataFrame(scaled_values, columns=self.columns, index=X.index)
        # Everything that is not in self.columns is carried over unchanged.
        passthrough_mask = ~X.columns.isin(self.columns)
        untouched_part = X.loc[:, passthrough_mask]
        combined = pd.concat([untouched_part, scaled_part], axis=1)
        return combined[original_order]

    def get_params(self, deep=True):
        """
        Report the constructor parameters of this estimator.

        Parameters:
        deep (bool, optional): Ignored.

        Returns:
        dict: ``columns``, ``copy``, ``with_mean`` and ``with_std`` with their current values.
        """
        param_names = ('columns', 'copy', 'with_mean', 'with_std')
        return {name: getattr(self, name) for name in param_names}

    def set_params(self, **params):
        """
        Assign the given attributes and rebuild the inner scaler.

        Parameters:
        **params: Attribute names mapped to the values to assign.

        Returns:
        self: This instance.
        """
        for name in params:
            setattr(self, name, params[name])
        # The inner scaler is recreated (unfitted) from the current settings.
        self.scaler = StandardScaler(
            copy=self.copy,
            with_mean=self.with_mean,
            with_std=self.with_std,
        )
        return self

class absenteeism_model:
    """Score new absenteeism records with a pickled estimator and scaler.

    Attributes:
        reg: The unpickled estimator, or None if loading failed.
        scaler: The unpickled scaler, or None if loading failed.
        data: Scaled feature frame produced by load_and_clean_data, else None.
        preprocessed_data: Unscaled feature frame (predictions are appended
            to it by predicted_outputs), else None.
        df_with_predictions: Untouched copy of the CSV as read, else None.
    """

    def __init__(self, model_file, scaler_file):
        """Load the estimator and the scaler from two pickle files.

        Parameters:
        model_file: Path of the pickled estimator.
        scaler_file: Path of the pickled scaler.

        A truncated or corrupt pickle is reported with a printed message and
        leaves ``reg`` and ``scaler`` as None. A missing file still raises.
        """
        # Define every attribute up front so the predicted_* methods can test
        # `self.data` even when unpickling fails below.
        self.reg = None
        self.scaler = None
        self.data = None
        self.preprocessed_data = None
        self.df_with_predictions = None
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only point
            # this at files produced by a trusted training run.
            with open(model_file, 'rb') as f_model, open(scaler_file, 'rb') as f_scaler:
                reg = pickle.load(f_model)
                scaler = pickle.load(f_scaler)
        except (EOFError, pickle.UnpicklingError) as e:
            print(f"Error loading pickle file: {e}")
        else:
            # Publish both objects only when both loaded successfully.
            self.reg = reg
            self.scaler = scaler

    def load_and_clean_data(self, data_file):
        """Read a CSV of new records and turn it into model-ready features.

        Parameters:
        data_file: Path or file-like object accepted by ``pd.read_csv``.

        Side effects:
        Sets ``df_with_predictions`` (raw copy), ``preprocessed_data``
        (unscaled features) and ``data`` (features after ``scaler.transform``).

        Raises:
        RuntimeError: If the scaler could not be loaded in ``__init__``.
        ValueError: If the CSV does not have the expected number of columns.
        """
        if self.scaler is None:
            raise RuntimeError(
                "Scaler is not loaded; the pickle files could not be read in __init__."
            )

        raw = pd.read_csv(data_file, delimiter=',')
        self.df_with_predictions = raw.copy()

        # A CSV saved with its index carries an extra 'Unnamed: N' column,
        # which would break the positional rename below.
        stray_index_cols = [c for c in raw.columns if str(c).startswith('Unnamed: ')]
        df = raw.drop(columns=stray_index_cols + ['ID'])

        # Derive the four reason-group flags straight from the reason code.
        # get_dummies(drop_first=True) is not used because it drops the
        # smallest code present in the file, which is a real reason whenever
        # code 0 is absent.
        reason = df['Reason for Absence']
        reason_groups = pd.DataFrame({
            'r1': reason.between(1, 14),
            'r2': reason.between(15, 17),
            'r3': reason.between(18, 21),
            'r4': reason >= 22,
        }).astype(int)

        df = df.drop(columns=['Reason for Absence'])
        # Positional rename: the remaining columns must arrive in this order.
        df.columns = ['Date', 'Transportation Expense', 'Distance to Work', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Education',
                      'Children', 'Pets']

        # 'Mv' is the calendar month taken from the day/month/year date.
        df['Mv'] = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.month
        # Collapse education to binary: 1 -> 0, 2/3/4 -> 1 (others become 0 via fillna).
        df['Education'] = df['Education'].map({1: 0, 2: 1, 3: 1, 4: 1})

        # Date, day of week, 'Daily Work Load Average' and 'Distance to Work'
        # are not part of the feature set.
        feature_order = ['r1', 'r2', 'r3', 'r4', 'Mv', 'Transportation Expense',
                         'Age', 'Body Mass Index', 'Education', 'Children', 'Pets']
        features = pd.concat([reason_groups, df], axis=1)[feature_order]
        features = features.fillna(value=0)

        self.preprocessed_data = features.copy()
        self.data = self.scaler.transform(features)

    def predicted_probability(self):
        """Return the second column of ``reg.predict_proba`` for the loaded data.

        Returns None when no data has been loaded.
        """
        if self.data is not None:
            pred = self.reg.predict_proba(self.data)[:, 1]
            return pred

    def predicted_output_category(self):
        """Return ``reg.predict`` for the loaded data, or None when no data is loaded."""
        if self.data is not None:
            pred_outputs = self.reg.predict(self.data)
            return pred_outputs

    def predicted_outputs(self):
        """Append 'Probability' and 'Prediction' columns to the feature frame.

        Returns ``preprocessed_data`` with both columns added (the stored
        frame itself is modified), or None when no data has been loaded.
        """
        if self.data is not None:
            self.preprocessed_data['Probability'] = self.reg.predict_proba(self.data)[:, 1]
            self.preprocessed_data['Prediction'] = self.reg.predict(self.data)
            return self.preprocessed_data


In [2]:
# Preview the raw CSV that is handed to load_and_clean_data further down.
pd.read_csv('Absenteeism_new_data.csv')

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,22,27,01/06/2018,179,26,30,237.656,19,3,0,0
1,10,7,04/06/2018,361,52,28,237.656,27,1,1,4
2,14,23,06/06/2018,155,12,34,237.656,25,1,2,0
3,17,25,08/06/2018,179,22,40,237.656,22,2,2,0
4,14,10,08/06/2018,155,12,34,237.656,25,1,2,0
5,28,11,11/06/2018,225,26,28,237.656,24,1,1,2
6,16,7,13/06/2018,118,15,46,275.089,25,1,2,0
7,22,27,13/06/2018,179,26,30,275.089,19,3,0,0
8,34,26,15/06/2018,118,10,37,275.089,28,1,0,0
9,34,10,20/06/2018,118,10,37,275.089,28,1,0,0


In [3]:
# Unpickle the estimator and the scaler from the files 'model' and 'scaler'
# (relative paths, so they are resolved against the current working directory).
model = absenteeism_model('model','scaler')

In [4]:
# Read the CSV, build the feature frame, and store its scaled version on `model`.
model.load_and_clean_data('Absenteeism_new_data.csv')

