In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle



In [98]:
class Absent_model():

    def __init__(self, model_file, scaler_file):
        self.reg = pickle.load(open(model_file, 'rb'))
        self.scaler = pickle.load(open(scaler_file, 'rb'))
        self.data = None
    
    def standardized(self, X, y=None, copy=None):
        scaler_col = self.scaler.get_feature_names_out()
        X_scaled = pd.DataFrame(self.scaler.fit_transform(X[scaler_col]), columns=scaler_col)
        X_not_scaled = X.loc[:,~X.columns.isin(scaler_col)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[self.init_col_order]
 
    def load_and_clean_data(self, file_name):
        raw = pd.read_csv(file_name,delimiter=',')

        raw["reason_group"] = pd.cut(
            x=raw["Reason for Absence"],
            bins=[-np.inf, 0, 14, 17, 21, np.inf],
            labels=["unknown", "sickness", "pregnancy", "accident", "other"],

        )
        
        reason_dummy = pd.get_dummies(raw["reason_group"], prefix="reason_group", drop_first=True)
        df = pd.concat([raw, reason_dummy], axis=1)
        df['timestamp'] = pd.to_datetime(df['Date'], dayfirst=True)
        df['month'] = df['timestamp'].dt.month
        df['day_of_week'] = df['timestamp'].dt.day_of_week
        df['higher_education'] = df['Education'].apply(lambda x: 1 if x > 1 else 0)

        #spalten die wir nicht brauchen
        df.drop(columns=['Date', 'ID', 'Reason for Absence', 'reason_group', 'timestamp', 'Education'], inplace=True)
        #Rückwärtseliminierung
        df.drop(columns=['month', 'Distance to Work', 'Daily Work Load Average'], inplace=True)
        self.init_col_order = df.columns

        
        self.data = self.standardized(df)

    def data_existence(self):
        if self.data is None:
            raise ValueError("Data not loaded. Please load data first.")
        else: 
            print("Data loaded successfully.")
            return True

    def predict_probability(self):
        if self.data_existence():
            pred = self.reg.predict_proba(self.data[self.init_col_order])[:,1]
            return pred

    def predict_output(self):
        if self.data_existence():
            pred = self.reg.predict(self.data[self.init_col_order])
            return pred
        
    def adding_predicted_to_df(self):
        self.data['Probability'] = self.predict_probability()
        self.data ['Prediction'] = self.predict_output()
        return self.data       

Absent_model = Absent_model("model.sav", "scaler.sav")
  
Absent_model.load_and_clean_data('Absenteeism_new_data.csv')


In [99]:
Absent_model.adding_predicted_to_df()
Absent_model.data.head()

Data loaded successfully.
Data loaded successfully.


Unnamed: 0,Transportation Expense,Age,Body Mass Index,Children,Pets,reason_group_sickness,reason_group_pregnancy,reason_group_accident,reason_group_other,day_of_week,higher_education,Probability,Prediction
0,-0.326675,-0.877958,-1.419428,-1.187282,-0.660387,0,0,0,1,1.346874,1,0.150477,0
1,2.09758,-1.12791,0.294598,0.030443,0.827808,1,0,0,0,-1.488651,0,0.935596,1
2,-0.646357,-0.378053,-0.133908,1.248168,-0.660387,0,0,0,1,-0.070888,0,0.415703,0
3,-0.326675,0.371804,-0.776668,1.248168,-0.660387,0,0,0,1,1.346874,1,0.354225,0
4,-0.646357,-0.378053,-0.133908,1.248168,-0.660387,1,0,0,0,1.346874,0,0.754528,1
