In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle





In [34]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator,TransformerMixin): 
    
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy,with_mean,with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
    def transform(self, X, y=None, copy=None):
        print(X.columns)
        print(self.scaler.columns)
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
    
class Absent_model():

    def __init__(self, model_file, scaler_file):
        self.reg = pickle.load(open(model_file, 'rb'))
        self.scaler = pickle.load(open(scaler_file, 'rb'))
        self.data = None
    

    def load_and_clean_data(self, file_name):
        raw = pd.read_csv(file_name,delimiter=',')

        raw["reason_group"] = pd.cut(
            x=raw["Reason for Absence"],
            bins=[-np.inf, 0, 14, 17, 21, np.inf],
            labels=["unknown", "sickness", "pregnancy", "accident", "other"],

        )
        
        reason_dummy = pd.get_dummies(raw["reason_group"], prefix="reason_group", drop_first=True)
        df = pd.concat([raw, reason_dummy], axis=1)
        df['timestamp'] = pd.to_datetime(df['Date'], dayfirst=True)
        df['month'] = df['timestamp'].dt.month
        df['day_of_week'] = df['timestamp'].dt.day_of_week
        df['higher_education'] = df['Education'].apply(lambda x: 1 if x > 1 else 0)

        #target Einstufung
        # der vorteil, der median zu benutyen ist, dass die Daten ausgleich geteilt werden 
        benchmark = df['Absenteeism Time in Hours'].median()
        df['target'] = df['Absenteeism Time in Hours'].apply(lambda x: 1 if x > benchmark else 0)
        #spalten die wir nicht brauchen
        df.drop(columns=['Date', 'ID', 'Reason for Absence', 'Absenteeism Time in Hours', 'reason_group', 'timestamp', 'Education'], inplace=True)
        #Rückwärtseliminierung
        df.drop(columns=['month', 'Distance to Work', 'Daily Work Load Average'], inplace=True)
        
        
        self.preprocessed_data = df

        self.data = self.scaler.transform(df)

    def data_existence(self):
        if self.data is None:
            raise ValueError("Data not loaded. Please load data first.")
        else: 
            print("Data loaded successfully.")
            return True

    def predict_probability(self):
        if self.data_existence():
            pred = self.reg.predict_proba(self.data)[:,1]
            return pred

    def predict_output(self):
        if self.data_existence():
            pred = self.reg.predict(self.data)[:,1]
            return pred
        
    def adding_predicted_to_df(self):
        if self.data_existence():
            self.data['Probability'] = self.predict_probability()
            self.data ['Prediction'] = self.predict_output()
            return self.data       

Absent_model = Absent_model("model.sav", "scaler.sav")
Absent_model.load_and_clean_data('Absenteeism_data.csv')


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- higher_education
- reason_group_accident
- reason_group_other
- reason_group_pregnancy
- reason_group_sickness
- ...


In [None]:
Absent_model.predict_probability()


In [None]:
def stadardize_data(df, column_exclude):

input = df.iloc[:, :-1]

dummy_column =['reason_group_sickness', 'reason_group_pregnancy',
       'reason_group_accident', 'reason_group_other', 'higher_education']
column_scaled = [x for x in input.columns if x not in dummy_column]


In [None]:
#die Daten satnardisieren
#dummy Spalten sollte  nicht skaliert werden, weil sie nicht interpretiert werden können
#normalerweise werden die Daten skaliert, vor dummy erstellung
scaler = StandardScaler()

input[column_scaled] = scaler.fit_transform(input[column_scaled])

In [None]:
#daten in train und test aufteilen und mischen
#standardmäßig schuffle = True
#stratify = True, damit die Verteilung der Zielvariable gleich bleibt
#stratofy link https://stackoverflow.com/questions/34842405/parameter-stratify-from-method-train-test-split-scikit-learn
x_train, x_test, y_train, y_test = train_test_split(input, df_select['target'], test_size=0.2, random_state=42, stratify=df_select['target'], shuffle=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [None]:
#Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train)

(0.7642857142857142, 0.7642857142857142)

In [None]:
#überprüfung der Vorhersage
y_pred = logreg.predict(x_train)
np.sum(y_pred == y_train) / y_train.shape[0] 

0.7642857142857142

In [None]:
#erstellen der Confusion tabelle
feature_name = input.columns.values
summary = pd.DataFrame(data=feature_name, columns=["feature"])
summary['coefficient'] = logreg.coef_.reshape(-1)


In [None]:
#Achsenabschnitt hinzufügen
summary.loc[-1] = ['intercept', logreg.intercept_[0]]  # adding a row
summary.index = summary.index + 1  # shifting index
summary = summary.sort_index()  # sorting by index


In [None]:
#Odds Ratio hinzufügen
#wenn die Wahrscheinlichkeit 5/1 und die Odds Ratio 2 ist, für eine Einheitsänderung des Eingabe steigen die Wahrscheinlichkeit 2*5/1
summary['odds_ratio'] = np.exp(summary['coefficient'])

In [None]:
summary.sort_values(by='odds_ratio', ascending=False)

Unnamed: 0,feature,coefficient,odds_ratio
8,reason_group_accident,3.130692,22.889822
6,reason_group_sickness,2.76537,15.884919
9,reason_group_other,0.953105,2.593752
7,reason_group_pregnancy,0.763485,2.145741
1,Transportation Expense,0.5974,1.817387
4,Children,0.465434,1.592705
3,Body Mass Index,0.294789,1.342843
11,higher_education,0.202941,1.225001
2,Age,-0.154169,0.857127
10,day_of_week,-0.246102,0.781842


In [None]:
#model evaluieren
logreg.score(x_test, y_test)

0.7642857142857142

In [None]:
#die Wahrscheinlichkeit die Mitarbeiterin abwesent ist
y_pred_proba = logreg.predict_proba(x_test)[:, 1]
