In [43]:
import pandas as pd
import numpy as np


In [44]:
raw = pd.read_csv("Absenteeism_data.csv")
raw.head()

Unnamed: 0,ID,Reason for Absence,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,11,26,07/07/2015,289,36,33,239.554,30,1,2,1,4
1,36,0,14/07/2015,118,13,50,239.554,31,1,1,0,0
2,3,23,15/07/2015,179,51,38,239.554,31,1,0,0,2
3,7,7,16/07/2015,279,5,39,239.554,24,1,2,0,4
4,11,23,23/07/2015,289,36,33,239.554,30,1,2,1,2


In [45]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         700 non-null    int64  
 1   Reason for Absence         700 non-null    int64  
 2   Date                       700 non-null    object 
 3   Transportation Expense     700 non-null    int64  
 4   Distance to Work           700 non-null    int64  
 5   Age                        700 non-null    int64  
 6   Daily Work Load Average    700 non-null    float64
 7   Body Mass Index            700 non-null    int64  
 8   Education                  700 non-null    int64  
 9   Children                   700 non-null    int64  
 10  Pets                       700 non-null    int64  
 11  Absenteeism Time in Hours  700 non-null    int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 65.8+ KB


In [46]:
raw["reason_group"] = pd.cut(
    x=raw["Reason for Absence"],
    bins=[-np.inf, 0, 14, 17, 21, np.inf],
    labels=["unknown", "sickness", "pregnancy", "accident", "other"],

)
 
reason_dummy = pd.get_dummies(raw["reason_group"], prefix="reason_group", drop_first=True)

In [47]:
df = pd.concat([raw, reason_dummy], axis=1)
#df.head(10)

In [48]:
df['timestamp'] = pd.to_datetime(df['Date'], dayfirst=True)
type(df['timestamp'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [49]:
df['month'] = df['timestamp'].dt.month
df['day_of_week'] = df['timestamp'].dt.day_of_week

In [50]:
df['higher_education'] = df['Education'].apply(lambda x: 1 if x > 1 else 0)


In [51]:
#target Einstufung
# der vorteil, der median zu benutyen ist, dass die Daten ausgleich geteilt werden 
benchmark = df['Absenteeism Time in Hours'].median()
df['target'] = df['Absenteeism Time in Hours'].apply(lambda x: 1 if x > benchmark else 0)
df.target.sum()/df.target.shape[0] # für logisticregression 40/60 aufgeteilt Daten funktioniert gut, aber für Neuralnetwork 50/50 ist besser

0.45571428571428574

In [52]:
df_select = df.copy()
df_select.drop(columns=['Date', 'ID', 'Reason for Absence', 'Absenteeism Time in Hours', 'reason_group', 'timestamp', 'Education'], inplace=True)
input = df_select.iloc[:, :-1]


In [76]:
column_scaled =['Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets',
       'month', 'day_of_week',
       'higher_education']

In [None]:
#die Daten satnardisieren
#dummy Spalten sollte  nicht skaliert werden, weil sie nicht interpretiert werden können
#normalerweise werden die Daten skaliert, vor dummy erstellung
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

input[column_scaled] = scaler.fit_transform(input[column_scaled])

In [80]:
#daten in train und test aufteilen und mischen
#standardmäßig schuffle = True
#stratify = True, damit die Verteilung der Zielvariable gleich bleibt
#stratofy link https://stackoverflow.com/questions/34842405/parameter-stratify-from-method-train-test-split-scikit-learn
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(input, df_select['target'], test_size=0.2, random_state=42, stratify=df_select['target'], shuffle=True)

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [82]:
#Logistic Regression
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(x_train, y_train)
logreg.score(x_train, y_train), logreg.score(x_test, y_test)

(0.7607142857142857, 0.7642857142857142)

In [83]:
#überprüfung der Vorhersage
y_pred = logreg.predict(x_test)
np.sum(y_pred == y_test) / y_test.shape[0] 

0.7642857142857142

In [84]:
#erstellen der Confusion tabelle
feature_name = input.columns.values
summary = pd.DataFrame(data=feature_name, columns=["feature"])
summary['coefficient'] = logreg.coef_.reshape(-1)
summary

Unnamed: 0,feature,coefficient
0,Transportation Expense,0.581903
1,Distance to Work,0.023776
2,Age,-0.149923
3,Daily Work Load Average,-0.049957
4,Body Mass Index,0.283916
5,Children,0.467187
6,Pets,-0.312661
7,reason_group_sickness,2.785125
8,reason_group_pregnancy,0.757755
9,reason_group_accident,3.129479


In [85]:
#Achsenabschnitt hinzufügen
summary.loc[-1] = ['intercept', logreg.intercept_[0]]  # adding a row
summary.index = summary.index + 1  # shifting index
summary = summary.sort_index()  # sorting by index


In [86]:
#Odds Ratio hinzufügen
#wenn die Wahrscheinlichkeit 5/1 und die Odds Ratio 2 ist, für eine Einheitsänderung des Eingabe steigen die Wahrscheinlichkeit 2*5/1
summary['odds_ratio'] = np.exp(summary['coefficient'])