# One Hot Encoding

https://www.ritchieng.com/machinelearning-one-hot-encoding/

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

In [81]:
# Aktuelle Pfade
file_paths = [
     "data/rad_2022_tage_19_06_23_r.csv",
     "data/rad_2021_tage_19_06_23_r.csv",
     "data/rad_2020_tage_19_06_23_r.csv",
     "data/rad_2019_tage_19_06_23_r.csv",
     "data/rad_2018_tage_19_06_23_r.csv",
     "data/rad_2017_tage_19_06_23_r.csv",
     "data/rad_2016_tage_19_06_23_r.csv",
     "data/rad_2015_tage_19_06_23_r.csv",
     "data/rad_2014_tage_19_06_23_r.csv",
     "data/rad_2013_tage_19_06_23_r.csv",
     "data/rad_2012_tage_19_06_23_r.csv",
     "data/rad_2011_tage_19_06_23_r.csv",
     "data/rad_2010_tage_19_06_23_r.csv",
     "data/rad_2009_tage_19_06_23_r.csv",
     "data/rad_2008_tage_19_06_23_r.csv"
]

In [110]:
df_list = []

for file_path in file_paths:
    df_temp = pd.read_csv(file_path)
    df_list.append(df_temp)

# Einzelne DataFrames zu einem Gesamt-DataFrame zusammenführen
df = pd.concat(df_list, ignore_index=True)

In [111]:
df.head()

Unnamed: 0,datum,zaehlstelle,uhrzeit_start,uhrzeit_ende,richtung_1,richtung_2,gesamt,min.temp,max.temp,niederschlag,bewoelkung,sonnenstunden,kommentar
0,2022-01-01,Arnulf,00:00,23:59,444.0,0.0,444.0,3.2,13.4,0.0,68,7.6,
1,2022-01-02,Arnulf,00:00,23:59,476.0,0.0,476.0,1.9,15.8,0.0,80,4.7,
2,2022-01-03,Arnulf,00:00,23:59,752.0,0.0,752.0,8.7,11.5,0.7,99,0.0,
3,2022-01-04,Arnulf,00:00,23:59,603.0,0.0,603.0,3.7,16.5,10.3,98,1.0,
4,2022-01-05,Arnulf,00:00,23:59,578.0,0.0,578.0,0.3,3.9,1.2,95,0.4,


### https://www.kdnuggets.com/2023/07/pandas-onehot-encode-data.html#:~:text=One%2Dhot%20encoding%20is%20a,values%20into%20compatible%20numerical%20representations.&text=For%20example%20for%20this%20dummy,to%20be%20in%20numerical%20form.

#### out of the categorical data we have to creat numerical data

#### in my case:
#### 12 variables -> 12 months

In [84]:
# Annahme: Ihr DataFrame heißt df und die Spalte mit den Datumsangaben ist 'datum'
# Wenn die Spalte bereits als Datum formatiert ist, können Sie den folgenden Schritt überspringen:
df['datum'] = pd.to_datetime(df['datum'])

# Extrahieren Sie den Monat und erstellen Sie eine neue Spalte namens 'monat'
df['monat'] = df['datum'].dt.strftime('%B')

# Alternativ können Sie den Monat auch als Zahl extrahieren (1 für Januar, 2 für Februar, usw.)
# df['monat'] = df['datum'].dt.month

# Nun enthält die Spalte 'monat' den Namen des Monats
print(df)


           datum zaehlstelle uhrzeit_start uhrzeit_ende  richtung_1  \
0     2022-01-01      Arnulf         00:00        23:59       444.0   
1     2022-01-02      Arnulf         00:00        23:59       476.0   
2     2022-01-03      Arnulf         00:00        23:59       752.0   
3     2022-01-04      Arnulf         00:00        23:59       603.0   
4     2022-01-05      Arnulf         00:00        23:59       578.0   
...          ...         ...           ...          ...         ...   
24072 2012-12-27     Olympia         00:00        23:59       204.0   
24073 2012-12-28     Olympia         00:00        23:59       210.0   
24074 2012-12-29     Olympia         00:00        23:59       367.0   
24075 2012-12-30     Olympia         00:00        23:59       412.0   
24076 2012-12-31     Olympia         00:00        23:59       514.0   

       richtung_2  gesamt  min.temp  max.temp  niederschlag  bewoelkung  \
0             0.0   444.0       3.2      13.4           0.0          68 

# limit to categorical data using df.select_dtypes()

In [39]:
df = df.select_dtypes(include=[object])
df.head(3)

Unnamed: 0,zaehlstelle,uhrzeit_start,uhrzeit_ende,kommentar,monat
0,Arnulf,00:00,23:59,,January
1,Arnulf,00:00,23:59,,January
2,Arnulf,00:00,23:59,,January


In [85]:
df = df.drop(['richtung_1', 'richtung_2', 'uhrzeit_start', 'uhrzeit_ende', 'kommentar'], axis=1)

In [86]:
df

Unnamed: 0,datum,zaehlstelle,gesamt,min.temp,max.temp,niederschlag,bewoelkung,sonnenstunden,monat
0,2022-01-01,Arnulf,444.0,3.2,13.4,0.0,68,7.6,January
1,2022-01-02,Arnulf,476.0,1.9,15.8,0.0,80,4.7,January
2,2022-01-03,Arnulf,752.0,8.7,11.5,0.7,99,0.0,January
3,2022-01-04,Arnulf,603.0,3.7,16.5,10.3,98,1.0,January
4,2022-01-05,Arnulf,578.0,0.3,3.9,1.2,95,0.4,January
...,...,...,...,...,...,...,...,...,...
24072,2012-12-27,Olympia,406.0,4.4,9.3,8.7,85,0.0,December
24073,2012-12-28,Olympia,429.0,-1.4,6.5,0.6,80,0.0,December
24074,2012-12-29,Olympia,747.0,-3.1,7.0,0.0,64,5.9,December
24075,2012-12-30,Olympia,849.0,-3.3,9.2,0.0,34,3.5,December


In [87]:
df.shape

(24077, 9)

In [89]:
df.columns

Index(['datum', 'zaehlstelle', 'gesamt', 'min.temp', 'max.temp',
       'niederschlag', 'bewoelkung', 'sonnenstunden', 'monat'],
      dtype='object')

In [90]:
df = df.drop(['gesamt','zaehlstelle', 'datum', 'monat'], axis=1)

In [91]:
df

Unnamed: 0,min.temp,max.temp,niederschlag,bewoelkung,sonnenstunden
0,3.2,13.4,0.0,68,7.6
1,1.9,15.8,0.0,80,4.7
2,8.7,11.5,0.7,99,0.0
3,3.7,16.5,10.3,98,1.0
4,0.3,3.9,1.2,95,0.4
...,...,...,...,...,...
24072,4.4,9.3,8.7,85,0.0
24073,-1.4,6.5,0.6,80,0.0
24074,-3.1,7.0,0.0,64,5.9
24075,-3.3,9.2,0.0,34,3.5


In [92]:
# Umwandeln der 'monat' Spalte in One-Hot-Vektoren
encoder = OneHotEncoder(sparse_output=False)
monat_one_hot = encoder.fit_transform(df['monat'].values.reshape(-1,1))

# Umwandeln der One-Hot-Vektoren zu einem DataFrame
monat_df = pd.DataFrame(monat_one_hot, columns=['monat_'+ str(i) for i in range(monat_one_hot.shape[1])])

# Zusammenführen der DataFrames
df_encoded = pd.concat([df, monat_df], axis=1)

df_encoded.select_dtypes()

KeyError: 'monat'

In [93]:
# TODO: create a LabelEncoder object and fit it to each feature in X


# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# use df.apply() to apply le.fit_transform to all columns
df_2 = df.apply(le.fit_transform)
df_2.head()

Unnamed: 0,min.temp,max.temp,niederschlag,bewoelkung,sonnenstunden
0,167,194,0,54,76
1,154,218,0,64,47
2,222,175,7,79,0
3,172,225,103,78,10
4,138,99,12,76,4


In [94]:

# TODO: create a OneHotEncoder object, and fit it to all of X

# 1. INSTANTIATE
enc = preprocessing.OneHotEncoder()

# 2. FIT
enc.fit(df_2)

# 3. Transform
onehotlabels = enc.transform(df_2).toarray()
onehotlabels.shape

# as you can see, you've the same number of rows 24077
# but now you've so many more columns (18 instead of 2) due to how we changed all the categorical data into numerical data

(24077, 1248)

In [95]:
onehotlabels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [96]:
type(onehotlabels)

numpy.ndarray

In [113]:
# target
# Hier verwenden wir np.random.seed, um die Zufälligkeit zu steuern und die Ergebnisse reproduzierbar zu machen
#np.random.seed(42)

# Kopieren Sie einige Werte aus der Spalte 'gesamt' als Beispielzielvariablen
#num_samples = 24077  # Anzahl der Beispiele, die Sie kopieren möchten
#target = df['gesamt'].sample(num_samples).values
target = df['gesamt'] = df['gesamt'].fillna(df['gesamt'].mean())


# Annahme: Ihre Zielvariable ist 'target'
#target = np.array([1, 0, 1])  # Beispielzielvariablen


In [116]:
# Annahme: Ihr One-Hot-kodiertes Array heißt 'one_hot_encoded_data'
# Hier verwenden wir ein Dummy-Array für Illustrationszwecke
one_hot_encoded_data = onehotlabels

# Anwendung auf ein einfaches Modell (z.B., Logistische Regression)


# Teilen Sie die Daten in Trainings- und Testsets auf
X_train, X_test, y_train, y_test = train_test_split(one_hot_encoded_data, target, test_size=0.2, random_state=42)

# Erstellen Sie ein Logistisches Regressionsmodell
model = LogisticRegression()

# Trainieren Sie das Modell
model.fit(X_train, y_train)

# Bewerten Sie die Modellleistung
accuracy = model.score(X_test, y_test)
print(f'Genauigkeit des Modells: {accuracy}')


Genauigkeit des Modells: 0.009136212624584718


In [115]:
# Definiere deine Features und Target Variablen
features = df[['min.temp', 'max.temp', 'niederschlag', 'bewoelkung', 'sonnenstunden']]
target = df['gesamt']

# Teile deine Daten in Trainings- und Testdaten auf
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialisiere das Lineare Regression Modell
model = LinearRegression()  

# Trainiere das Modell mit den Trainingsdaten
model.fit(features_train, target_train) 

# Nutze das Modell um eine Vorhersage auf den Testdaten zu erzeugen
prediction = model.predict(features_test)

# Du kannst nun die Metriken deines Modells ausgeben z.B. den MSE (Mean squared error)
print('Mean Squared Error:', metrics.mean_squared_error(target_test, prediction))  

KeyError: "None of [Index(['min.temp', 'max.temp', 'niederschlag', 'bewoelkung', 'sonnenstunden'], dtype='object')] are in the [columns]"

In [114]:
# Sample DataFrame (replace this with your actual data)
data = {'monat': np.random.rand(24077), 'zaehlstelle': np.random.rand(24077), 'gesamt': np.random.randint(0, 100, size=24077)}
df = pd.DataFrame(data)

# One-Hot Encoding (replace this with your actual encoding method)
one_hot_encoded_data = pd.get_dummies(df[['monat', 'zaehlstelle']])

# Sample target variable (matching the number of rows in one_hot_encoded_data)
target = np.random.randint(0, 100, size=len(one_hot_encoded_data))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(one_hot_encoded_data, target, test_size=0.2, random_state=42)

# Create a Linear Regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
accuracy = model.score(X_test, y_test)
print(f'R-squared value of the model: {accuracy}')


R-squared value of the model: -3.950911247629918e-05


In [106]:
df

Unnamed: 0,monat,zaehlstelle,gesamt
0,0.819980,0.771345,10
1,0.158783,0.493706,52
2,0.539603,0.540253,97
3,0.942900,0.441541,42
4,0.793423,0.545784,91
...,...,...,...
24072,0.970002,0.186865,75
24073,0.589537,0.638911,42
24074,0.719064,0.291712,8
24075,0.755846,0.411124,89


In [108]:
# Umwandeln der 'monat' Spalte in One-Hot-Vektoren
encoder = OneHotEncoder(sparse_output=False)
monat_one_hot = encoder.fit_transform(df['monat'].values.reshape(-1,1))

# Umwandeln der One-Hot-Vektoren zu einem DataFrame
monat_df = pd.DataFrame(monat_one_hot, columns=encoder.get_feature_names(['monat']))

# Zusammenführen der DataFrames
df_encoded = pd.concat([df, monat_df], axis=1)

# Trennen in abhängige und unabhängige Variablen
y = df_encoded['gesamt']
X = df_encoded.drop(['gesamt', 'monat'], axis=1)

# Aufteilen der Daten in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Anpassen des Modells
model = LinearRegression()
model.fit(X_train, y_train)

# Vorhersage mit dem Modell
y_pred = model.predict(X_test)

AttributeError: 'OneHotEncoder' object has no attribute 'get_feature_names'