# FMZ Übung 5: **Lineare und Nichtlineare Regression**

---
### Ziel der Übung
- Unterschied zwischen linearer und nichtlinearer Regression verstehen
- Modelle schätzen, interpretieren & visualisieren
- Modelle auf synthetische & reale Daten anwenden
- Modellgüte beurteilen (R², Residuenanalyse, Overfitting)
- Erweiterung: ML-Regressionsmethoden (Random Forest, SVR, Neural Net)


---
### Benötigte Pakete

```{r, eval=FALSE}
 # install.packages(c("randomForest","e1071","neuralnet"))
 library(ggplot2)
 library(neuralnet)
 library(randomForest)
 library(e1071)
```
---


In [None]:

 install.packages(c("randomForest","e1071","neuralnet"))
 library(ggplot2)
 library(neuralnet)
 library(randomForest)
 library(e1071)


## Aufgabe 1: **Einfache lineare Regression**

---
### Datensatz simulieren
```{r, eval=FALSE}
 # Simulate n points from the linear model y = 2 + 0.8 * x plus
 # standard-normal noise, with x drawn uniformly from [0, 10].
 set.seed(123)  # fixed seed so the random draws are reproducible
 n <- 500
 x <- runif(n, min = 0, max = 10)
 y <- 2 + 0.8 * x + rnorm(n, mean = 0, sd = 1)
 data_lin <- data.frame(x, y)
 head(data_lin)
```

---
### Plot der Daten
```{r, eval=FALSE}
 # Scatter plot of the simulated sample (uses the vectors x and y directly).
 plot(x, y)
```

---
### Einfache lineare Regression
```{r, eval=FALSE}
 # Least-squares fit of y on x, then the coefficient table, p-values and R².
 model_lin <- lm(y ~ x, data = data_lin)
 summary(model_lin)
```
---
### Plot des linearen Modells
```{r, eval=FALSE}
 # Base-graphics scatter plot with the fitted regression line on top.
 plot(data_lin$x, data_lin$y)
 abline(model_lin, col = 2, lwd = 2)

 # Same picture with ggplot; geom_smooth(method = "lm") draws the linear fit.
 ggplot(data_lin, aes(x, y)) +
   geom_point() +
   geom_smooth(method = "lm", se = FALSE, color = "blue") +
   labs(title = "Einfache lineare Regression")

 # Residual diagnostics: density histogram with a normal curve that uses the
 # residuals' own mean and sd. TRUE/FALSE are spelled out because T and F are
 # ordinary variables that can be reassigned.
 hist(residuals(model_lin), freq = FALSE, main = "Histogramm der Residuen")
 curve(dnorm(x, mean(residuals(model_lin)), sd(residuals(model_lin))),
       col = 2, lwd = 2, add = TRUE)

 # Normal Q-Q plot with reference line.
 qqnorm(residuals(model_lin))
 qqline(residuals(model_lin), col = "red", lwd = 2)

 # Shapiro-Wilk normality test on the residuals.
 shapiro.test(residuals(model_lin))
```
---

### Aufgaben

- Interpretieren Sie die Regressionskoeffizienten
- Was bedeutet der p-Wert?
- Ist das Modell gut geeignet (R²)?

---

In [None]:

 # Aufgabe 1: simulate y = 2 + 0.8 * x + N(0, 1) noise, fit a simple linear
 # regression and inspect the residuals.
 set.seed(123)  # fixed seed so the random draws are reproducible
 n <- 500
 x <- runif(n, 0, 10)
 y <- 2 + 0.8 * x + rnorm(n, 0, 1)
 data_lin <- data.frame(x, y)

 model_lin <- lm(y ~ x, data = data_lin)
 summary(model_lin)

 # Base-graphics scatter plot with the fitted regression line on top.
 plot(data_lin)
 abline(model_lin, col = 2, lwd = 2)

 # Same picture with ggplot.
 ggplot(data_lin, aes(x, y)) +
   geom_point() +
   geom_smooth(method = "lm", se = FALSE, color = "blue") +
   labs(title = "Einfache lineare Regression")

 # Residual diagnostics. TRUE/FALSE are spelled out because T and F are
 # ordinary variables that can be reassigned.
 hist(residuals(model_lin), freq = FALSE, main = "Histogramm der Residuen")
 curve(dnorm(x, mean(residuals(model_lin)), sd(residuals(model_lin))),
       col = 2, lwd = 2, add = TRUE)
 qqnorm(residuals(model_lin))
 qqline(residuals(model_lin), col = "red", lwd = 2)

 # Shapiro-Wilk normality test on the residuals.
 shapiro.test(residuals(model_lin))


## Aufgabe 2: **Nichtlineare Daten → Versagen der linearen Regression**

---
### Nichtlinearer Datensatz simulieren
```{r, eval=FALSE}
 # Simulate a sinusoidal signal y = 3 * sin(2 * x) with Gaussian noise
 # (sd = 0.3) on an equally spaced grid over [0, 5].
 set.seed(123)
 x <- seq(from = 0, to = 5, length.out = n)  # n comes from the earlier simulation
 y <- 3 * sin(2 * x) + rnorm(n, mean = 0, sd = 0.3)
 data_nonlin <- data.frame(x, y)
 head(data_nonlin)
```

---
### Einfache lineare Regression
```{r, eval=FALSE}
 # Fit a straight line to the sinusoidal data to show how poorly it fits.
 model_lin_bad <- lm(y ~ x, data = data_nonlin)
 summary(model_lin_bad)
```

---
### Plot
```{r, eval=FALSE}
 # Scatter plot of the nonlinear data with the linear fit drawn by geom_smooth().
 ggplot(data_nonlin, aes(x, y)) +
   geom_point() +
   geom_smooth(method = "lm", se = FALSE, color = "blue") +
   labs(title = "Lineare Regression auf nichtlinearen Daten")

 # Residual diagnostics. TRUE/FALSE are spelled out because T and F are
 # ordinary variables that can be reassigned.
 hist(residuals(model_lin_bad), freq = FALSE, main = "Histogramm der Residuen")
 curve(dnorm(x, mean(residuals(model_lin_bad)), sd(residuals(model_lin_bad))),
       col = 2, lwd = 2, add = TRUE)

 # Normal Q-Q plot with reference line.
 qqnorm(residuals(model_lin_bad))
 qqline(residuals(model_lin_bad), col = "red", lwd = 2)

 # Shapiro-Wilk normality test on the residuals.
 shapiro.test(residuals(model_lin_bad))
```
---

### Aufgaben
- Warum passt das Modell schlecht?
- Welche Struktur hat der Fehler?

---

In [None]:

 # Aufgabe 2: simulate sinusoidal data and fit a (misspecified) straight line.
 # NOTE(review): the exercise listing for this task draws the noise with
 # sd = 0.3, while this cell uses sd = 1 — confirm which value is intended.
 set.seed(123)
 x <- seq(0, 5, length.out = n)  # n comes from the Aufgabe 1 simulation
 y <- 3 * sin(2 * x) + rnorm(n, 0, 1)
 data_nonlin <- data.frame(x, y)
 head(data_nonlin)
 model_lin_bad <- lm(y ~ x, data = data_nonlin)
 summary(model_lin_bad)

 # Scatter plot with the linear fit drawn by geom_smooth().
 ggplot(data_nonlin, aes(x, y)) +
   geom_point() +
   geom_smooth(method = "lm", se = FALSE, color = "blue") +
   labs(title = "Lineare Regression auf nichtlinearen Daten")

 # Residual diagnostics. TRUE/FALSE are spelled out because T and F are
 # ordinary variables that can be reassigned.
 hist(residuals(model_lin_bad), freq = FALSE, main = "Histogramm der Residuen")
 curve(dnorm(x, mean(residuals(model_lin_bad)), sd(residuals(model_lin_bad))),
       col = 2, lwd = 2, add = TRUE)

 qqnorm(residuals(model_lin_bad))
 qqline(residuals(model_lin_bad), col = "red", lwd = 2)

 # Shapiro-Wilk normality test on the residuals.
 shapiro.test(residuals(model_lin_bad))


## Aufgabe 3: **Nichtlineare Regression - Polynommodell (`poly`)**

---
### Polynomregression
```{r, eval=FALSE}
 # Polynomial regression of degree `grad`, built with poly().
 grad <- 3
 model_poly <- lm(y ~ poly(x, grad), data = data_nonlin)
 summary(model_poly)
```

---
### Plot
```{r, eval=FALSE}
 # Scatter plot with the polynomial fit; the title reports the degree and the
 # R² of model_poly rounded to two decimals.
 r2_poly <- round(summary(model_poly)$r.squared, 2)
 ggplot(data_nonlin, aes(x, y)) +
   geom_point(alpha = .5) +
   geom_smooth(method = "lm", formula = y ~ poly(x, grad), se = FALSE, color = "blue") +
   labs(title = paste0("Polynomregression (Grad ", grad, ") - R²=", r2_poly))
```
---
### Aufgaben
- Probieren Sie unterschiedliche Grade 2, 4, 6, 8, 10 → Overfitting beobachten
---

In [None]:

# Aufgabe 3: fit and plot polynomial regressions of increasing degree to
# observe overfitting. The degrees follow the task text (2, 4, 6, 8, 10).
# The former par(mfrow = c(2, 2)) call was dropped: it only arranges base
# graphics and has no effect on ggplot output.
for (grad in c(2, 4, 6, 8, 10)) {
  model_poly <- lm(y ~ poly(x, grad), data = data_nonlin)
  # Auto-printing is disabled inside a loop, so the summary must be printed
  # explicitly (the ggplot object below needs print() for the same reason).
  print(summary(model_poly))

  print(
    ggplot(data_nonlin, aes(x, y)) +
      geom_point(alpha = .5) +
      geom_smooth(method = "lm", formula = y ~ poly(x, grad), se = FALSE, color = "blue") +
      labs(title = paste("Polynomregression (Grad", grad, ") - R² =",
                         round(summary(model_poly)$r.squared, 2)))
  )
}


## Aufgabe 4: **Nichtlineare Regression (`nls`)**


---
### Nichtlineare Regression
```{r, eval=FALSE}
 # Nonlinear least squares for y = a * sin(b * x); nls() iterates from the
 # starting values given in `start`.
 mod_NLS <- nls(
   y ~ a * sin(b * x),
   data = data_nonlin,
   start = list(a = 4, b = 1.5)
 )
 summary(mod_NLS)
```

---
### Plot
```{r, eval=FALSE}
 # Scatter plot of the data with the fitted nls curve on top. The curve uses
 # data_nonlin$x (not a global x) so points and line share the same abscissa
 # even if x has been overwritten in the meantime.
 plot(data_nonlin$x, data_nonlin$y)
 lines(data_nonlin$x, predict(mod_NLS), col = 2, lwd = 2)
```

---
### Aufgaben
- Ändern Sie die Anfangswerte der Parameter (`start`) im Befehl `nls`. Was fällt Ihnen auf?
---


In [None]:

 # Aufgabe 4: nonlinear least squares for y = a * sin(b * x); nls() iterates
 # from the starting values given in `start`.
 mod_NLS <- nls(y ~ a * sin(b * x), data = data_nonlin, start = list(a = 4, b = 1.5))
 summary(mod_NLS)

 # Data and fitted curve. The curve uses data_nonlin$x (not a global x) so
 # points and line share the same abscissa even if x has been overwritten.
 plot(data_nonlin$x, data_nonlin$y)
 lines(data_nonlin$x, predict(mod_NLS), col = 2, lwd = 2)


## Aufgabe 5: **Machine-Learning-Regressoren (Kurz-Demo)**


---
### R²-Funktion
```{r, eval=FALSE}
 # Coefficient of determination of the predictions y_pred, measured against
 # the vector y found in the calling workspace (y is not an argument).
 R2 <- function(y_pred) {
   ss_res <- sum((y_pred - y)^2)   # residual sum of squares
   ss_tot <- sum((y - mean(y))^2)  # total sum of squares around the mean
   1 - ss_res / ss_tot
 }
```

---
### Random Forest Regressor
```{r, eval=FALSE}
 # Random forest regression of y on x; the seed makes the fit reproducible.
 set.seed(123)
 rf <- randomForest(y ~ x, data_nonlin)
 rf_pred <- predict(rf)  # predict() without new data, as in the direct calls
 plot(x, y)
 lines(x, rf_pred, col = "blue", lwd = 2)
 R2(rf_pred)
```

---
### Support Vector Regression
```{r, eval=FALSE}
  <- svm(y~x,data_nonlin)
 lines(x,predict(svm),col="red",lwd=2)
 R2(predict(svm))
```

---
### Neuronales Netz
```{r, eval=FALSE}
 # Neural network regression of y on x with a single hidden layer of 2 neurons.
 nn <- neuralnet(y ~ x, data_nonlin, hidden = 2)
 nn_pred <- predict(nn, newdata = data_nonlin)
 lines(x, nn_pred, col = "green", lwd = 2)
 R2(nn_pred)
```
---
### Aufgaben
- Welcher Algorithmus passt am besten?
- Ändern Sie die Hyperparameter der ML-Regressoren (Anzahl der Bäume `ntree`, Kernelart `kernel`, Anzahl der Schichten und Neuronen `hidden`, usw.). Was fällt Ihnen auf?
---


In [None]:
# Coefficient of determination of the predictions y_pred, measured against
# the vector y found in the calling workspace (y is not an argument).
R2 <- function(y_pred) {
  ss_res <- sum((y_pred - y)^2)   # residual sum of squares
  ss_tot <- sum((y - mean(y))^2)  # total sum of squares around the mean
  1 - ss_res / ss_tot
}

# Aufgabe 5: fit three ML regressors to the sinusoidal data, overlay their
# predictions on one scatter plot and report R² for each.
# Each prediction vector is computed once and reused for the line and for R2().
set.seed(123)  # fixed seed for the randomised fits below

# Random forest (blue).
rf <- randomForest(y ~ x, data_nonlin)
pred_rf <- predict(rf)
plot(x, y)
lines(x, pred_rf, col = "blue", lwd = 2)
R2(pred_rf)

# Support vector regression (red). Stored as mod_svm so the svm() function is
# not masked by a variable of the same name.
mod_svm <- svm(y ~ x, data_nonlin)
pred_svm <- predict(mod_svm)
lines(x, pred_svm, col = "red", lwd = 5)
R2(pred_svm)

# Neural network with one hidden layer of 2 neurons (green).
nn <- neuralnet(y ~ x, data_nonlin, hidden = 2)
pred_nn <- predict(nn, newdata = data_nonlin)
lines(x, pred_nn, col = "green", lwd = 5)
R2(pred_nn)


## Zusätzliche Aufgabe

---
### Laden des Fußgängerdynamik-Datensatzes
 ```{r, eval=FALSE}
 # Load the pedestrian dataset from GitHub (first row holds the column names).
 set.seed(123)
 ped_url <- "https://raw.githubusercontent.com/antoinetordeux/Datasets/refs/heads/main/ped_data.txt"
 ped <- read.table(ped_url, header = TRUE)
 head(ped)
 # attach() exposes the columns as bare names for code that refers to
 # Spacing / Speed without the `ped$` prefix.
 attach(ped)
 ```

### Plotfunktion
 ```{r, eval=FALSE}
 # Plot Speed against Spacing for the `ped` data, overlay the predictions of a
 # fitted model and annotate the plot with its R².
 #
 # algo:  fitted model object that has a predict() method accepting `ped`
 # title: main title of the plot
 # Returns (invisibly, via print) the result of summary(algo).
 #
 # The columns are read from `ped` explicitly, so the function does not depend
 # on attach(ped). R² is computed as 1 - SS_res / SS_tot, the same definition
 # as the R2() helper of Aufgabe 5; the variance-based form used before ignores
 # any systematic bias of the predictions.
 plot_res <- function(algo, title) {
   spacing <- ped$Spacing
   speed <- ped$Speed
   plot(spacing, speed, xlab = 'Spacing', ylab = 'Speed', main = title)
   # as.numeric() flattens matrix-shaped predictions to a plain vector.
   y_pred <- as.numeric(predict(algo, ped))
   ord <- order(spacing)  # draw the prediction line from left to right
   lines(spacing[ord], y_pred[ord], col = 2, lwd = 2)
   r2 <- 1 - sum((y_pred - speed)^2) / sum((speed - mean(speed))^2)
   legend("bottomright", paste("R² =", round(r2, 2)), bty = 'n')
   print(summary(algo))
 }
```


---
### Einfache lineare Regression
 ```{r, eval=FALSE}
 # Simple linear regression of Speed on Spacing; the 2 x 2 layout lets the
 # following base-graphics plots share one page.
 m_ped <- lm(Speed ~ Spacing, data = ped)
 par(mfrow = c(2, 2))
 plot_res(m_ped, "Einfache lineare Regression")
```

---
### Nichtlineare Regression
```{r, eval=FALSE}
 # Nonlinear speed-spacing model evaluated at spacing x with parameters
 # v0, l0, T and eps.
 # NOTE(review): presumably a smoothed version of a bounded linear
 # speed-spacing relation with smoothing parameter eps — confirm.
 # The name was written `nl-model`, which R parses as a subtraction; it is
 # now nl_model, matching the call in nls() below.
 # Inside this function the parameter T shadows the built-in T.
 nl_model <- function(x, v0, l0, T, eps) {
   eps / T * log(1 + exp(-log(1 + exp(-(x - l0 - v0 * T) / eps)) + v0 * T / eps))
 }

 # data = ped makes the fit independent of attach(ped).
 m_ped_NLS <- nls(Speed ~ nl_model(Spacing, v0, l0, T, eps),
                  data = ped,
                  start = list(v0 = 1.2, l0 = 0.3, T = .8, eps = .01))
 plot_res(m_ped_NLS, "Nichtlineares Modell")
```

---
### Multiple lineare Regression
 ```{r, eval=FALSE}
 # Multiple linear regression of Speed on four predictors.
 m_ped_multi <- lm(
   Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration,
   data = ped
 )
 plot_res(m_ped_multi, "Multiple lineare Regression")
```

---
### Random Forest Regressor
 ```{r, eval=FALSE}
 # Random forest regression of Speed on the same four predictors.
 m_ped_rf <- randomForest(
   Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration,
   data = ped
 )
 plot_res(m_ped_rf, "Random Forest")
```

---
### Support Vector Regression
 ```{r, eval=FALSE}
 # Support vector regression of Speed on the same four predictors.
 m_ped_svm <- svm(
   Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration,
   data = ped
 )
 plot_res(m_ped_svm, "SVM")
```

---
### Neuronales Netz
 ```{r, eval=FALSE}
 # Neural network (one hidden layer, 2 neurons) on the same four predictors.
 m_nn <- neuralnet(
   Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration,
   data = ped,
   hidden = 2
 )
 plot_res(m_nn, "Neural Net")
```

---
### Aufgabe
- Welcher Algorithmus passt am besten?
- Implementieren Sie einfache Regressionen zwischen `Speed` und `Spacing` mit dem Random-Forest-Algorithmus, der SVM und dem neuronalen Netzwerk. Was fällt Ihnen auf?


In [None]:

 # Additional task: compare several regressors on the pedestrian dataset.
 # The data frame is passed explicitly everywhere (data = ped, ped$...), so
 # attach(ped) is no longer needed and the search path is left untouched.
 ped <- read.table("https://raw.githubusercontent.com/antoinetordeux/Datasets/refs/heads/main/ped_data.txt", header=TRUE)
 head(ped)

 # Plot Speed against Spacing, overlay the predictions of a fitted model and
 # annotate the plot with its R².
 #
 # algo:  fitted model object that has a predict() method accepting `ped`
 # title: main title of the plot
 # Returns (invisibly, via print) the result of summary(algo).
 #
 # R² is computed as 1 - SS_res / SS_tot, the same definition as the R2()
 # helper of Aufgabe 5; the variance-based form used before ignores any
 # systematic bias of the predictions.
 plot_res <- function(algo, title) {
   spacing <- ped$Spacing
   speed <- ped$Speed
   plot(spacing, speed, xlab = 'Spacing', ylab = 'Speed', main = title)
   # as.numeric() flattens matrix-shaped predictions to a plain vector.
   y_pred <- as.numeric(predict(algo, ped))
   ord <- order(spacing)  # draw the prediction line from left to right
   lines(spacing[ord], y_pred[ord], col = 2, lwd = 2)
   r2 <- 1 - sum((y_pred - speed)^2) / sum((speed - mean(speed))^2)
   legend("bottomright", paste("R² =", round(r2, 2)), bty = 'n')
   print(summary(algo))
 }

 set.seed(123)  # fixed seed for the randomised fits below

 # Simple linear regression; the 2 x 2 layout applies to the following plots.
 m_ped <- lm(Speed ~ Spacing, data = ped)
 par(mfrow = c(2, 2))
 plot_res(m_ped, "Einfache lineare Regression")

 # Multiple linear regression on four predictors.
 m_ped_multi <- lm(Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration, data = ped)
 plot_res(m_ped_multi, "Multiple lineare Regression")

 # Nonlinear speed-spacing model evaluated at spacing x with parameters
 # v0, l0, T and eps.
 # NOTE(review): presumably a smoothed version of a bounded linear
 # speed-spacing relation with smoothing parameter eps — confirm.
 # Inside this function the parameter T shadows the built-in T.
 nl_model <- function(x, v0, l0, T, eps) {
   eps / T * log(1 + exp(-log(1 + exp(-(x - l0 - v0 * T) / eps)) + v0 * T / eps))
 }
 m_ped_NLS <- nls(Speed ~ nl_model(Spacing, v0, l0, T, eps),
                  data = ped,
                  start = list(v0 = 1.2, l0 = 0.3, T = .8, eps = .01))
 plot_res(m_ped_NLS, "Nichtlineares Modell")

 # Random forest, support vector regression and neural network on the same
 # four predictors.
 m_ped_rf <- randomForest(Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration, data = ped)
 plot_res(m_ped_rf, "Random Forest")

 m_ped_svm <- svm(Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration, data = ped)
 plot_res(m_ped_svm, "SVM")

 m_nn <- neuralnet(Speed ~ Spacing + Spacing_pred + Speed_pred + Acceleration, data = ped, hidden = 2)
 plot_res(m_nn, "Neural Net")
