In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
# The workbook name is a bare relative path, so it is resolved against the
# current working directory of the notebook kernel.
file_path = 'data_with_distances.xlsx'  # Make sure to have the correct path
# Reads the workbook into a DataFrame (pandas reads the first sheet by default).
updated_data = pd.read_excel(file_path)

# Prepare the data for the machine learning models.
# `features` holds the independent variables; `target` is the binary
# enrolment outcome taken from the 'Immatrikulation' column.
features = updated_data[['Semester', 'InfoVeranstaltung', 'Note HZB', 'Note Bachelor', 'ECTS Bachelor', 'Distance_to_71034']]

# Work on an explicit copy so the assignments below never write through to
# `updated_data` (and never trigger SettingWithCopyWarning).
features = features.copy()

# Encode the yes/no column as 1/0. Values other than 'ja'/'nein' become NaN
# and are imputed together with the other missing values further down.
# Plain column assignment replaces the column, so it does not keep the
# original object dtype the way `.loc[:, col] = ...` can.
features['InfoVeranstaltung'] = features['InfoVeranstaltung'].map({'ja': 1, 'nein': 0})

# Define the target variable (Immatrikulation): 'Ja' -> 1, 'Nein' -> 0.
# NOTE(review): any other spelling maps to NaN, which scikit-learn rejects
# at fit time - confirm the column only contains 'Ja'/'Nein'.
target = updated_data['Immatrikulation'].map({'Ja': 1, 'Nein': 0})

# Handle missing values by filling them with the mean value of each column.
# The columns are coerced to numeric FIRST (unparseable cells become NaN) and
# the means are taken from that coerced frame. Taking the means from the
# un-coerced frame either raises a TypeError on non-numeric cells or drops
# such columns from the result, leaving their NaNs unfilled.
features_numeric = features.apply(pd.to_numeric, errors='coerce')
features_filled = features_numeric.fillna(features_numeric.mean())

# Hold out 30 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_filled,
    target,
    test_size=0.3,
    random_state=42,
)

# Standardize the numerical features. The scaler learns its statistics from
# the training rows only and then applies them to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Logistic Regression with GridSearchCV.
# The grid crosses regularisation strength, penalty type and solver; both
# listed solvers accept l1 as well as l2, so every combination is valid.
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # solvers that support l1 and l2 penalties
}

# max_iter is raised above the default of 100 so that saga can converge for
# the weakly regularised settings; otherwise the grid compares unconverged
# fits. random_state fixes the data shuffling done by liblinear/saga, which
# makes the search reproducible like the other seeded estimators in this file.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_scaled, y_train)

# Best parameters for Logistic Regression (the estimator is refit on the
# full training set by GridSearchCV).
best_lr_model = grid_search_lr.best_estimator_
print("Best Parameters for Logistic Regression:", grid_search_lr.best_params_)

# Make predictions on the held-out, scaled test set
y_pred_lr = best_lr_model.predict(X_test_scaled)

# Evaluate the model's performance. Precision and recall are stored but not
# printed in this section.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr)
lr_recall = recall_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy Score:", lr_accuracy)
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

# Create and visualize the confusion matrix for Logistic Regression.
# Label order follows the 0/1 target encoding: 0 -> 'Nein', 1 -> 'Ja'.
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)
ConfusionMatrixDisplay(conf_matrix_lr, display_labels=['Nein', 'Ja']).plot(cmap='viridis')
plt.title('Confusion Matrix for Logistic Regression (Tuned)')
plt.show()

# Random Forest: exhaustive 5-fold search over forest size, tree depth,
# split threshold and the number of features tried per split.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2'],
)

rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
)
# The forest is trained on the unscaled features.
grid_search_rf.fit(X_train, y_train)

# Winning configuration, refit on the whole training partition
best_rf_model = grid_search_rf.best_estimator_
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

# Predict on the (unscaled) hold-out rows
y_pred_rf = best_rf_model.predict(X_test)

# Accuracy, precision and recall, computed in that order
rf_accuracy, rf_precision, rf_recall = (
    metric(y_test, y_pred_rf)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("Random Forest Accuracy Score:", rf_accuracy)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

# Confusion matrix plot; labels follow the 0/1 target encoding
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
rf_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_rf,
    display_labels=['Nein', 'Ja'],
)
rf_display.plot(cmap='viridis')
plt.title('Confusion Matrix for Random Forest (Tuned)')
plt.show()

# Pair every feature name with the importance reported by the tuned forest
# and order the table from most to least important.
feature_importance_rf = pd.DataFrame(
    {
        'Feature': features_filled.columns,
        'Importance': best_rf_model.feature_importances_,
    }
)
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances
plt.figure(figsize=(10, 6))
sns.barplot(
    data=feature_importance_rf,
    x='Importance',
    y='Feature',
    palette='viridis',
)
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importance (Tuned)')
plt.tight_layout()
plt.show()

# Gradient Boosting: 5-fold grid search over the number of boosting stages,
# the learning rate and the depth of the individual trees.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

gb = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    cv=5,
    scoring='accuracy',
)
# This model is trained on the standardized features.
grid_search_gb.fit(X_train_scaled, y_train)

# Winning configuration, refit on the whole training partition
best_gb_model = grid_search_gb.best_estimator_
print("Best Parameters for Gradient Boosting:", grid_search_gb.best_params_)

# Predict on the standardized hold-out rows
y_pred_gb = best_gb_model.predict(X_test_scaled)

# Accuracy, precision and recall, computed in that order
gb_accuracy, gb_precision, gb_recall = (
    metric(y_test, y_pred_gb)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("Gradient Boosting Accuracy Score:", gb_accuracy)
print("Gradient Boosting Classification Report:\n", classification_report(y_test, y_pred_gb))

# Confusion matrix plot; labels follow the 0/1 target encoding
conf_matrix_gb = confusion_matrix(y_test, y_pred_gb)
gb_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_gb,
    display_labels=['Nein', 'Ja'],
)
gb_display.plot(cmap='viridis')
plt.title('Confusion Matrix for Gradient Boosting')
plt.show()

# K-Nearest Neighbors: 5-fold grid search over neighbourhood size, vote
# weighting and the Minkowski exponent.
param_grid_knn = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # 1 = Manhattan distance, 2 = Euclidean distance
)

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
)
# Distances are computed on the standardized features.
grid_search_knn.fit(X_train_scaled, y_train)

# Winning configuration, refit on the whole training partition
best_knn_model = grid_search_knn.best_estimator_
print("Best Parameters for KNN:", grid_search_knn.best_params_)

# Predict on the standardized hold-out rows
y_pred_knn = best_knn_model.predict(X_test_scaled)

# Accuracy, precision and recall, computed in that order
knn_accuracy, knn_precision, knn_recall = (
    metric(y_test, y_pred_knn)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("KNN Accuracy Score:", knn_accuracy)
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))

# Confusion matrix plot; labels follow the 0/1 target encoding
conf_matrix_knn = confusion_matrix(y_test, y_pred_knn)
knn_display = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix_knn,
    display_labels=['Nein', 'Ja'],
)
knn_display.plot(cmap='viridis')
plt.title('Confusion Matrix for KNN')
plt.show()

# Support Vector Machine: 5-fold grid search over the regularisation
# strength, the kernel and the kernel coefficient.
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# probability=True enables predict_proba on the fitted model.
svm = SVC(probability=True, random_state=42)
grid_search_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid_svm,
    cv=5,
    scoring='accuracy',
)
# The SVM is trained on the standardized features.
grid_search_svm.fit(X_train_scaled, y_train)

# Winning configuration, refit on the whole training partition
best_svm_model = grid_search_svm.best_estimator_
print("Best Parameters for SVM:", grid_search_svm.best_params_)

# Predict on the standardized hold-out rows
y_pred_svm = best_svm_model.predict(X_test_scaled)

# Accuracy and precision on the hold-out rows.
# NOTE(review): unlike the other models, no recall, report or confusion
# matrix is visible for the SVM here - the cell appears to be cut off.
svm_accuracy, svm_precision = (
    metric(y_test, y_pred_svm)
    for metric in (accuracy_score, precision_score)
)

SyntaxError: incomplete input (2270925483.py, line 200)