# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, precision_score, recall_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold

# Load the dataset
file_path = 'data_with_distances.xlsx'
updated_data = pd.read_excel(file_path)

# Prepare the data for machine learning model
features = updated_data[['Semester', 'InfoVeranstaltung', 'Note HZB', 'Note Bachelor', 'ECTS Bachelor', 'Distance_to_71034']]
features = features.copy()
# Encode the yes/no flag as 1/0; any other value becomes NaN and is imputed below.
features['InfoVeranstaltung'] = features['InfoVeranstaltung'].map({'ja': 1, 'nein': 0})
target = updated_data['Immatrikulation'].map({'Ja': 1, 'Nein': 0})

# Coerce to numeric BEFORE computing any mean: the mean must be taken over the
# coerced columns, otherwise non-numeric cells can break or distort it.
features_numeric = features.apply(lambda x: pd.to_numeric(x, errors='coerce'))

# Fully imputed frame, kept for the exploratory plots further down
# (correlation heatmap, feature-importance labels).
features_filled = features_numeric.fillna(features_numeric.mean())

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(features_numeric, target, test_size=0.3, random_state=42)

# Impute with statistics learned on the training split only, so that no
# information from the held-out rows leaks into the training data.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Standardize the features (scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to plot learning curves
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic plots for an estimator using ``learning_curve``.

    The panels are: training / cross-validation score versus number of
    training examples, fit time versus number of training examples, and
    cross-validation score versus fit time.

    Args:
        estimator: Estimator passed unchanged to ``learning_curve``.
        title: Title of the first (learning curve) panel.
        X: Feature data passed to ``learning_curve``.
        y: Target data passed to ``learning_curve``.
        axes: Indexable collection of at least three matplotlib axes. When
            ``None``, a new 1x3 figure of size (20, 5) is created.
        ylim: Optional ``(ymin, ymax)`` pair applied to the first panel.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Parallelism setting forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    # Aggregate over the cross-validation folds (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score.
    # Measured fit times are not guaranteed to grow with the training size, so
    # order the points by fit time; otherwise the line and the shaded band can
    # fold back on themselves.
    time_order = np.argsort(fit_times_mean)
    fit_times_sorted = fit_times_mean[time_order]
    scores_mean_sorted = test_scores_mean[time_order]
    scores_std_sorted = test_scores_std[time_order]
    axes[2].grid()
    axes[2].plot(fit_times_sorted, scores_mean_sorted, 'o-')
    axes[2].fill_between(fit_times_sorted, scores_mean_sorted - scores_std_sorted,
                         scores_mean_sorted + scores_std_sorted, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

# Pairwise correlation of the imputed features, shown as an annotated heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=features_filled.corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Feature Correlation Heatmap')
plt.show()

# Per-model metrics, keyed by model name; populated by evaluate_model()
model_results = dict()

# Function to evaluate and store model results
def evaluate_model(model, X_test, y_test, model_name, X_curve=None, y_curve=None):
    """Score a fitted classifier on a test split, then record and plot the results.

    Computes accuracy, precision, recall and ROC AUC, stores them (together
    with the ROC curve points) in the module-level ``model_results`` dict
    under ``model_name``, prints a classification report, and shows a
    confusion matrix and learning-curve plots.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features, in the same representation the model was
            fitted on.
        y_test: True test labels (0 = 'Nein', 1 = 'Ja').
        model_name: Key used in ``model_results`` and in printed/plot titles.
        X_curve: Features used for the learning curve. Defaults to the
            module-level ``X_train_scaled``. Pass the unscaled training data
            for models that were tuned on unscaled features.
        y_curve: Labels used for the learning curve. Defaults to the
            module-level ``y_train``.
    """
    # Keep the historical behaviour when no learning-curve data is supplied.
    if X_curve is None:
        X_curve = X_train_scaled
    if y_curve is None:
        y_curve = y_train

    y_pred = model.predict(X_test)
    # Column 1 holds the probability of the positive class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    model_results[model_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc,
        'fpr': fpr,
        'tpr': tpr
    }

    print(f"{model_name} Accuracy Score: {accuracy}")
    print(f"{model_name} Classification Report:\n", classification_report(y_test, y_pred))

    # Confusion Matrix
    # Fix the label order explicitly so the matrix is always 2x2 and lines up
    # with the two display labels, even if a class is absent from y_test/y_pred.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    ConfusionMatrixDisplay(conf_matrix, display_labels=['Nein', 'Ja']).plot(cmap='viridis')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()

    # Learning Curve
    plot_learning_curve(model, f'Learning Curve for {model_name}', X_curve, y_curve)
    plt.show()

# Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
# random_state pins the solvers' data shuffling so runs are reproducible, like
# the other seeded models in this script. The larger max_iter gives 'saga'
# room to converge instead of stopping at the default iteration cap.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
grid_search_lr = GridSearchCV(log_reg, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train_scaled, y_train)
best_lr_model = grid_search_lr.best_estimator_
print("Best Parameters for Logistic Regression:", grid_search_lr.best_params_)
evaluate_model(best_lr_model, X_test_scaled, y_test, "Logistic Regression")

# Random Forest: 5-fold grid search on the unscaled features
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2'],
)
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='accuracy', cv=5)
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_
print("Best Parameters for Random Forest:", grid_search_rf.best_params_)
evaluate_model(model=best_rf_model, X_test=X_test, y_test=y_test, model_name="Random Forest")

# Rank the features by the importance reported by the best forest
feature_importance_rf = pd.DataFrame({
    'Feature': features_filled.columns,
    'Importance': best_rf_model.feature_importances_,
})
feature_importance_rf = feature_importance_rf.sort_values(by='Importance', ascending=False)

# Horizontal bar chart, most important feature first
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    data=feature_importance_rf,
    palette='viridis',
)
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Gradient Boosting: 5-fold grid search on the scaled features
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)
gb = GradientBoostingClassifier(random_state=42)
grid_search_gb = GridSearchCV(estimator=gb, param_grid=param_grid_gb, scoring='accuracy', cv=5)
grid_search_gb.fit(X_train_scaled, y_train)
best_gb_model = grid_search_gb.best_estimator_
print("Best Parameters for Gradient Boosting:", grid_search_gb.best_params_)
evaluate_model(model=best_gb_model, X_test=X_test_scaled, y_test=y_test, model_name="Gradient Boosting")

# K-Nearest Neighbors: 5-fold grid search on the scaled features
param_grid_knn = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    p=[1, 2],
)
knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(estimator=knn, param_grid=param_grid_knn, scoring='accuracy', cv=5)
grid_search_knn.fit(X_train_scaled, y_train)
best_knn_model = grid_search_knn.best_estimator_
print("Best Parameters for KNN:", grid_search_knn.best_params_)
evaluate_model(model=best_knn_model, X_test=X_test_scaled, y_test=y_test, model_name="KNN")

# Support Vector Machine: 5-fold grid search on the scaled features
param_grid_svm = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)
# probability=True is required because evaluate_model() calls predict_proba
svm = SVC(probability=True, random_state=42)
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, scoring='accuracy', cv=5)
grid_search_svm.fit(X_train_scaled, y_train)
best_svm_model = grid_search_svm.best_estimator_
print("Best Parameters for SVM:", grid_search_svm.best_params_)
evaluate_model(model=best_svm_model, X_test=X_test_scaled, y_test=y_test, model_name="SVM")

# LightGBM: 5-fold grid search on the unscaled features
param_grid_lgbm = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    num_leaves=[7, 15, 31],  # adjusted values
)
lgbm = LGBMClassifier(random_state=42, min_child_samples=1)
grid_search_lgbm = GridSearchCV(estimator=lgbm, param_grid=param_grid_lgbm, scoring='accuracy', cv=5)
grid_search_lgbm.fit(X_train, y_train)
best_lgbm_model = grid_search_lgbm.best_estimator_
print("Best Parameters for LightGBM:", grid_search_lgbm.best_params_)
evaluate_model(model=best_lgbm_model, X_test=X_test, y_test=y_test, model_name="LightGBM")

# Soft-voting ensemble of the tuned models (the tuned KNN model is not included)
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lr', best_lr_model),
        ('rf', best_rf_model),
        ('gb', best_gb_model),
        ('svm', best_svm_model),
        ('lgbm', best_lgbm_model),
    ],
)
# The ensemble is trained and evaluated on the scaled features
voting_clf.fit(X_train_scaled, y_train)
evaluate_model(model=voting_clf, X_test=X_test_scaled, y_test=y_test, model_name="Voting Classifier")

# Model Comparison
# Grouped bar chart: one cluster per model, one bar per metric. Drawing every
# metric at the same x position would stack the bars on top of each other and
# hide all but the tallest/last one, so each metric gets its own offset.
plt.figure(figsize=(12, 6))
metrics = ['accuracy', 'precision', 'recall', 'roc_auc']
model_names = list(model_results)
group_positions = np.arange(len(model_names))
bar_width = 0.8 / len(metrics)  # leave a 20% gap between neighbouring clusters
for metric_index, metric in enumerate(metrics):
    values = [model_results[name][metric] for name in model_names]
    # Centre the cluster of bars on the model's tick position.
    offset = (metric_index - (len(metrics) - 1) / 2) * bar_width
    plt.bar(group_positions + offset, values, width=bar_width, label=metric)

plt.title('Model Comparison')
plt.xlabel('Models')
plt.ylabel('Score')
plt.legend()
plt.xticks(group_positions, model_names, rotation=45)
plt.tight_layout()
plt.show()

# Overlay the ROC curve of every evaluated model on a single figure
plt.figure(figsize=(10, 8))
for model_name, results in model_results.items():
    curve_label = f'{model_name} (AUC = {results["roc_auc"]:.2f})'
    plt.plot(results['fpr'], results['tpr'], label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Hyperparameter tuning visualization for Random Forest
# One scatter panel per tuned parameter: mean cross-validation score of every
# grid candidate against the value that candidate used for the parameter.
rf_results = grid_search_rf.cv_results_
plt.figure(figsize=(15, 5))

plt.subplot(131)
plt.scatter(rf_results['param_n_estimators'], rf_results['mean_test_score'])
plt.xlabel('n_estimators')
plt.ylabel('Mean test score')
plt.title('Effect of n_estimators on RF performance')

plt.subplot(132)
# The max_depth grid contains None (unlimited depth), which cannot be placed on
# a numeric axis. Plot the values as categorical labels so the unlimited-depth
# candidates are shown alongside the others.
max_depth_labels = [str(depth) for depth in rf_results['param_max_depth']]
plt.scatter(max_depth_labels, rf_results['mean_test_score'])
plt.xlabel('max_depth')
plt.ylabel('Mean test score')
plt.title('Effect of max_depth on RF performance')

plt.subplot(133)
plt.scatter(rf_results['param_min_samples_split'], rf_results['mean_test_score'])
plt.xlabel('min_samples_split')
plt.ylabel('Mean test score')
plt.title('Effect of min_samples_split on RF performance')

plt.tight_layout()
plt.show()