In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the data
data = pd.read_csv('/content/student_dropout_data.csv')

# Prepare the features and target
X = data.drop(['StudentID', 'DropoutStatus'], axis=1)
y = data['DropoutStatus']

# Encode categorical variables
X = pd.get_dummies(X, columns=['Gender', 'ParentEducationLevel'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': abs(model.coef_[0])
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Function to predict dropout for a new student
def predict_dropout(student_data):
    student_df = pd.DataFrame([student_data])
    student_encoded = pd.get_dummies(student_df, columns=['Gender', 'ParentEducationLevel'])
    student_encoded = student_encoded.reindex(columns=X.columns, fill_value=0)
    student_scaled = scaler.transform(student_encoded)
    prediction = model.predict(student_scaled)
    probability = model.predict_proba(student_scaled)[0][1]
    return prediction[0], probability

# Create a new student data point
new_student = {
    'Age': 14,
    'Gender': 'M',
    'Standard': 9,
    'AttendanceRate': 0.85,
    'AverageMarks': 65,
    'BehaviorIncidents': 2,
    'Extracurriculars': 1,
    'ParentEducationLevel': 'HighSchool'
}

dropout, prob = predict_dropout(new_student)
print(f"\nNew student dropout prediction: {'Yes' if dropout else 'No'}")
print(f"Probability of dropout: {prob:.2f}")

Accuracy: 0.72

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.99      0.84      2883
           1       0.54      0.03      0.05      1117

    accuracy                           0.72      4000
   macro avg       0.63      0.51      0.44      4000
weighted avg       0.67      0.72      0.62      4000


Feature Importance:
                              feature  importance
4                   BehaviorIncidents    0.275746
2                      AttendanceRate    0.275435
3                        AverageMarks    0.260624
12  ParentEducationLevel_MiddleSchool    0.039086
9      ParentEducationLevel_Doctorate    0.029816
8       ParentEducationLevel_Bachelor    0.026466
10    ParentEducationLevel_HighSchool    0.023766
11        ParentEducationLevel_Master    0.018782
6                            Gender_F    0.013384
7                            Gender_M    0.013384
0                                 Age    0.010873
1            

[link text](https://)