In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('/content/dataset.csv')

In [3]:
df.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)

In [4]:
X = df.drop(columns=['Target'])  # Features
y = df['Target']  # Target

In [5]:
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [7]:
y_encoded[df['Target'].isnull()] = -1

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.3, random_state=42)

In [10]:
model = LabelPropagation()

In [11]:
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)

  probabilities /= normalizer


In [13]:
y_pred_labels = encoder.inverse_transform(y_pred)

In [14]:
y_test_labels = encoder.inverse_transform(y_test[y_test != -1])

In [16]:
y_train_pred = model.predict(X_train)

In [18]:
print("Train Accuracy: ", accuracy_score(y_train, y_train_pred))

Train Accuracy:  1.0


In [21]:
print("\nClassification Report for Training Data:\n")
print(classification_report(y_test_labels, y_pred_labels))


Classification Report for Training Data:

              precision    recall  f1-score   support

     Dropout       0.73      0.65      0.69       441
    Enrolled       0.35      0.31      0.33       245
    Graduate       0.71      0.80      0.75       642

    accuracy                           0.66      1328
   macro avg       0.60      0.58      0.59      1328
weighted avg       0.65      0.66      0.65      1328



In [19]:
print("Test Accuracy: ", accuracy_score(y_test, y_pred))

Test Accuracy:  0.6573795180722891


In [30]:
precision_score(y_test, y_pred, average="weighted")

0.6991401381260441

In [31]:
recall_score(y_test, y_pred, average="weighted")

0.6831683168316832

In [32]:
f1_score(y_test, y_pred, average="weighted")

0.6902489075602619

In [15]:
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))

Classification Report:
               precision    recall  f1-score   support

     Dropout       0.73      0.65      0.69       441
    Enrolled       0.35      0.31      0.33       245
    Graduate       0.71      0.80      0.75       642

    accuracy                           0.66      1328
   macro avg       0.60      0.58      0.59      1328
weighted avg       0.65      0.66      0.65      1328



In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Feature and target preparation
features = df.drop(columns=['Target'])
target = df['Target']

import pandas as pd
import numpy as np

# Set a proportion of labeled data to be treated as unlabeled
unlabeled_fraction = 0.2  # 20% of labeled data will be "unlabeled"
labeled_data = df[df['Target'].notnull()]  # Labeled rows
unlabeled_data = df[df['Target'].isnull()]  # Unlabeled rows (initially empty)

# Randomly select a subset of labeled data to set as unlabeled
unlabeled_indices = labeled_data.sample(frac=unlabeled_fraction, random_state=42).index
df.loc[unlabeled_indices, 'Target'] = np.nan  # Set these rows' Target as NaN

# Re-split the data into labeled and unlabeled
labeled_data = df[df['Target'].notnull()]
unlabeled_data = df[df['Target'].isnull()]

# Separate features and targets
X_labeled = labeled_data.drop(columns=['Target'])
y_labeled = labeled_data['Target']
X_unlabeled = unlabeled_data.drop(columns=['Target'])

# Check the updated dataset
print(f"Labeled samples: {len(X_labeled)}, Unlabeled samples: {len(X_unlabeled)}")

# Split labeled data into training and testing sets
X_labeled = labeled_data.drop(columns=['Target'])
y_labeled = labeled_data['Target']
X_test, X_labeled, y_test, y_labeled = train_test_split(X_labeled, y_labeled, test_size=0.8, random_state=42)

# Unlabeled data
X_unlabeled = unlabeled_data.drop(columns=['Target'])

# Encode target variables
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(y_labeled)
y_test = label_encoder.transform(y_test)  # Encode test labels

# Scale features
scaler = StandardScaler()
X_labeled_scaled = scaler.fit_transform(X_labeled)
X_test_scaled = scaler.transform(X_test)

# Supervised models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
}

results = {}

# Train and evaluate supervised models
for name, model in models.items():
    model.fit(X_labeled_scaled, y_labeled)
    y_pred = model.predict(X_test_scaled)
    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        "Recall": recall_score(y_test, y_pred, average="weighted"),
        "F1-Score": f1_score(y_test, y_pred, average="weighted"),
    }

# Semi-Supervised Model
if X_unlabeled.empty:
    print("No unlabeled data found. Semi-supervised learning is not applicable.")
else:
    # Scale unlabeled data
    X_unlabeled_scaled = scaler.transform(X_unlabeled)

    # Semi-Supervised Model
    semi_supervised_model = LabelPropagation()
    X_combined = np.vstack((X_labeled_scaled, X_unlabeled_scaled))
    y_combined = np.concatenate((y_labeled, [-1] * len(X_unlabeled_scaled)))  # -1 for unlabeled targets

    # Train the semi-supervised model
    semi_supervised_model.fit(X_combined, y_combined)
    y_pred_semi = semi_supervised_model.predict(X_test_scaled)

    # Ensure consistent label encoding
    y_pred_semi = label_encoder.inverse_transform(y_pred_semi)
    y_pred_semi = label_encoder.transform(y_pred_semi)

    # Evaluate the semi-supervised model
    results["Semi-Supervised (Label Propagation)"] = {
        "Accuracy": accuracy_score(y_test, y_pred_semi),
        "Precision": precision_score(y_test, y_pred_semi, average="weighted"),
        "Recall": recall_score(y_test, y_pred_semi, average="weighted"),
        "F1-Score": f1_score(y_test, y_pred_semi, average="weighted"),
    }

# Display results\
results_df = pd.DataFrame(results).T
print(results_df)

Labeled samples: 3539, Unlabeled samples: 885
                                     Accuracy  Precision    Recall  F1-Score
Logistic Regression                  0.777935   0.758984  0.777935  0.761706
Random Forest                        0.779349   0.760285  0.779349  0.764204
Decision Tree                        0.683168   0.699140  0.683168  0.690249
Semi-Supervised (Label Propagation)  0.643564   0.634830  0.643564  0.638669


  probabilities /= normalizer
