In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Path to the student-employability dataset (a CSV file)
file_path = 'Student-Employability-Datasets.csv'

# Load the CSV and discard the student-name column, which is not needed
df = pd.read_csv(file_path).drop(columns="Name of Student")

print(f'Shape of dataframe: {df.shape}')
# Preview the first rows of the DataFrame
df.head()

Shape of dataframe: (2982, 9)


Unnamed: 0,GENERAL APPEARANCE,MANNER OF SPEAKING,PHYSICAL CONDITION,MENTAL ALERTNESS,SELF-CONFIDENCE,ABILITY TO PRESENT IDEAS,COMMUNICATION SKILLS,Student Performance Rating,CLASS
0,4,5,4,5,5,5,5,5,Employable
1,4,4,4,4,4,4,3,5,Employable
2,4,3,3,3,3,3,2,5,LessEmployable
3,3,3,3,2,3,3,3,5,LessEmployable
4,4,4,3,3,4,4,3,5,Employable


In [2]:
### ALL FUNCTIONS
# Function to find rows with identical feature values but different outcomes
def find_inconsistent_rows(df_small_dataset, outcome_column):
    """Return the rows whose feature values also occur with a different outcome.

    Rows are grouped on every column except ``outcome_column``. A group is
    considered inconsistent when it contains more than one distinct value in
    ``outcome_column``.

    Parameters
    ----------
    df_small_dataset : pandas.DataFrame
        Frame containing the feature columns and the outcome column.
    outcome_column : str
        Name of the outcome (label) column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_small_dataset`` that belong to an inconsistent group.
    """
    # Every column other than the outcome is treated as a feature.
    feature_columns = df_small_dataset.columns.drop(outcome_column).tolist()

    # Group rows that share identical values in all feature columns.
    # (No diagnostic printing here: the function only computes and returns.)
    grouped = df_small_dataset.groupby(feature_columns)

    # Keep only the groups that contain more than one distinct outcome.
    return grouped.filter(lambda group: group[outcome_column].nunique() > 1)


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Parameters
    ----------
    x : float or numpy.ndarray
        Input value(s).

    Returns
    -------
    float or numpy.ndarray
        Element-wise sigmoid of ``x``, in the closed interval [0, 1].
    """
    # log(1 + exp(-x)) is computed stably by logaddexp(0, -x); the naive
    # np.exp(-x) overflows (with a RuntimeWarning) for strongly negative x.
    return np.exp(-np.logaddexp(0, -x))

# Function of logistic regression from scratch
def log_regression(X, y, learning_rate, iterations):
    """Fit binary logistic regression with batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with features on axis 0 and samples on axis 1,
        i.e. shape (n_features, n_samples).
    y : numpy.ndarray
        0/1 labels broadcastable against the (1, n_samples) activations.
    learning_rate : float
        Step size of each gradient-descent update.
    iterations : int
        Number of gradient-descent steps to run.

    Returns
    -------
    W : numpy.ndarray
        Learned weights, shape (n_features, 1).
    B : float
        Learned bias.
    cost_list : list
        Cross-entropy cost recorded at every iteration.
    """
    m = X.shape[1]  # number of samples
    n = X.shape[0]  # number of features
    W = np.zeros((n, 1))
    B = 0
    cost_list = []

    # Integer reporting interval: roughly ten progress lines per run, and
    # never zero (a float modulus breaks when iterations % 10 != 0).
    report_every = max(1, iterations // 10)

    for i in range(iterations):

        Z = np.dot(W.T, X) + B
        A = sigmoid(Z)

        # Cross-entropy written in terms of the logits:
        #   -[y*log(A) + (1-y)*log(1-A)] == log(1 + exp(Z)) - y*Z
        # This stays finite when A saturates to exactly 0 or 1, where the
        # log(A) form yields inf/NaN.
        cost = (1/m)*np.sum(np.logaddexp(0, Z) - y*Z)

        # Gradients of the cost with respect to W and B
        dW = (1/m)*np.dot(A - y, X.T)
        dB = (1/m)*np.sum(A - y)

        W = W - learning_rate*dW.T
        B = B - learning_rate*dB

        # Keep the full cost history for later inspection
        cost_list.append(cost)

        if i % report_every == 0:
            print("cost after ", i, "iteration is : ", cost)

    return W, B, cost_list


def predict(x, W, B):
    """Classify samples with a fitted logistic-regression model.

    Computes ``sigmoid(W.T @ x + B)`` and labels a sample 1 when that
    probability is strictly greater than 0.5, otherwise 0.

    Returns an int64 numpy array of 0/1 predictions.
    """
    probabilities = sigmoid(np.dot(W.T, x) + B)
    return np.array(probabilities > 0.5, dtype='int64')
    
    
def accuracy(y_pred, y_real):
    """Percentage of predictions that match the true 0/1 labels.

    ``y_real`` must be 2-D with the samples on axis 1; the number of
    mismatches is the sum of absolute differences between the two arrays.
    """
    sample_count = y_real.shape[1]
    mismatches = np.sum(np.abs(y_pred - y_real))
    error_rate = (1/sample_count) * mismatches
    return (1 - error_rate) * 100


# Function to find and remove rows with identical feature values but different outcomes
def remove_inconsistent_rows(df, outcome_column):
    """Split a frame into label-consistent rows and label-inconsistent rows.

    Rows are grouped on every column except ``outcome_column``. A group is
    inconsistent when it contains more than one distinct outcome value; all
    rows of such groups are removed from the cleaned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the outcome column.
    outcome_column : str
        Name of the outcome (label) column.

    Returns
    -------
    df_cleaned : pandas.DataFrame
        ``df`` without the rows of inconsistent groups.
    inconsistent_groups : pandas.DataFrame
        The rows that were removed.
    """
    # Every column other than the outcome is treated as a feature.
    feature_columns = df.columns.drop(outcome_column).tolist()

    # Identify rows by position rather than by index label: dropping by label
    # would also remove consistent rows whenever the index has duplicates.
    positional = df.reset_index(drop=True)
    flagged = positional.groupby(feature_columns).filter(
        lambda group: group[outcome_column].nunique() > 1
    )

    # Boolean mask over row positions; True marks an inconsistent row.
    is_inconsistent = np.zeros(len(df), dtype=bool)
    is_inconsistent[np.asarray(flagged.index, dtype=np.intp)] = True

    inconsistent_groups = df[is_inconsistent]
    df_cleaned = df[~is_inconsistent]

    return df_cleaned, inconsistent_groups


In [3]:
# Split df on the 'CLASS' label: df_cleaned keeps the rows whose feature values
# always map to one class; inconsistent_rows holds the rows that were removed.
df_cleaned, inconsistent_rows = remove_inconsistent_rows(df, 'CLASS')

In [4]:
# Rows and columns remaining after the inconsistent rows were removed
df_cleaned.shape

(2246, 9)

In [5]:
# Preview the cleaned frame; the original index labels are kept, so gaps mark removed rows
df_cleaned.head()

Unnamed: 0,GENERAL APPEARANCE,MANNER OF SPEAKING,PHYSICAL CONDITION,MENTAL ALERTNESS,SELF-CONFIDENCE,ABILITY TO PRESENT IDEAS,COMMUNICATION SKILLS,Student Performance Rating,CLASS
0,4,5,4,5,5,5,5,5,Employable
3,3,3,3,2,3,3,3,5,LessEmployable
4,4,4,3,3,4,4,3,5,Employable
5,4,4,3,3,3,3,3,5,Employable
6,4,4,4,3,3,3,3,3,Employable


In [6]:
### SPLITTING DATA INTO X AND y
# The label is the CLASS column; every other column is a feature
y = df_cleaned['CLASS']
X = df_cleaned.drop(columns='CLASS')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
### CHANGING Pd to Numpy and RESHAPING DATA
# Feature matrices are transposed so that features run along axis 0 and samples along axis 1
X_train_np = X_train.to_numpy().T
X_test_np = X_test.to_numpy().T

# Labels: 1 for 'Employable', 0 otherwise, stored as a (1, n_samples) row vector
y_train_encoded = (y_train == 'Employable').to_numpy(dtype='int64')[np.newaxis, :]
y_test_encoded = (y_test == 'Employable').to_numpy(dtype='int64')[np.newaxis, :]

In [8]:
# Sanity check: feature matrices should be (n_features, n_samples), label arrays (1, n_samples)
print(X_train_np.shape, X_test_np.shape, y_train_encoded.shape, y_test_encoded.shape)

(8, 1796) (8, 450) (1, 1796) (1, 450)


In [13]:
# Number of held-out labels (y_test is still the un-encoded CLASS column from the split)
y_test.shape

(450,)

In [17]:
# Gradient-descent hyperparameters
learn_rate = 0.05  # CAN BE MODIFIED
iterations = 100000

# Fit on the training split, then score the held-out split
W, B, cost_list = log_regression(X_train_np, y_train_encoded, learn_rate, iterations)
y_pred = predict(X_test_np, W, B)
acc = accuracy(y_pred, y_test_encoded)
print(acc)

# Translate the 0/1 predictions back to class names, flattened to 1-D to line up with y_test
y_pred_labels = np.where(y_pred.reshape(-1) == 0, "LessEmployable", "Employable")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_labels))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_labels))

cost after  0 iteration is :  0.6931471805599453
cost after  10000 iteration is :  0.6362898512190062
cost after  20000 iteration is :  0.6357203643203906
cost after  30000 iteration is :  0.6356546354932294
cost after  40000 iteration is :  0.6356469853953173
cost after  50000 iteration is :  0.6356460921205825
cost after  60000 iteration is :  0.6356459876959938
cost after  70000 iteration is :  0.6356459754838045
cost after  80000 iteration is :  0.6356459740554249
cost after  90000 iteration is :  0.6356459738883488
58.666666666666664

Classification Report:
                precision    recall  f1-score   support

    Employable       0.62      0.72      0.67       258
LessEmployable       0.52      0.40      0.45       192

      accuracy                           0.59       450
     macro avg       0.57      0.56      0.56       450
  weighted avg       0.58      0.59      0.58       450


Confusion Matrix:
[[187  71]
 [115  77]]
