In [None]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import learning_curve
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Load Data 

In [None]:
# Read the grade workbook; cells holding the literal text 'NA' are parsed as missing values
df = pd.read_excel("Grade_CS_Students.xlsx", na_values=["NA"])

# Preview the first five rows
df.head(5)

#### stats

In [None]:
# Summary statistics of the loaded DataFrame, printed under a heading
summary_stats = df.describe()
print("Descriptive statistics:", summary_stats, sep="\n")

In [None]:
# Per-column count of missing values
null_counts = df.isna().sum()
print("Number of null values in each column:")
print(null_counts)

#### Number of null / non-numeric values per column

In [None]:
# Count, per column, the entries that cannot be read as numbers.
# Each column is coerced with errors='coerce', which turns unparseable entries into NaN;
# values that were already missing are NaN as well, so they are included in the count.
non_numeric_counts = {
    col: pd.to_numeric(df[col], errors='coerce').isna().sum()
    for col in df.columns
}

# Two-column table (column name, count) for easier reading
non_numeric_counts_df = pd.DataFrame(
    {
        'Column': list(non_numeric_counts.keys()),
        'Non-Numeric Count': list(non_numeric_counts.values()),
    }
)

print(non_numeric_counts_df)

#### Drop unnecessary columns

In [None]:
# Remove the two columns that are not used in the rest of the analysis.
# Note: `df` is rebound here, so running this cell a second time raises a KeyError.
df = df.drop(columns=['Year of enrolment', 'ID'])

df.head()

#### divide dataset to features & targets

In [None]:
# Subjects whose grades are to be predicted; every remaining column is a predictor
target_subjects = ['CS501','CS502','CS503','CS504','CS505','CS506','CS507','CS508','CS509','CS510','CS512','CS597','CS598','MM507'] 
features = df.drop(columns=target_subjects)
targets = df[target_subjects]

# Temporary single-subject target (marked "temp" by the original author)
target = df['CS501']

# Report the shape of the full target frame: the label says "targets", so show
# `targets.shape` rather than the shape of the single temporary `target` column.
print("features : ", features.shape, "\ntargets : ", targets.shape)





#### distribution of each feature 

In [None]:
# One box per feature column, drawn on an explicitly created Axes
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=features, ax=ax)
# Vertical tick labels so the column names stay readable
plt.xticks(rotation=90)
plt.show()


# Preprocessing

#### replace NULLs

In [None]:
# Make every feature column numeric, then fill the gaps with that column's median.

# Entries that cannot be parsed as numbers become NaN
features = features.apply(pd.to_numeric, errors='coerce')
# Column-wise medians are computed after coercion and used as the fill value
features = features.fillna(features.median())

# Verify the changes
features.head()


In [None]:
print(features.isnull().sum())

#### encode values

In [None]:
def encode_grade(marks):
    """Map a numeric mark to an ordinal grade code (0 is the best grade).

    Bands:
        > 85      -> 0  ('A+')
        80 - 85   -> 1  ('A')
        75 - <80  -> 2  ('A-')
        70 - <75  -> 3  ('B+')
        65 - <70  -> 4  ('B')
        60 - <65  -> 5  ('B-')
        55 - <60  -> 6  ('C+')
        50 - <55  -> 7  ('C')
        45 - <50  -> 8  ('C-')
        40 - <45  -> 9  ('D+')
        35 - <40  -> 10 ('D')
        otherwise -> 11 ('E')

    A value for which every comparison is False (e.g. NaN) also ends up as 11.
    """
    # The top band is strictly above 85; a mark of exactly 85 belongs to 'A'
    if marks > 85:
        return 0  # 'A+'

    # Inclusive lower bounds of the remaining bands, best grade first.
    # The first bound that the mark reaches decides the code.
    lower_bounds = (80, 75, 70, 65, 60, 55, 50, 45, 40, 35)
    for code, lower in enumerate(lower_bounds, start=1):
        if marks >= lower:
            return code

    return 11  # 'E'


# Apply the grade encoding function to each cell in the dataframe.
# Every feature mark is replaced by its ordinal grade code from encode_grade.
# NOTE(review): DataFrame.map is the element-wise method that, to my knowledge, replaced
# applymap in pandas 2.1 — confirm the minimum pandas version this notebook targets.
features_encoded = features.map(encode_grade)


features_encoded.head()

In [None]:
# Encode targets: every target mark is replaced by its ordinal grade code from encode_grade.
# NOTE(review): unlike `features`, `targets` is not coerced to numeric or imputed in the
# cells above, so a missing mark (NaN) fails every comparison in encode_grade and is
# encoded as 11 ('E'); a non-numeric string would raise a TypeError. Confirm this is intended.
targets_encoded = targets.map(encode_grade)

targets_encoded.head()

#### Correlation between features

In [None]:
# Annotated heatmap of the pairwise correlations between the encoded features.
# With the 'coolwarm' colormap, negative values are drawn in blue and positive values in red.
feature_corr = features_encoded.corr()
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', ax=ax)
plt.show()


#### correlation between features and targets

In [None]:
# Sorted feature-vs-target correlation Series, keyed by target name
correlations = {}

for column in targets_encoded.columns:
    # Correlation of every encoded feature with this target, ascending
    correlation = features_encoded.corrwith(targets_encoded[column]).sort_values()
    correlations[column] = correlation

    # One bar chart per target
    plt.figure(figsize=(10, 5))
    correlation.plot(kind='bar', color='blue')
    plt.title(f'Correlation between features and target: {column}')
    plt.show()

# Weakest and strongest correlation found for each target
min_correlations = {name: series.min() for name, series in correlations.items()}
max_correlations = {name: series.max() for name, series in correlations.items()}

# Tabular form of the two summaries
min_corr_df = pd.DataFrame(
    {'Target': list(min_correlations.keys()), 'Min Correlation': list(min_correlations.values())}
)
max_corr_df = pd.DataFrame(
    {'Target': list(max_correlations.keys()), 'Max Correlation': list(max_correlations.values())}
)

# Draw the minimum summary first, then the maximum summary
for summary_df, value_column, bar_color, chart_title in (
    (min_corr_df, 'Min Correlation', 'red', 'Minimum Correlation for Each Target'),
    (max_corr_df, 'Max Correlation', 'green', 'Maximum Correlation for Each Target'),
):
    plt.figure(figsize=(10, 5))
    summary_df.set_index('Target')[value_column].plot(kind='bar', color=bar_color)
    plt.title(chart_title)
    plt.show()

#### feature(encoded) distribution

In [None]:
# Bar chart of the encoded-grade frequencies for every feature, laid out on a grid.
n_features = len(features_encoded.columns)
n_cols = 13
# Keep the 4-row layout for up to 52 features, and add rows beyond that:
# plt.subplot raises when the subplot index exceeds rows * cols.
n_rows = max(4, -(-n_features // n_cols))  # -(-a // b) is ceil(a / b)

# 7.5 inches per row reproduces the 40x30 figure for the 4-row case
plt.figure(figsize=(40, 7.5 * n_rows))
for i, col in enumerate(features_encoded.columns):
    # Count each grade once and reuse the result for both the x and y values
    grade_counts = features_encoded[col].value_counts()
    plt.subplot(n_rows, n_cols, i + 1)
    plt.bar(grade_counts.index, grade_counts.values)
    plt.title(col)
    plt.xlabel('Grade')
    plt.ylabel('Count')

# Avoid overlapping titles/labels between neighbouring panels, then render
plt.tight_layout()
plt.show()


#### balancing data set

##### visualization

In [None]:
# Class distribution of every encoded target: one bar chart plus the raw counts
for column in targets_encoded.columns:
    value_counts = targets_encoded[column].value_counts()

    # Bar chart of how often each grade code occurs
    fig, ax = plt.subplots(figsize=(10, 5))
    value_counts.plot(kind='bar', color='blue', ax=ax)
    ax.set_title(f'Value Counts for Target: {column}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    plt.show()

    # Same information as text, followed by two blank lines
    print(f'Value Counts for Target: {column}', value_counts, '\n', sep='\n')

#### Combine smaller classes so that every class has at least 4 samples

In [None]:
# Function to merge rare classes into the most frequent class
def merge_small_classes(target_column, threshold=4):
    """Fold rare classes of a label Series into its most frequent class.

    Every class that occurs fewer than ``threshold`` times is relabelled as the
    most frequent class among those with at least ``threshold`` occurrences.
    If no class reaches ``threshold``, or no class is below it, the labels are
    returned unchanged.

    Parameters
    ----------
    target_column : pandas.Series
        Class labels. The Series passed in is not modified.
    threshold : int, default 4
        Minimum number of samples a class needs in order to be kept.

    Returns
    -------
    pandas.Series
        A new Series with the same index as ``target_column``.
    """
    value_counts = target_column.value_counts()
    small_classes = value_counts[value_counts < threshold].index
    # Computed once: the set of large classes does not change while merging
    larger_classes = value_counts[value_counts >= threshold].index

    # Work on a copy so the caller's Series (often a DataFrame column) is left untouched
    merged = target_column.copy()
    if len(small_classes) == 0 or len(larger_classes) == 0:
        return merged

    # value_counts() is ordered by descending frequency, so the first entry is
    # the most frequent class — not the numerically adjacent grade.
    absorbing_class = larger_classes[0]
    merged[merged.isin(small_classes)] = absorbing_class
    return merged

# Merge the rare classes of every encoded target and report the new class distribution
for column in targets_encoded.columns:
    # Store the merged labels back into the target frame
    merged_column = merge_small_classes(targets_encoded[column])
    targets_encoded[column] = merged_column

    # Class counts as they now stand in the frame
    value_counts = targets_encoded[column].value_counts()
    print(f'Value Counts for Target (after merging small classes): {column}')
    print(value_counts)
    print('\n')


### Apply SMOTE oversampling to balance the dataset

In [None]:
def apply_smote(features, target, k_neighbors=3, random_state=42):
    """Oversample the minority classes of ``target`` with SMOTE.

    Parameters
    ----------
    features : array-like or DataFrame
        Predictor matrix passed to ``SMOTE.fit_resample``.
    target : array-like or Series
        Class labels aligned with ``features``.
    k_neighbors : int, default 3
        Number of nearest neighbours SMOTE uses to synthesise samples.
    random_state : int or None, default 42
        Seed forwarded to SMOTE so that repeated runs produce the same
        synthetic samples. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    tuple
        ``(features_resampled, target_resampled)`` as returned by
        ``SMOTE.fit_resample``.
    """
    smote = SMOTE(k_neighbors=k_neighbors, random_state=random_state)
    features_resampled, target_resampled = smote.fit_resample(features, target)
    return features_resampled, target_resampled

In [None]:
# Oversample the encoded features once per target subject.
# features_resampled_dict: target name -> (resampled features, resampled target)
# targets_resampled_dict:  target name -> resampled target only
features_resampled_dict = {}
targets_resampled_dict = {}

for column in targets_encoded.columns:
    resampled_pair = apply_smote(features_encoded, targets_encoded[column])
    features_resampled, target_resampled = resampled_pair

    features_resampled_dict[column] = (features_resampled, target_resampled)
    targets_resampled_dict[column] = target_resampled

    # Class counts after resampling
    resampled_counts = pd.Series(target_resampled).value_counts()
    print(f'Resampled target value counts for {column}:')
    print(resampled_counts)

# <font color='red'>TRAIN + PREDICT + EVALUATE</font> 

In [None]:
def train_model_and_evaluate_for_each_target(features_resampled_dict, targets_resampled_dict):
    """Train and evaluate one XGBoost classifier per target.

    For every entry of ``features_resampled_dict`` the data is split 80/20
    (``random_state=42``), the labels are re-encoded to consecutive integers
    with a ``LabelEncoder`` fitted on the training labels, an ``XGBClassifier``
    is fitted, and accuracy plus weighted precision / recall / F1 are computed
    on the held-out split. Metrics and the confusion matrix are printed.

    Parameters
    ----------
    features_resampled_dict : dict
        Maps a target name to a ``(features, target)`` tuple.
    targets_resampled_dict : dict
        Not read by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{target_name: {'accuracy', 'precision', 'recall', 'f1'}}``.

    Raises
    ------
    ValueError
        From ``LabelEncoder.transform`` if the test split contains a label
        that does not occur in the training split.

    Notes
    -----
    NOTE(review): if the inputs were oversampled (e.g. with SMOTE) before being
    passed in, synthetic rows derived from test-split rows can end up in the
    training split, so the reported scores are likely optimistic. Splitting
    before oversampling would avoid this.
    """
    results = {}

    # Iterate over each resampled dataset
    for column, (features_resampled, target_resampled) in features_resampled_dict.items():

        #----------------------------DEFINE TRAIN TEST SET-------------------------------------
        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            features_resampled, target_resampled, test_size=0.2, random_state=42
        )

        ############################ change according to the model ############################
        #----------------------------LABEL RE-ENCODE (OPTIONAL)--------------------------------
        # Map the labels to consecutive integers 0..n_classes-1; the encoder is fitted
        # on the training labels only and then applied to the test labels.
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)
        # All encoded class ids, so the confusion matrix always covers every class
        class_ids = list(range(len(le.classes_)))

        ############################ change according to the model ############################
        #-----------------------------CREATE THE MODEL-----------------------------------------
        # Seeded to match the seed used for the train/test split
        model = XGBClassifier(random_state=42)
        model_name = 'XG Boost'

        ############################ change according to the model ############################
        #-----------------------------TRAIN FOR TRAIN SET--------------------------------------
        print(f"Training {model_name} for target: {column}")
        model.fit(X_train, y_train)

        ############################ change according to the model ############################
        #-----------------------------PREDICT FOR TEST SET-------------------------------------
        y_pred = model.predict(X_test)

        #-----------------------------EVALUATION-----------------------------------------------
        # zero_division=0 gives the same value as the default, but states the choice
        # explicitly instead of relying on a warning being suppressed elsewhere.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        # Display the accuracy, precision, recall, and F1 score
        print(f'Accuracy: {accuracy}')
        print(f'Precision: {precision}')
        print(f'Recall: {recall}')
        print(f'F1 Score: {f1}')

        # Add the result for this target
        results[column] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

        # Confusion matrix over all encoded classes (rows: true, columns: predicted)
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

        print('Confusion Matrix:')
        print(conf_matrix)
        print()

    return results

In [None]:
# Fit and score one model per resampled target, then dump the collected metrics
results = train_model_and_evaluate_for_each_target(
    features_resampled_dict,
    targets_resampled_dict,
)
print("Results:", results, sep="\n")

# Results

In [None]:
# Collect every metric as a list aligned with the order of `subjects`
subjects = list(results)
accuracies, precisions, recalls, f1_scores = (
    [results[subject][metric] for subject in subjects]
    for metric in ('accuracy', 'precision', 'recall', 'f1')
)

# Grouped bar chart: four bars (one per metric) for each subject
x = range(len(subjects))
bar_width = 0.2

fig, ax = plt.subplots(figsize=(12, 8))


def add_values(bars):
    """Write each bar's height (two decimals, rotated 90 degrees) just above the bar."""
    for bar in bars:
        height = bar.get_height()
        center = bar.get_x() + bar.get_width() / 2.0
        plt.text(center, height, f'{height:.2f}', ha='center', va='bottom', rotation=90)


# Each metric is shifted right by one bar width relative to the previous one
bars1 = ax.bar(x, accuracies, bar_width, label='Accuracy')
bars2 = ax.bar([pos + bar_width for pos in x], precisions, bar_width, label='Precision')
bars3 = ax.bar([pos + 2 * bar_width for pos in x], recalls, bar_width, label='Recall')
bars4 = ax.bar([pos + 3 * bar_width for pos in x], f1_scores, bar_width, label='F1 Score')

for bar_group in (bars1, bars2, bars3, bars4):
    add_values(bar_group)

ax.set_xlabel('Subjects')
ax.set_ylabel('Scores')
ax.set_title('Evaluation Metrics(For Test Set) for Each Subject')
# Centre the tick under each group of four bars
ax.set_xticks([pos + 1.5 * bar_width for pos in x])
ax.set_xticklabels(subjects)
ax.legend()

fig.tight_layout()
plt.show()