# We evaluate student performances in their studies while using decision tree classifier

**Step 1 : We inport the necessary libraries**

In [4]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder


**Step 2 : We load the dataset**

In [5]:
# Load the training dataset
train_data = pd.read_csv('/kaggle/input/student-study-performance/study_performance.csv')
train_data.head()


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


**Step 3 : Convert the variables (math_score , reading_score , writing_score)  to binary values**

In [6]:
#converting the classification of the target variables into a binary classification
def convert_score(score):
    if score >= 50:
        return 1
    else:
        return 0

train_data['math_score'] = train_data['math_score'].apply(convert_score)
train_data['reading_score'] = train_data['reading_score'].apply(convert_score)
train_data['writing_score'] = train_data['writing_score'].apply(convert_score)
train_data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,1,1,1
1,female,group C,some college,standard,completed,1,1,1
2,female,group B,master's degree,standard,none,1,1,1
3,male,group A,associate's degree,free/reduced,none,0,1,0
4,male,group C,some college,standard,none,1,1,1


**Step 4: convert all the categorial values to numerical **

In [7]:
#convert the categorical features into numerical features to train the model
label_encoders = {}
categorical_features= train_data.select_dtypes(include=['object']).columns.tolist()
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    train_data[feature] = label_encoders[feature].fit_transform(train_data[feature])
train_data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,0,1,1,1,1,1,1,1
1,0,2,4,1,0,1,1,1
2,0,1,3,1,1,1,1,1
3,1,0,0,0,1,0,1,0
4,1,2,4,1,1,1,1,1


# Decision Tree Classification 1 : "math_score"

**Step 1: train test split**

In [8]:
print("first classification : target variable is math_score ")
# Splitting the dataset into features and target variable
X = train_data.drop('math_score', axis=1)  # Features
y = train_data['math_score']  # Target variable
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

first classification : target variable is math_score 


**step 2 :apply the decision tree classifier**

Key Metrics

Given the values from a confusion matrix, we can calculate several important metrics:

    Accuracy: Measures the proportion of correctly classified cases out of the total cases.
    Accuracy=TP+TNTP+TN+FP+FNAccuracy=TP+TN+FP+FNTP+TN​

    Sensitivity (Recall): Measures the proportion of true positives among all positive cases.
    Sensitivity=TPTP+FNSensitivity=TP+FNTP​

    Specificity: Measures the proportion of true negatives among all negative cases.
    Specificity=TNTN+FPSpecificity=TN+FPTN​

    Precision: Measures the proportion of true positives among all predicted positive cases.
    Precision=TPTP+FPPrecision=TP+FPTP​

    F1-Score: Combines precision and recall to create a single metric that balances both. It is the harmonic mean of precision and recall.
    F1-Score=2×Precision×RecallPrecision+RecallF1-Score=2×Precision+RecallPrecision×Recall​

Utility of Metrics

    Accuracy is a general measure of model performance but may be misleading if the dataset is imbalanced.
    Sensitivity (Recall) is crucial in cases where you want to minimize false negatives, such as medical diagnosis.
    Specificity is important when false positives have high costs, like spam detection.
    Precision is helpful when the cost of false positives is high, such as email spam filtering.
    F1-Score balances precision and recall, suitable for imbalanced datasets.

In [23]:
# Initializing the decision tree classifier without pruning
clf = DecisionTreeClassifier()
# Fitting the classifier on the training data
dtree=clf.fit(X_train, y_train)
# Evaluating the accuracy of the model
train_pred = dtree.predict(X_train)
test_pred = dtree.predict(X_test)
print("Accuracy score of the training set:", accuracy_score(y_train, train_pred), sep="\n")
print("Accuracy score of the test set:", accuracy_score(y_test, test_pred), sep="\n")
# Confusion matrix
conf_matrix = confusion_matrix(y_test, test_pred)
print("\n Confusion Matrix:")
print(conf_matrix,'\n')

# True Positives (TP), True Negatives (TN), False Positives (FP), False Negatives (FN)
TN, FP, FN, TP = conf_matrix.ravel()
# accuracy ,Sensitivity, Specificity, and F1-score
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP / (TP + FN) #recall
specificity = TN / (TN + FP)
precision = TP / (TP + FP)
f1_score= TP / (TP+0.5*(FP+FN))

print("True Positives (TP):", TP)
print("True Negatives (TN):", TN)
print("False Positives (FP):", FP)
print("False Negatives (FN):", FN,'\n')

print('------------- using Confusion Matrix --------------')
#----->Accuracy: Measures the proportion of correctly classified cases out of the total cases
print("Accuracy:", accuracy)

#----->Sensitivity (Recall): Measures the proportion of true positives among all positive cases.
print("Sensitivity:", sensitivity)

#----->Specificity: Measures the proportion of true negatives among all negative cases.
print("Specificity:", specificity)

#----->Precision: Measures the proportion of true positives among all predicted positive cases
print("Precision:",precision)

#----->F1-Score: Combines precision and recall to create a single metric that balances both. It is the harmonic mean of precision and recall.
print("F1-score:", f1_score)
print('\n')

print('------------- using scikit-learn --------------')
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

print("Accuracy:",accuracy_score(y_test, test_pred))
print("Sensitivity(recall):",recall_score(y_test, test_pred))
#print("Specificity:",confusion_matrix(y_test, test_pred))
print("Precision:",precision_score(y_test, test_pred))
print("F1-score:",f1_score(y_test, test_pred))

Accuracy score of the training set:
0.9528571428571428
Accuracy score of the test set:
0.8433333333333334

 Confusion Matrix:
[[ 22  29]
 [ 18 231]] 

True Positives (TP): 231
True Negatives (TN): 22
False Positives (FP): 29
False Negatives (FN): 18 

------------- using Confusion Matrix --------------
Accuracy: 0.8433333333333334
Sensitivity: 0.927710843373494
Specificity: 0.43137254901960786
Precision: 0.8884615384615384
F1-score: 0.9076620825147348


------------- using scikit-learn --------------
Accuracy: 0.8433333333333334
Sensitivity(recall): 0.927710843373494
Precision: 0.8884615384615384
F1-score: 0.9076620825147347


Interpretation

    High Accuracy suggests the model is generally performing well, but check if the dataset is imbalanced.
    High Sensitivity indicates that the model is capturing most of the positive cases.
    Low Specificity means the model is not good at rejecting negative cases.
    High Precision implies the model's positive predictions are accurate.
    High F1-Score shows a good balance between precision and recall.

# Decision Tree Classification 2 : "reading_score"

**Step 1 : train test split**

In [16]:
print("second classification : target variable is reading_score ")
# Splitting the dataset into features and target variable
X = train_data.drop('reading_score', axis=1)  # Features
y = train_data['reading_score']  # Target variable
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

second classification : target variable is reading_score 


**Step 2 : apply the decision tree classifier**

In [26]:
# Initializing the decision tree classifier without pruning
clf = DecisionTreeClassifier()
# Fitting the classifier on the training data
dtree=clf.fit(X_train, y_train)
# Evaluating the accuracy of the model
train_pred = dtree.predict(X_train)
test_pred = dtree.predict(X_test)
print("Accuracy score of the training set:", accuracy_score(y_train, train_pred), sep="\n")
print("Accuracy score of the test set:", accuracy_score(y_test, test_pred), sep="\n")
# Confusion matrix
conf_matrix = confusion_matrix(y_test, test_pred)
print("\n Confusion Matrix:")
print(conf_matrix,'\n')
# True Positives (TP), True Negatives (TN), False Positives (FP), False Negatives (FN)
TN, FP, FN, TP = conf_matrix.ravel()
# accuracy ,Sensitivity, Specificity, and F1-score
accuracy = (TP + TN) / (TP + TN + FP + FN)
sensitivity = TP / (TP + FN)
specificity = TN / (TN + FP)
precision = TP / (TP + FP)
recall = sensitivity
f1_score = 2 * (precision * recall) / (precision + recall)

print("True Positives (TP):", TP)
print("True Negatives (TN):", TN)
print("False Positives (FP):", FP)
print("False Negatives (FN):", FN,'\n')

print('------------- using Confusion Matrix --------------')
#----->Accuracy: Measures the proportion of correctly classified cases out of the total cases
print("Accuracy:", accuracy)

#----->Sensitivity (Recall): Measures the proportion of true positives among all positive cases.
print("Sensitivity:", sensitivity)

#----->Specificity: Measures the proportion of true negatives among all negative cases.
print("Specificity:", specificity)

#----->Precision: Measures the proportion of true positives among all predicted positive cases
print("Precision:",precision)

#----->F1-Score: Combines precision and recall to create a single metric that balances both. It is the harmonic mean of precision and recall.
print("F1-score:", f1_score)
print('\n')

print('------------- using scikit-learn --------------')
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score 
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

print("Accuracy:",accuracy_score(y_test, test_pred))
print("Sensitivity(recall):",recall_score(y_test, test_pred))
#print("Specificity:",confusion_matrix(y_test, test_pred))
print("Precision:",precision_score(y_test, test_pred))
print("F1-score:",f1_score(y_test, test_pred))

Accuracy score of the training set:
0.9528571428571428
Accuracy score of the test set:
0.8466666666666667

 Confusion Matrix:
[[ 23  28]
 [ 18 231]] 

True Positives (TP): 231
True Negatives (TN): 23
False Positives (FP): 28
False Negatives (FN): 18 

------------- using Confusion Matrix --------------
Accuracy: 0.8466666666666667
Sensitivity: 0.927710843373494
Specificity: 0.45098039215686275
Precision: 0.8918918918918919
F1-score: 0.9094488188976378


------------- using scikit-learn --------------
Accuracy: 0.8466666666666667
Sensitivity(recall): 0.927710843373494
Precision: 0.8918918918918919
F1-score: 0.9094488188976378
