In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('Student Performance.csv')

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
duplicates = df[df.duplicated()]

In [None]:
duplicates

In [None]:
# Columns plotted as histograms later in the notebook.
numerical_columns = [
    'StudyTimeWeekly',
    'Absences',
    'GPA',
]

In [None]:
# Columns plotted as count plots later in the notebook.
count_columns = [
    'Absences',
    'Age',
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Sports',
    'Music',
    'Volunteering',
]

In [None]:
df = df.drop(columns = ['StudentID'])

In [None]:
df

In [None]:
# One count plot per column listed in count_columns, each on its own figure.
for column_name in count_columns:
    fig, ax = plt.subplots(figsize=(15, 7))
    sns.countplot(data=df, x=column_name, ax=ax)
    ax.set_title(f'Countplot of {column_name}')
    plt.show()

In [None]:
# Histogram with a KDE overlay for every column in numerical_columns
for column_name in numerical_columns:
    fig, ax = plt.subplots(figsize=(15, 8))
    sns.histplot(data=df, x=column_name, kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Histplot of {column_name}')
    plt.show()

In [None]:
# Pairwise column correlations (pandas default method), drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
X = df.drop(columns = ['GradeClass'])

In [None]:
Y = df['GradeClass']

In [None]:
X

In [None]:
Y

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
standard_scaler = StandardScaler()

In [None]:
standard_scaler.fit(X)

In [None]:
scaled_inputs = standard_scaler.transform(X)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, Y, test_size = 0.2, random_state = 40)

In [None]:
# Candidate classifiers, each paired with the label used in the printed report and the chart.
models = [
    ('Random Forest' , RandomForestClassifier(random_state= 42)),
    ('Gradient Boosting Classifier' , GradientBoostingClassifier(random_state= 42)),
    ('Support Vector Machines' , SVC(random_state= 42)),
    ('Logistic Regression' , LogisticRegression(random_state= 42)),
    ('K Nearest Neighbors ' , KNeighborsClassifier()),
    ('Decision Tree Classifier' , DecisionTreeClassifier(random_state= 42)),
    ('Ada Boost Classifier' , AdaBoostClassifier(random_state= 42)),
    ('Guassian Naive Bayes' , GaussianNB())
]

best_model = None               # fitted pipeline of the selected model
best_accuracy = 0.0             # held-out test accuracy of the selected model
best_cv_score = float('-inf')   # mean cross-validation accuracy of the selected model
accuracies = []                 # test accuracy of every model, in `models` order
model_names = []                # labels aligned with `accuracies`

for name, model in models:
    pipeline = Pipeline([
        ('model', model)
    ])

    # 5-fold cross-validation on the training split only.
    scores = cross_val_score(pipeline, x_train, y_train, cv=5)
    mean_accuracy = scores.mean()

    # Refit on the whole training split, then score once on the held-out split.
    pipeline.fit(x_train, y_train)

    # y_pred is overwritten on every iteration; after the loop it holds the
    # predictions of the LAST model in `models`, not those of the selected one.
    y_pred = pipeline.predict(x_test)

    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    model_names.append(name)

    # Per-model report.
    print("Model : " , name)
    print("Cross Validation Score : " , mean_accuracy)
    print("Test Accuracy : " , accuracy)
    print()

    # Select on the cross-validation score, not on test accuracy: choosing the
    # winner with the test split would make its reported test accuracy
    # optimistically biased. The test accuracy is recorded, never used to choose.
    if mean_accuracy > best_cv_score:
        best_cv_score = mean_accuracy
        best_accuracy = accuracy
        best_model = pipeline





In [None]:
# Report the selected pipeline, then compare every model's held-out accuracy in a bar chart.
print("Best Model : ", best_model)

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(model_names, accuracies, color='skyblue')
ax.set_xlabel('Models')
ax.set_ylabel('Test Accuracy')
ax.set_title('Test Accuracy of Different Models')
# Slant the model names so the longer labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Disabled scratch cell: everything below is a single triple-quoted string, so
# none of it executes; the notebook only echoes the text as the cell's output.
# The sketch scales two hand-written rows with `standard_scaler` and predicts
# them with models[1][1] (the Gradient Boosting entry of `models`).
# NOTE(review): if this is ever re-enabled, the final loop rebinds `prediction`
# (the model) to each predicted value — rename one of the two first.
"""prediction = models[1][1]
training_data = pd.DataFrame([
    [17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196],
    [18,0,0,1,15.40875606,0,0,1,0,0,0,0,3.042914833]
    # Add more training data here
], columns=['Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering', 'GPA'])
  #  [18,0,0,1,15.40875606,0,0,1,3.042914833,0],
    #[16,1,0,1,18.44446636,0,0,3,3.57347421,0]]
#standard_scaler.fit(data_to_predict)
#standard_scaler.fit(training_data)
data_to_predict = [
    [17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196],
    [18,0,0,1,15.40875606,0,0,1,0,0,0,0,3.042914833]
]

# Convert data to DataFrame with the same columns as training data
df_to_predict = pd.DataFrame(data_to_predict, columns=training_data.columns)

# Transform data using the fitted StandardScaler
data_to_predict_scaled = standard_scaler.transform(df_to_predict)

# Make predictions
predictions = prediction.predict(data_to_predict_scaled)


# Display predictions
for prediction in predictions:
    print(prediction)"""

In [None]:
y_test.unique()

In [None]:
y_pred

In [None]:
data = {
    'Predicted Output': y_pred,
    'Actual Output': y_test
}

df1 = pd.DataFrame(data)

In [None]:
df1

In [None]:
df_sorted = df1.sort_index()
df_sorted

In [None]:
df_sorted.describe()

In [None]:
# Side-by-side class counts: predictions on the left, true labels on the right.
fig, (ax_predicted, ax_actual) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(x='Predicted Output', data=df_sorted, ax=ax_predicted)
ax_predicted.set_title('Count of Predicted Output')

sns.countplot(x='Actual Output', data=df_sorted, ax=ax_actual)
ax_actual.set_title('Count of Actual Output')

# Tidy the spacing between the two panels, then render.
fig.tight_layout()
plt.show()