In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# NOTE(review): this silences every warning category, including library
# deprecation notices. Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')


# Forward slashes resolve on Windows, macOS and Linux alike. The previous
# spelling used a backslash before "StudentsPerformance.csv", which is an
# invalid string escape ("\S") and only worked as a path separator on Windows.
df = pd.read_csv("../student_cv/StudentsPerformance.csv")

# 1. Mean math / reading / writing score for each value of
#    'test preparation course'; reset_index() turns the group key back
#    into a regular column so it can be used as id_vars below.
df_grouped = df.groupby('test preparation course')[
    ['math score', 'reading score', 'writing score']
].mean().reset_index()

# 2. Reshape wide -> long: one row per (course status, subject) pair, with the
#    subject name in 'Subject' and its mean in 'Average Score'.
df_melted = df_grouped.melt(
    id_vars='test preparation course',
    var_name='Subject',
    value_name='Average Score'
)

<p style="font-weight: 900;">Question 1:</p>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>How do the average scores of students who completed the test preparation course compare with those of students who did not?</h3>

In [None]:
# Grouped bar chart: one cluster per subject, one bar per
# 'test preparation course' value, drawn from the long-format df_melted.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=df_melted,
    x='Subject',
    y='Average Score',
    hue='test preparation course',
    palette='coolwarm',
    ax=ax,
)

# Titles, axis labels and legend.
ax.set_title('Average Scores by Test Preparation Course Completion Status', fontsize=16)
ax.set_xlabel('Subject', fontsize=14)
ax.set_ylabel('Average Score', fontsize=14)
ax.legend(title='Test Course', loc='lower right')

# Horizontal dashed guide lines only.
ax.grid(axis='y', linestyle='--')
fig.tight_layout()
plt.show()

<p><b>Insights:</b><br>
This visualization shows that students who completed the test preparation course consistently achieved higher average scores across all three subjects. The difference is most pronounced in writing and reading, where completers scored approximately 9 to 10 points higher than non-completers. This suggests that completing the test preparation course has a significant positive impact on a student's overall test performance.</p>

<p style="font-weight: 900;">Question 2:</p>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>What is the distribution of overall academic performance across the different parental levels of education?</h3>

In [None]:
# Combined score per student: the sum of the three subject scores.
# Stored on df itself so that any later cell can reuse the column.
df['Total Score'] = df['math score'] + df['reading score'] + df['writing score']

# Explicit left-to-right order for the x axis (lowest to highest level);
# without it the categories would not be drawn in this sequence.
education_order = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree"
]

# Box plot of total score per parental education level.
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(
    data=df,
    x='parental level of education',
    y='Total Score',
    order=education_order,
    # Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated
    # for removal in 0.14. Mapping hue to the x variable gives the same
    # one-colour-per-box result.
    hue='parental level of education',
    hue_order=education_order,
    palette='Pastel1',
    # Keep each box centred on its tick instead of reserving a slot per hue level.
    dodge=False,
    ax=ax,
)

# The hue only repeats the x axis, so a legend would be redundant. Some seaborn
# versions draw one anyway; remove it if it exists.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title('Distribution of Total Scores by Parental Education Level', fontsize=16)
ax.set_xlabel('Parental Level of Education', fontsize=14)
ax.set_ylabel('Total Score (Math + Reading + Writing)', fontsize=14)
# Slant the long category labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--')
fig.tight_layout()
plt.show()

<p><b>Insights:</b><br>
This visualization shows a clear positive correlation between a parent's highest education level and a student's total score. As the parental education level increases from "some high school" to "master's degree," the median total score consistently rises, suggesting that the educational background of parents is a strong predictor of student performance.</p>

<p style="font-weight: 900;">Question 3:</p>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>Is there a correlation between a student's math score and their writing score?</h3>

In [None]:
# Correlation coefficient between the math and writing score columns
# (Series.corr computes Pearson's r by default).
correlation_r = df['math score'].corr(df['writing score'])

# Scatter plot of the two scores with a fitted regression line on top.
fig, ax = plt.subplots(figsize=(8, 6))
sns.regplot(
    data=df,
    x='math score',
    y='writing score',
    scatter_kws=dict(alpha=0.5, s=20),            # small, semi-transparent markers
    line_kws=dict(color='red', linestyle='--'),   # dashed red trendline
    ax=ax,
)

# The coefficient is embedded in the title, rounded to two decimals.
ax.set_title(f'Correlation between Math Score and Writing Score (R = {correlation_r:.2f})', fontsize=14)
ax.set_xlabel('Math Score', fontsize=12)
ax.set_ylabel('Writing Score', fontsize=12)
ax.grid(True, linestyle=':', alpha=0.6)
fig.tight_layout()
plt.show()

<p><b>Insights:</b><br>
This visualization confirms a very strong positive linear correlation (R = 0.80) between mathematics scores and writing scores. The data points cluster tightly around the upward-sloping red trendline, indicating that students who score high in mathematics are highly likely to score high in writing, suggesting a strong dependency on a common underlying aptitude.</p>

<p style="font-weight: 900;">Question 4:</p>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>What is the gender breakdown of students within each race/ethnicity group?</h3>

In [None]:
# Fixed colour per gender value; the key order also defines the stacking order below.
gender_palette = {'female': '#FF69B4', 'male': '#00BFFF'}

# Stacked bar chart of student counts per race/ethnicity group, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    # For string columns seaborn orders categories by first appearance, so the
    # unsorted frame would lay the groups out in CSV row order. Sorting first
    # puts the bars in alphabetical group order.
    data=df.sort_values('race/ethnicity'),
    x='race/ethnicity',
    hue='gender',
    # Pin the stack/legend order so it does not depend on row order either.
    hue_order=list(gender_palette),
    multiple='stack',  # stack the gender counts instead of overlaying them
    palette=gender_palette,
    shrink=0.8,        # leave a gap between neighbouring bars
    ax=ax,
)

ax.set_title('Gender Breakdown of Students within each Race/Ethnicity Group', fontsize=16)
ax.set_xlabel('Race/Ethnicity Group', fontsize=14)
ax.set_ylabel('Number of Students', fontsize=14)
fig.tight_layout()
plt.show()

<p><b>Insights:</b><br>
This visualization shows that Group C is the largest demographic, containing the highest total number of students. While the gender split is generally balanced across most groups, Group C and Group B show a notably higher number of female students, whereas the other groups (A, D, and E) are more closely balanced or show a slight male majority.</p>

<p style="font-weight: 900;">Question 5:</p>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>How do the average scores across all three subjects (math, reading, and writing) compare among the different race/ethnicity groups?</h3>

In [None]:
# Mean of each subject score per race/ethnicity group, with the group key
# restored as an ordinary column.
df_grouped_race = (
    df.groupby('race/ethnicity')[['math score', 'reading score', 'writing score']]
    .mean()
    .reset_index()
)

# Wide -> long: one row per (group, subject) pair for seaborn's hue mapping.
df_melted_race = df_grouped_race.melt(
    id_vars='race/ethnicity',
    var_name='Subject',
    value_name='Average Score',
)

# Grouped bar chart: subjects on the x axis, one bar per race/ethnicity group.
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    data=df_melted_race,
    x='Subject',
    y='Average Score',
    hue='race/ethnicity',
    palette='Spectral',
    ax=ax,
)

ax.set_title('Average Subject Scores by Race/Ethnicity Group', fontsize=16)
ax.set_xlabel('Subject', fontsize=14)
ax.set_ylabel('Average Score', fontsize=14)

# Anchor the legend just outside the right edge of the axes ...
ax.legend(title='Race/Ethnicity', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(axis='y', linestyle='--')
# ... and confine the layout to the left 85% of the figure to make room for it.
fig.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

<p><b>Insights:</b><br>
The grouped bar chart clearly reveals a significant disparity in average scores across the different race/ethnicity groups, with Group E consistently achieving the highest average scores in all three subjects (particularly in math) and Group A recording the lowest averages. Furthermore, the overall pattern shows that average reading and writing scores are generally higher than math scores across every group.</p>

<b>Question 6:</b>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>khk</h3>

<b>Question 7:</b>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>H</h3>

<b>Question 8:</b>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>?</h3>

<b>Question 9:</b>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>Hs?</h3>

<b>Question 10:</b>
<style> h3 {text-align: left;color: yellow;} </style>
<h3>Hps?</h3>