# 2. Exploratory Data Analysis (EDA)

This notebook covers the content for the "Visão Geral" (Overview) and "Exploração Geral (AED)" (General Exploration) tabs of the portfolio.

In [None]:
# NOTE(review): `df` is not defined in the visible cells of this notebook; the cells below
# presumably rely on a DataFrame produced by an earlier cleaning step — TODO confirm, and
# load it explicitly here so the notebook survives Restart Kernel -> Run All.
# If we saved the cleaned data, we could load it here instead:
# df = pd.read_parquet('cleaned_student_data.parquet')

## Visão Geral (Overview)

### Estatísticas Chave do Dataset

In [None]:
# Headline figures for the overview: number of rows, number of columns,
# and the mean of the exam_score column.
num_records, num_variables = df.shape
avg_exam_score = df['exam_score'].mean()

# Emit the three summary lines in a single call, one per line.
print(
    f"Registros de Estudantes: {num_records}",
    f"Variáveis Analisadas: {num_variables}",  # Column count includes student_id
    f"Média (Simulada) de Notas: {avg_exam_score:.1f}",
    sep="\n",
)

### Distribuição das Notas dos Exames (`exam_score`)

In [None]:
# Histogram of exam scores with a KDE overlay.
plt.figure(figsize=(10, 6))

# Alternative (not used): explicit bin edges from a `calculate_histogram_bins_edges`
# helper, to mirror the HTML chart more closely. That helper is not defined in the
# visible cells, so the simpler square-root rule is used instead.

# Square-root rule for the bin count, based on the non-missing scores only so the
# count matches the values that are actually drawn (and matches the per-variable
# histograms further down, which also drop missing values first).
# max(1, ...) guards against a zero bin count when there are no valid scores.
exam_scores = df['exam_score'].dropna()
num_bins_sqrt = max(1, int(np.ceil(np.sqrt(len(exam_scores)))))
sns.histplot(exam_scores, bins=num_bins_sqrt, kde=True, color='teal', alpha=0.6)

plt.title('Distribuição das Notas dos Exames (exam_score)')
plt.xlabel('Nota do Exame')
plt.ylabel('Número de Estudantes')
plt.show()

## Exploração Geral (AED)

### Matriz de Correlação (Valores Calculados)

In [None]:
# Gather the numeric columns, leaving out the identifier in case it was parsed as a
# number (it should not be when student_id has object dtype).
numeric_columns = df.select_dtypes(include=np.number).columns
numerical_cols_for_corr = [name for name in numeric_columns if name != 'student_id']

# Pairwise correlations between the numeric columns, then the exam_score column of
# that matrix sorted from the highest coefficient to the lowest.
correlation_matrix = df.loc[:, numerical_cols_for_corr].corr()
exam_score_corr = correlation_matrix['exam_score'].sort_values(ascending=False)

print("Correlação das variáveis numéricas com 'exam_score':")
# The correlation of exam_score with itself is dropped before printing.
print(exam_score_corr.drop('exam_score'))

# Subset of features listed in the HTML table, printed for side-by-side comparison.
html_corr_features = [
    'age',
    'study_hours_per_day',
    'social_media_hours',
    'netflix_hours',
    'attendance_percentage',
    'sleep_hours',
    'exercise_frequency',
    'mental_health_rating',
]
print("\nCalculated Correlations for features mentioned in HTML:")
for feature in html_corr_features:
    # Membership on a Series tests its index labels, i.e. the column names.
    if feature not in exam_score_corr:
        print(f"{feature}: Not directly numerical or not found")
        continue
    print(f"{feature}: {exam_score_corr[feature]:.2f}")

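*Note: The HTML shows hardcoded correlation values. The values calculated above come from the actual dataset and may differ from them. `DataFrame.corr()` uses Pearson correlation by default, which also appears to be what the HTML uses, so the two sets of values are directly comparable.*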

### Distribuição de Variáveis Preditivas (Exemplos)

In [None]:
# Example predictors to chart, mapped to the kind of plot each one gets:
# 'numerical' -> histogram, 'categorical' -> count plot.
predictor_vars_to_plot = {
    'study_hours_per_day': 'numerical',
    'social_media_hours': 'numerical',
    'sleep_hours': 'numerical',
    'gender': 'categorical',
    'diet_quality': 'categorical'  # Expected to be an ordered categorical — TODO confirm upstream
}

for var, var_type in predictor_vars_to_plot.items():
    # Human-readable label shared by the title and the x axis.
    pretty_name = var.replace("_", " ").title()

    plt.figure(figsize=(8, 5))
    if var_type == 'numerical':
        # Square-root rule on the non-missing values; max(1, ...) guards against a
        # zero bin count when the column has no valid values.
        num_bins = max(1, int(np.ceil(np.sqrt(len(df[var].dropna())))))
        sns.histplot(df[var], bins=num_bins, kde=False, color='teal', alpha=0.7)
    elif var_type == 'categorical':
        # For ordered categoricals, pass the category order so the bars follow it.
        # isinstance on the dtype replaces pd.api.types.is_categorical_dtype, which
        # is deprecated in recent pandas releases.
        order = None
        col_dtype = df[var].dtype
        if isinstance(col_dtype, pd.CategoricalDtype) and col_dtype.ordered:
            order = col_dtype.categories
        sns.countplot(x=var, data=df, order=order, palette='viridis', alpha=0.8)
        plt.xticks(rotation=45, ha='right')

    # Title and axis labels are identical for both plot kinds.
    plt.title(f'Distribuição de {pretty_name}')
    plt.xlabel(pretty_name)
    plt.ylabel('Contagem')
    plt.tight_layout()
    plt.show()