In [16]:
import pandas as pd

In [17]:
# Read the semicolon-delimited student CSV into a DataFrame
# (adjust the file path if the CSV lives elsewhere)
data = pd.read_csv(filepath_or_buffer='student-por1.csv', delimiter=';')

In [18]:
# Column labels of the loaded dataset
attributes = data.columns

# Per-column summary: the column name, its dtype, and -- only for columns
# whose dtype compares equal to 'object' -- the distinct values it holds
# (None is recorded for every other column)
attribute_info = {
    'Attribute': [name for name in attributes],
    'Type': [data[name].dtype for name in attributes],
    'Unique Values': [
        None if data[name].dtype != 'object' else data[name].unique()
        for name in attributes
    ],
}

# Tabular view of the summary, one row per column of `data`
attribute_df = pd.DataFrame(attribute_info)

attribute_df


Unnamed: 0,Attribute,Type,Unique Values
0,school,object,"[GP, MS]"
1,sex,object,"[F, M]"
2,age,int64,
3,address,object,"[U, R]"
4,famsize,object,"[GT3, LE3]"
5,Pstatus,object,"[A, T]"
6,Medu,int64,
7,Fedu,int64,
8,Mjob,object,"[at_home, health, other, services, teacher]"
9,Fjob,object,"[teacher, other, services, health, at_home]"


In [19]:
# Row-wise total of the G1, G2 and G3 columns, stored as a new column on `data`
data['Aggregate Score'] = data.loc[:, ['G1', 'G2', 'G3']].sum(axis='columns')

# Rank rows from the largest aggregate score down to the smallest
ordered_data = data.sort_values('Aggregate Score', ascending=False)

# Persist the ranked rows to CSV, leaving out the row index
ordered_data.to_csv(path_or_buf='ordered_student_performance.csv', index=False)

In [20]:
# Labels of every column stored with the object dtype
categorical_vars = data.select_dtypes(include=['object']).columns

# For each of those columns, record its name alongside its distinct values
categorical_info = {
    'Categorical Variable': list(categorical_vars),
    'Unique Values': [data[name].unique() for name in categorical_vars],
}

categorical_df = pd.DataFrame(categorical_info)
# Collapse each collection of distinct values into one comma-separated string
categorical_df['Unique Values'] = categorical_df['Unique Values'].map(
    lambda values: ', '.join(str(value) for value in values)
)
categorical_df


Unnamed: 0,Categorical Variable,Unique Values
0,school,"GP, MS"
1,sex,"F, M"
2,address,"U, R"
3,famsize,"GT3, LE3"
4,Pstatus,"A, T"
5,Mjob,"at_home, health, other, services, teacher"
6,Fjob,"teacher, other, services, health, at_home"
7,reason,"course, other, home, reputation"
8,guardian,"mother, father, other"
9,schoolsup,"yes, no"


One-hot encoding transforms each categorical variable into multiple binary columns. Each category is represented as a separate column, with a value of 1 if the row belongs to that category and 0 otherwise. The number of new columns created equals the number of unique categories in the original column; for example, `guardian` (mother, father, other) would become three binary columns.

In [21]:
# Count the null entries in every column and lay the counts out as a
# two-column table: the column name and its number of missing values
missing_values = (
    data.isna()
    .sum()
    .rename_axis('Attribute')
    .reset_index(name='Missing Values')
)

missing_values


Unnamed: 0,Attribute,Missing Values
0,school,0
1,sex,0
2,age,0
3,address,0
4,famsize,0
5,Pstatus,0
6,Medu,0
7,Fedu,0
8,Mjob,0
9,Fjob,0


In [22]:
# Select every numeric column. The generic 'number' selector is used instead
# of a hard-coded ['float64', 'int64'] list so that numeric columns stored
# with another width (e.g. int32 or float32) are not silently left out of
# the correlation. For columns that are all int64/float64 the selection is
# unchanged.
numerical_vars = data.select_dtypes(include='number').columns

# Pairwise correlation between the selected numeric columns
correlation_matrix = data[numerical_vars].corr()

# Move the row labels into an ordinary 'index' column for display
correlation_df = correlation_matrix.reset_index()

correlation_df

Unnamed: 0,index,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,Aggregate Score
0,age,1.0,-0.107832,-0.12105,0.03449,-0.008415,0.319968,-0.020559,-0.00491,0.112805,0.134768,0.086357,-0.00875,0.149998,-0.174322,-0.107119,-0.106505,-0.133499
1,Medu,-0.107832,1.0,0.647477,-0.265079,0.097006,-0.17221,0.024421,-0.019686,0.009536,-0.007018,-0.019766,0.004614,-0.008577,0.260472,0.264035,0.240151,0.265905
2,Fedu,-0.12105,0.647477,1.0,-0.208288,0.0504,-0.165915,0.020256,0.006841,0.02769,6.1e-05,0.038445,0.04491,0.029859,0.217501,0.225139,0.2118,0.227918
3,traveltime,0.03449,-0.265079,-0.208288,1.0,-0.063154,0.09773,-0.009521,0.000937,0.057454,0.092824,0.057007,-0.048261,-0.008149,-0.15412,-0.154489,-0.127173,-0.151066
4,studytime,-0.008415,0.097006,0.0504,-0.063154,1.0,-0.147441,-0.004127,-0.068829,-0.075442,-0.137585,-0.214925,-0.056433,-0.118389,0.260875,0.240498,0.249789,0.26163
5,failures,0.319968,-0.17221,-0.165915,0.09773,-0.147441,1.0,-0.062645,0.108995,0.045078,0.105949,0.082266,0.035588,0.122779,-0.38421,-0.385782,-0.393316,-0.405815
6,famrel,-0.020559,0.024421,0.020256,-0.009521,-0.004127,-0.062645,1.0,0.129216,0.089707,-0.075767,-0.093511,0.109559,-0.089534,0.048795,0.089588,0.063361,0.07055
7,freetime,-0.00491,-0.019686,0.006841,0.000937,-0.068829,0.108995,0.129216,1.0,0.346352,0.109904,0.120244,0.084526,-0.018716,-0.094497,-0.106678,-0.122705,-0.113723
8,goout,0.112805,0.009536,0.02769,0.057454,-0.075442,0.045078,0.089707,0.346352,1.0,0.245126,0.38868,-0.015741,0.085374,-0.074053,-0.079469,-0.087641,-0.084467
9,Dalc,0.134768,-0.007018,6.1e-05,0.092824,-0.137585,0.105949,-0.075767,0.109904,0.245126,1.0,0.616561,0.059067,0.172952,-0.195171,-0.18948,-0.204719,-0.205792
