In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv("student_data.csv")
df.head().T

Unnamed: 0,0,1,2,3,4
school,GP,GP,GP,GP,GP
sex,F,F,F,F,F
age,18,17,15,15,16
address,U,U,U,U,U
famsize,GT3,GT3,LE3,GT3,GT3
Pstatus,A,T,T,T,T
Medu,4,1,1,4,3
Fedu,4,1,1,2,3
Mjob,at_home,at_home,at_home,health,other
Fjob,teacher,other,other,services,other


In [3]:
print("Data Shape:", df.shape)
print("Data Length:", len(df))
print("\n")
print("Data Types:\n", df.dtypes)
print("\n")
print("Data Description:\n", df.describe())


Data Shape: (395, 33)
Data Length: 395


Data Types:
 school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object


Data Description:
               age        Medu        Fedu  traveltime   studytime    failures  \
count  395.000000  395.000000  395.000000  395.000000  395.000000  395.000000   
mean    16.696203    2.749367    2.521519    1.448101   

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

<h1>Data Cleaning</h1>

checking missing values

In [5]:
# Checking missing values
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64


In [6]:
zero_values = (df == 0).sum()
print("\nZero Values before cleaning:\n", zero_values)


Zero Values before cleaning:
 school          0
sex             0
age             0
address         0
famsize         0
Pstatus         0
Medu            3
Fedu            2
Mjob            0
Fjob            0
reason          0
guardian        0
traveltime      0
studytime       0
failures      312
schoolsup       0
famsup          0
paid            0
activities      0
nursery         0
higher          0
internet        0
romantic        0
famrel          0
freetime        0
goout           0
Dalc            0
Walc            0
health          0
absences      115
G1              0
G2             13
G3             38
dtype: int64


In [7]:
#Numeric columns with zeros
numeric_columns_with_zeros = ['Medu', 'Fedu', 'failures', 'absences', 'G2', 'G3']

# Replace zero values with mean in numeric columns
for column in numeric_columns_with_zeros:
    if (df[column] == 0).any():
        mean_value = df[column].mean()
        df[column] = df[column].replace(0, mean_value)

# Display zero values after cleaning 
print("\nZero values in all columns after cleaning:")
zero_values_after_all = (df == 0).sum()
print(zero_values_after_all)


Zero values in all columns after cleaning:
school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64


In [8]:
# Check for duplicate rows
duplicate_rows = df.duplicated()
print(f"Number of duplicate rows: {duplicate_rows.sum()}")

Number of duplicate rows: 0


In [9]:
# List of numerical columns
numerical_columns = ["age", "Medu", "Fedu", "traveltime", "studytime", "failures", 
                     "famrel", "freetime", "goout", "Dalc", "Walc", "health", 
                     "absences", "G1", "G2", "G3"]

# Filter the DataFrame to include only numerical columns
numerical_df = df[numerical_columns]

# Function to detect outliers using the IQR method
def detect_outliers(df):
    outliers = {}
    for column in df.columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outlier_condition = ((df[column] < lower_bound) | (df[column] > upper_bound))
        outliers[column] = df[column][outlier_condition]
    return outliers

# Detect and print outliers before removal
print("\nOutliers before removal:")
outliers_before = detect_outliers(numerical_df)
for column, outliers in outliers_before.items():
    print(f"{column}: {outliers.index.tolist()}")


Outliers before removal:
age: [247]
Medu: []
Fedu: []
traveltime: [61, 108, 134, 149, 164, 228, 327, 375]
studytime: [47, 66, 67, 69, 70, 71, 76, 77, 94, 95, 105, 106, 108, 121, 140, 204, 210, 256, 259, 271, 282, 293, 298, 303, 330, 334, 338]
failures: [2, 18, 25, 40, 44, 49, 52, 72, 78, 85, 88, 95, 111, 112, 118, 127, 128, 130, 137, 138, 141, 144, 146, 149, 150, 151, 152, 153, 157, 159, 160, 161, 162, 164, 165, 170, 173, 198, 205, 206, 213, 216, 217, 221, 225, 239, 247, 248, 250, 252, 255, 270, 278, 281, 292, 304, 305, 307, 308, 309, 310, 312, 313, 314, 315, 336, 340, 341, 343, 349, 350, 352, 353, 361, 367, 370, 376, 383, 384, 387, 389, 390, 392]
famrel: [25, 27, 37, 46, 60, 66, 79, 108, 140, 141, 150, 156, 184, 206, 207, 222, 238, 240, 278, 296, 299, 349, 357, 363, 389, 391]
freetime: [7, 19, 68, 89, 95, 106, 111, 112, 168, 189, 238, 260, 276, 293, 301, 314, 315, 378, 389]
goout: []
Dalc: [29, 54, 61, 66, 100, 159, 175, 192, 211, 223, 228, 236, 247, 327, 349, 369, 384, 390]
Walc: []

In [11]:
# Function to remove outliers using the IQR method
def remove_outliers(df):
    for column in df.columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# Remove outliers
numerical_df_cleaned = remove_outliers(numerical_df)

# Detect and print outliers after removal
print("\nOutliers after removal:")
outliers_after = detect_outliers(numerical_df_cleaned)
for column, outliers in outliers_after.items():
    print(f"{column}: {outliers.index.tolist()}")




Outliers after removal:
age: []
Medu: []
Fedu: []
traveltime: []
studytime: []
failures: []
famrel: []
freetime: []
goout: []
Dalc: [23, 30, 41, 48, 50, 53, 63, 64, 73, 75, 84, 92, 125, 129, 136, 143, 148, 166, 182, 193, 197, 233, 241, 249, 254, 266, 267, 273, 275, 318, 319, 322, 323, 324, 325, 326, 337, 339, 345, 347, 351, 365, 366, 371, 377, 386, 393, 394]
Walc: []
health: []
absences: []
G1: []
G2: []
G3: []
