# Import all required packages

In [3]:
# Standard library
import sys
import warnings

# Third-party: plotting, numerics, data frames
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every warning raised from this point on.
# NOTE(review): this blanket filter also hides warnings that may point at
# real problems; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# check system information

In [7]:
# Report the interpreter and library versions this notebook was run with.
# Each entry is rendered as "<label> <value>" on its own line.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ('System Information', sys.version),
            ('Python Version', sys.version_info),
            ('Pandas Version', pd.__version__),
            ('Numpy Version', np.__version__),
            ('seaborn Version', sns.__version__),
        )
    ),
    sep="\n",
)


System Information 3.11.5 (main, Sep 11 2023, 08:31:25) [Clang 14.0.6 ]
Python Version sys.version_info(major=3, minor=11, micro=5, releaselevel='final', serial=0)
Pandas Version 2.0.3
Numpy Version 1.24.3
seaborn Version 0.12.2


# Load Csv File & Look at the features

In [10]:
# Read the dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='StudentsPerformance.csv')

# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# check the shape

In [11]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1000, 8)

# Data Checks to Perform
# 1. Check Missing Values
# 2. Check Duplicates
# 3. Check Data Types
# 4. Check the number of unique values in each column
# 5. Check the statistics of the data set
# 6. Check the various categories present in the different categorical columns


In [12]:
# Missing-value check: number of null entries in each column.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [13]:
# check duplicates: count the rows that exactly repeat an earlier row
# (duplicated() flags every occurrence after the first by default)
df.duplicated().sum()

0

In [14]:
# check the data types: info() prints each column's non-null count and dtype,
# followed by a dtype summary and the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [15]:
# check the number of distinct values in each column
# (missing values are not counted by default)
df.nunique()

gender                          2
race/ethnicity                  5
parental level of education     6
lunch                           2
test preparation course         2
math score                     81
reading score                  72
writing score                  77
dtype: int64

# Check Statistics of the Data Set
    # The summary statistics of the numerical columns (e.g. from `df.describe()`) show that all means are fairly close to one another, falling between 66 and 68.05.
    # The standard deviations also fall within a narrow range, between 14.6 and 15.19.
    # While the minimum math score is 0, the minimums for writing and reading are substantially higher, at 10 and 17, respectively.
    # There are no duplicate or missing values; the following code explores the data further.

# Exploring Data

In [19]:
# List the distinct values of every categorical column, one line per column.
# The label ends with a space and print() adds its own separator, so two
# spaces appear before the array, as in the recorded output.
for column in (
    "gender",
    "race/ethnicity",
    "parental level of education",
    "lunch",
    "test preparation course",
):
    print(f"Categories in '{column}' variable: ", df[column].unique())


Categories in 'gender' variable:  ['female' 'male']
Categories in 'race/ethnicity' variable:  ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in 'parental level of education' variable:  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:  ['standard' 'free/reduced']
Categories in 'test preparation course' variable:  ['none' 'completed']


# We define the numerical and categorical columns

In [25]:
# Split the column names by dtype: "object" columns are treated as
# categorical, everything else as numerical.
numerical_features = [
    name for name, dtype in df.dtypes.items() if dtype != "object"
]
categorical_feature = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

# Summarise how many columns fell into each group and which ones.
print(f"We have {len(numerical_features)} numerical features : {numerical_features}")
print(f"We have {len(categorical_feature)} categorical features : {categorical_feature}")

We have 3 numerical features : ['math score', 'reading score', 'writing score']
We have 5 categorical features : ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
