# Exploratory Data Analysis of Computer Science Students' Academic Performance

In [1]:
# import libraries
import numpy as np
import pandas as pd

In [2]:
# Open the Excel workbook so its sheets can be listed and loaded in the cells below.
# Relative path (resolved against the notebook's working directory) of the workbook
file_path_name = 'data/StudentRec_.xlsx'
# Handle to the workbook; its `sheet_names` attribute is read in the next cell
data_workbook = pd.ExcelFile(file_path_name)

In [3]:
# List the name of every sheet (table) in the workbook.
# `sheet_names` is reused by the loading loop in the next cell.
sheet_names = data_workbook.sheet_names
print(sheet_names)

['Registration', 'Biodata', 'Result', 'Courses', 'CoursesII', 'Sheet1']


In [4]:
# Read every sheet in a single pass from the already-open workbook.
# `sheet_name=None` makes pandas return a {sheet name: DataFrame} dict, so the
# file is parsed once instead of being re-opened from its path for each sheet.
sheets = pd.read_excel(data_workbook, sheet_name=None)

# Expose each sheet as a notebook-level variable named after the sheet
# (Registration, Biodata, Result, ...), which the cells below rely on.
for sheet_name in sheet_names:
    globals()[sheet_name] = sheets[sheet_name]

<br>

### Viewing each Table in the dataset

In [5]:
# Preview the first five rows of the Registration sheet
Registration.head(n=5)

Unnamed: 0,Matric_Number,Session,Semester,Year
0,3,2000-2001,1,1.0
1,3,2000-2001,2,1.0
2,3,2002-2003,1,2.0
3,3,2002-2003,2,2.0
4,3,2003-2004,1,3.0


In [6]:
# Preview the first five rows of the Biodata sheet
Biodata.head(n=5)

Unnamed: 0,Matric_Number,Sex,Marital_Status,Religion,State_of_Origin,Nationality,YOA
0,7,Female,Single,Christian,Osun,Nigerian,2000.0
1,8,Male,Single,Christian,,Nigerian,2000.0
2,9,Male,Single,muslim,lagos,Nigerian,2000.0
3,10,Female,Single,Christian,,Nigerian,2000.0
4,11,Male,Single,Christian,,Nigerian,2000.0


In [7]:
# Preview the first five rows of the Result sheet
Result.head(n=5)

Unnamed: 0,Matric_Number,Session,Semester,Course_Code,Mark,Exam,Grade,Course_code_Key,Course_Unit
0,1,2001-2002,2,CSC204,,True,F,2001-2002CSC204,3.0
1,1,2001-2002,2,GST106,,True,F,2001-2002GST106,2.0
2,1,2001-2002,2,GST113,,True,F,2001-2002GST113,2.0
3,1,2001-2002,2,GST202,,True,F,2001-2002GST202,2.0
4,1,2001-2002,2,MAT202,,True,F,2001-2002MAT202,2.0


In [8]:
# Preview the first five rows of the Courses sheet
Courses.head(n=5)

Unnamed: 0,Course Code,Course Title,Course Status,Prerequisite,References,LevelSem,Old_Status,Old_Comp
0,ACC210,Principles of Accounting,o,,,200.0,1,
1,ACC220,Elements of Cost Accounting,o,,,200.0,1,
2,ACC310,,,,,300.0,E,
3,ACC320,Management Accounting I,o,,,300.0,1,
4,ACC421,Management Information Systems and Computer Ap...,e,,,400.0,1,


In [9]:
# Preview the first five rows of the CoursesII sheet
CoursesII.head(n=5)

Unnamed: 0,Course Code,Course Title,Course Status,Prerequisite,Old_Comp,LevelSem,Semester,Units
0,CSC100,Computer as a Problem-Solving Tool,c,FSC103,,100,2,3
1,CSC201,Principles of Computer Science,c,,,200,1,3
2,CSC202,Introduction to Computer Programming,c,,,200,1,3
3,CSC203,Foundations of Sequential Programs,c,,,200,2,3
4,CSC204,Data Structure and Data Management,c,,,200,2,3


In [10]:
# Preview the first five rows of the Sheet1 sheet
Sheet1.head(n=5)

Unnamed: 0,Course_Code_Keys,Course Code,Session,Semester,Units,Lecturer ID,Exam Date,DupFlag
0,2001-2002ACC210,ACC210,2001-2002,2,3,,,
1,1995-1996ACC220,ACC220,1995-1996,2,3,,,
2,1997-1998ACC320,ACC320,1997-1998,2,3,,,
3,1994-1995ACC421,ACC421,1994-1995,2,3,,,
4,1997-1998ACS211,ACS211,1997-1998,1,3,,,


## Data Cleaning and Preprocessing 

### The Registration Data

In [11]:
# Look at the first ten registration rows before cleaning
Registration.head(n=10)

Unnamed: 0,Matric_Number,Session,Semester,Year
0,3,2000-2001,1,1.0
1,3,2000-2001,2,1.0
2,3,2002-2003,1,2.0
3,3,2002-2003,2,2.0
4,3,2003-2004,1,3.0
5,3,2003-2004,2,3.0
6,3,2005-2006,1,4.0
7,3,2005-2006,2,4.0
8,3,2006-2007,1,4.0
9,4,2000-2001,1,1.0


#### Data types

In [12]:
# Column dtypes and non-null counts of the Registration table
Registration.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13422 entries, 0 to 13421
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Matric_Number  13422 non-null  int64  
 1   Session        13422 non-null  object 
 2   Semester       13422 non-null  int64  
 3   Year           13419 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 419.6+ KB


<br>

The `Year` column denotes the level of the student and should therefore be stored as an integer. It will be converted to integer below.


Before going on to convert the Year column to integer, it is important to check for missing values and handle them appropriately


#### Missing Values

In [13]:
# Number of missing values in each Registration column
Registration.isna().sum()

Matric_Number    0
Session          0
Semester         0
Year             3
dtype: int64

In [14]:
# Remove every row that contains a null; the only nulls present are the
# three missing `Year` entries counted in the previous cell.
Registration = Registration.dropna(axis=0, how='any')

In [15]:
# Confirm that no missing values remain in Registration
Registration.isna().sum()

Matric_Number    0
Session          0
Semester         0
Year             0
dtype: int64

In [16]:
# Store the student's level (`Year`) as whole numbers rather than floats
Registration = Registration.astype({'Year': int})

# Inspect the result
Registration.head()

Unnamed: 0,Matric_Number,Session,Semester,Year
0,3,2000-2001,1,1
1,3,2000-2001,2,1
2,3,2002-2003,1,2
3,3,2002-2003,2,2
4,3,2003-2004,1,3


<br>

Since the Registration table contains one row for every semester in which a student registered during their programme, each matric number is expected to appear at least 8 times for students who completed the programme. Any matric number that occurred fewer than 8 times will be dropped, as those students are assumed to have either dropped out or transferred to another programme and will not be considered in this analysis.

In [17]:
# Minimum number of registration rows a matric number must have to be kept.
# The threshold is inclusive: students with exactly 8 registrations are retained.
MIN_REGISTRATIONS = 8

# Number of registration rows recorded for each matric number
count = Registration['Matric_Number'].value_counts()

# Matric numbers that occur at least MIN_REGISTRATIONS times
valid_matric_number = count[count >= MIN_REGISTRATIONS].index

# Keep only the rows of those students. `.copy()` makes the result independent
# of `Registration`, so later column assignments on it do not trigger
# SettingWithCopyWarning or silently fail to apply.
Registration_cleaned = Registration[Registration['Matric_Number'].isin(valid_matric_number)].copy()

Registration_cleaned.head()

Unnamed: 0,Matric_Number,Session,Semester,Year
0,3,2000-2001,1,1
1,3,2000-2001,2,1
2,3,2002-2003,1,2
3,3,2002-2003,2,2
4,3,2003-2004,1,3


<br>

## Biodata

In [18]:
# Look at the first ten biodata rows before cleaning
Biodata.head(n=10)

Unnamed: 0,Matric_Number,Sex,Marital_Status,Religion,State_of_Origin,Nationality,YOA
0,7,Female,Single,Christian,Osun,Nigerian,2000.0
1,8,Male,Single,Christian,,Nigerian,2000.0
2,9,Male,Single,muslim,lagos,Nigerian,2000.0
3,10,Female,Single,Christian,,Nigerian,2000.0
4,11,Male,Single,Christian,,Nigerian,2000.0
5,12,Female,Single,Christian,,Nigerian,2000.0
6,13,Male,Single,Christian,,Nigerian,2000.0
7,14,Female,Single,Christian,,Nigerian,2000.0
8,15,Female,Single,Christian,Ondo,Nigerian,2000.0
9,16,Male,Single,Christian,,Nigerian,2000.0


#### Data Types

In [19]:
# Column dtypes and non-null counts of the Biodata table
Biodata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Matric_Number    1433 non-null   int64  
 1   Sex              1433 non-null   object 
 2   Marital_Status   1419 non-null   object 
 3   Religion         1406 non-null   object 
 4   State_of_Origin  928 non-null    object 
 5   Nationality      1417 non-null   object 
 6   YOA              1430 non-null   float64
dtypes: float64(1), int64(1), object(5)
memory usage: 78.5+ KB


<br>

The **`YOA`** (Year of Admission) column is expected to be an integer and will be converted. Before converting the column to integer, it is imperative to handle its missing values. Missing values in the whole of the Biodata dataframe will therefore be handled next.

#### Missing Values

In [20]:
# Number of missing values in each Biodata column
Biodata.isna().sum()

Matric_Number        0
Sex                  0
Marital_Status      14
Religion            27
State_of_Origin    505
Nationality         16
YOA                  3
dtype: int64

The figures above show the missing values in the dataframe. Missing values in the **Marital_Status** column will be filled with `Single`, as it is rather safe to make such an assumption. Missing values in **Religion** and **State_of_Origin** will be filled with the placeholder `-`, the **Nationality** column will be filled with `Nigerian`, and the missing values of **YOA**, representing the `Year of Admission`, were handled manually in the Excel file where possible; the few that remain are dropped further below.

In [21]:
# Replacement value for missing entries, per column:
#   Marital_Status  -> 'Single'
#   Religion        -> '-' (placeholder for "not recorded")
#   State_of_Origin -> '-' (placeholder for "not recorded")
#   Nationality     -> 'Nigerian'
# YOA is deliberately absent, so its missing values are left untouched here.
biodata_fill_values = {
    'Marital_Status': 'Single',
    'Religion': '-',
    'State_of_Origin': '-',
    'Nationality': 'Nigerian',
}

# Fill on the DataFrame itself and rebind the name. Calling
# `Biodata[col].fillna(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about it, and with Copy-on-Write enabled the
# DataFrame is not modified at all.
Biodata = Biodata.fillna(value=biodata_fill_values)

#### Validating that missing values have been handled

In [22]:
# Count the missing values that remain in each Biodata column
Biodata.isna().sum()

Matric_Number      0
Sex                0
Marital_Status     0
Religion           0
State_of_Origin    0
Nationality        0
YOA                3
dtype: int64

The result above shows that there are three missing values in the `YOA` column. These missing values result from matric numbers that are missing from the Registration data.

#### Dropping the Missing Values and converting `YOA` column to Integer

In [23]:
# Discard the rows that still contain a null value
Biodata = Biodata.dropna(axis=0, how='any')

# Store the year of admission (`YOA`) as whole numbers rather than floats
Biodata = Biodata.astype({'YOA': int})

# Inspect the result
Biodata.head()

Unnamed: 0,Matric_Number,Sex,Marital_Status,Religion,State_of_Origin,Nationality,YOA
0,7,Female,Single,Christian,Osun,Nigerian,2000
1,8,Male,Single,Christian,-,Nigerian,2000
2,9,Male,Single,muslim,lagos,Nigerian,2000
3,10,Female,Single,Christian,-,Nigerian,2000
4,11,Male,Single,Christian,-,Nigerian,2000


#### Number of Unique Entries

To ensure consistency in the data, it is pertinent to check the unique entries in each column of the dataframe and then handle any inconsistency

In [26]:
# Collect and print the distinct values of every Biodata column except the
# identifier and year columns, so spelling/case variants become visible.
exempted_columns = ['Matric_Number', 'YOA']  # columns skipped by the loop

# Maps each inspected column name to the list of its distinct values
biodata_unique_entry = {}

for column in Biodata.columns:
    if column in exempted_columns:
        continue
    distinct_values = Biodata[column].unique()
    biodata_unique_entry[column] = list(distinct_values)
    print(column, ':\n', distinct_values, '\n\n')

Sex :
 ['Female' 'Male' 'female' 'MALE' 'FEMALE' 'Femal'] 


Marital_Status :
 ['Single' 'Male' 'Married' 'Maried'] 


Religion :
 ['Christian' 'muslim' 'Islam' 'Muslim' 'Muslum' 'Eckankar' 'Christain'
 'Cristian' '-' 'Kano' 'Chritsian' 'ISLAM' 'CHRISTIAN' 'Isam' 'Christan'] 


State_of_Origin :
 ['Osun' '-' 'lagos' 'Ondo' 'Edo' 'Ogun' 'Ekiti' 'Anambra' 'Rivers' 'Imo'
 'Cross-River' 'Lagos' 'Kogi' 'Oyo' 'Abia' 'Delta' 'kogi' 'Ebonyi' 'Kwara'
 'Niger' 'Akwa Ibom' 'Awa Ibom' 'Kebbi' 'C/River' 'Ijebu-Ode' 'Enugu'
 'Bayelsa' 'OYO' 'Borno' 'Bornu' 'Taraba' 'Benue' 'Gombe' 'Adamawa' 'Kano'
 'Akwa-Ibom' 'Nassarawa' 'Plateau' 'Cross River' 'EKITI' 'OSUN' 'LAGOS'
 'ONDO' 'KWARA' 'OGUN' 'NIGER' 'IMO' 'BAYELSA' 'RIVER' 'EBONY' 'Douala'
 'Ndukwe East' 'Ondoi' 'Ikorodu' 'Malabo'] 


Nationality :
 ['Nigerian' 'Camerounian' 'E. Guinea'] 




<br>

Looking at the result above, it can be observed that the same words appear with variations in case and spelling. These entries will be standardised to ensure consistency across the table.

In [27]:
# Mapping dictionaries: every raw spelling/case variant observed in a column is
# mapped to one canonical label. Values that cannot be a valid entry for the
# column are mapped to 'Unknown'.

gender_mapping = {
    'Female': 'Female', 'female': 'Female', 'FEMALE': 'Female', 'Femal': 'Female',
    'Male': 'Male', 'MALE': 'Male'
}

marital_status_mapping = {
    'Single': 'Single', 'Married': 'Married', 'Maried': 'Married',
    # 'Male' appears in this column but is not a marital status
    'Male': 'Unknown'
}

religion_mapping = {
    'Christian': 'Christian', 'Christain': 'Christian', 'Cristian': 'Christian', 'Chritsian': 'Christian',
    'CHRISTIAN': 'Christian', 'Christan': 'Christian', 'muslim': 'Muslim', 'Islam': 'Muslim',
    'Muslim': 'Muslim', 'Muslum': 'Muslim', 'ISLAM': 'Muslim', 'Isam': 'Muslim',
    'Eckankar': 'Eckankar',
    # '-' is the placeholder used for missing values; 'Kano' is a place name, not a religion
    '-': 'Unknown', 'Kano': 'Unknown'
}

state_mapping = {
    'Osun': 'Osun', 'OSUN': 'Osun', 'lagos': 'Lagos', 'LAGOS': 'Lagos', 'Lagos': 'Lagos',
    'Ondo': 'Ondo', 'ONDO': 'Ondo', 'Ondoi': 'Ondo', 'Edo': 'Edo', 'Ogun': 'Ogun', 'OGUN': 'Ogun',
    'Ekiti': 'Ekiti', 'EKITI': 'Ekiti', 'Anambra': 'Anambra', 'Rivers': 'Rivers', 'RIVER': 'Rivers',
    'Imo': 'Imo', 'IMO': 'Imo', 'Cross-River': 'Cross River', 'Cross River': 'Cross River',
    'C/River': 'Cross River', 'Kogi': 'Kogi', 'kogi': 'Kogi', 'KOGI': 'Kogi', 'Oyo': 'Oyo', 'OYO': 'Oyo',
    'Abia': 'Abia', 'Delta': 'Delta',
    # 'EBONY' is the variant that actually occurs in the data; 'EBONYI' is kept for safety
    'Ebonyi': 'Ebonyi', 'EBONYI': 'Ebonyi', 'EBONY': 'Ebonyi',
    'Kwara': 'Kwara', 'KWARA': 'Kwara',
    'Niger': 'Niger', 'NIGER': 'Niger', 'Akwa Ibom': 'Akwa Ibom', 'Awa Ibom': 'Akwa Ibom',
    'Akwa-Ibom': 'Akwa Ibom', 'Kebbi': 'Kebbi', 'Enugu': 'Enugu',
    'Bayelsa': 'Bayelsa', 'BAYELSA': 'Bayelsa', 'Borno': 'Borno', 'Bornu': 'Borno', 'Taraba': 'Taraba',
    'Benue': 'Benue', 'Gombe': 'Gombe', 'Adamawa': 'Adamawa', 'Kano': 'Kano', 'Nassarawa': 'Nasarawa',
    'Plateau': 'Plateau',
    # '-' is the placeholder used for missing values
    '-': 'Unknown',
    # Entries that are towns/cities rather than states are treated uniformly as 'Unknown'.
    # NOTE(review): Ijebu-Ode and Ikorodu are towns in Ogun and Lagos states respectively;
    # map them to those states instead if recovering the state is preferred - confirm.
    'Ijebu-Ode': 'Unknown', 'Ikorodu': 'Unknown',
    'Douala': 'Unknown', 'Ndukwe East': 'Unknown', 'Malabo': 'Unknown'
}

# Apply mappings (values not listed in a mapping are left unchanged by `replace`)
Biodata['Sex'] = Biodata['Sex'].replace(gender_mapping)
Biodata['Marital_Status'] = Biodata['Marital_Status'].replace(marital_status_mapping)
Biodata['Religion'] = Biodata['Religion'].replace(religion_mapping)
Biodata['State_of_Origin'] = Biodata['State_of_Origin'].replace(state_mapping)



Cleaned DataFrame:


NameError: name 'df' is not defined