## LOAD DATASET

In [1]:
import pandas as pd

In [7]:
# Declaring the file path to the dataset.
file_path = "trestle_academy_dataset.csv"

# Reading the content of the dataset using pandas
data = pd.read_csv(file_path)

In [13]:
# Previewing the first few rows
data.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,Female,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,Female,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,Data Engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,Cloud Computing,2023-05-26,65,No


In [24]:
# Displaying information about the dataset
data.isna()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False
996,False,False,False,False,False,False,False,False
997,False,False,False,False,False,False,False,False
998,False,False,False,False,False,False,False,False


# Identify and Handle Missing Values

In [17]:
# Displaying information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   student_id       1000 non-null   object
 1   name             1000 non-null   object
 2   age              1000 non-null   int64 
 3   gender           1000 non-null   object
 4   course           1000 non-null   object
 5   enrollment_date  1000 non-null   object
 6   final_grade      1000 non-null   int64 
 7   is_intern        1000 non-null   object
dtypes: int64(2), object(6)
memory usage: 62.6+ KB


## STANDARDIZE DATA TYPES
* Convert *enrollment_date* to a date format

In [25]:
# Converting enrollment_date to a date format
data['enrollment_date'] = pd.to_datetime(data['enrollment_date'], errors='coerce')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   student_id       1000 non-null   object        
 1   name             1000 non-null   object        
 2   age              1000 non-null   int64         
 3   gender           1000 non-null   object        
 4   course           1000 non-null   object        
 5   enrollment_date  1000 non-null   datetime64[ns]
 6   final_grade      1000 non-null   int64         
 7   is_intern        1000 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 62.6+ KB


## Normalizing Text Data

In [43]:
# Modifying these sections to allow for consistency across the datasets.
data['course'] = data['course'].str.title()
data['gender'] = data['gender'].str.upper()
data['is_intern'] = data['is_intern'].str.upper()

In [50]:
data.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,FEMALE,Data Science,2023-03-22,97,NO
1,S0002,Student_2,33,FEMALE,Data Science,2023-01-29,64,NO
2,S0003,Student_3,39,FEMALE,Data Engineering,2023-12-24,97,NO
3,S0004,Student_4,18,MALE,Data Engineering,2023-01-09,89,YES
4,S0005,Student_5,21,FEMALE,Cloud Computing,2023-05-26,65,NO


In [49]:
data.tail()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
995,S0996,Student_996,25,FEMALE,Data Engineering,2023-05-25,52,YES
996,S0997,Student_997,40,MALE,Ai Fundamentals,2023-08-15,90,NO
997,S0998,Student_998,20,FEMALE,Cloud Computing,2023-04-22,100,NO
998,S0999,Student_999,40,FEMALE,Cloud Computing,2023-07-04,67,NO
999,S1000,Student_1000,34,MALE,Ai Fundamentals,2023-09-09,90,YES


## Filter unwanted data based on age range
* Remove any rows where age is outside 18-45.

In [66]:
# Remove any rows where age is outside 18-45.
data = data[(data['age'] >= 18) & (data['age'] <= 45)]

## Correct Inconsistent Entries
* Standardize values in *is_intern* (e.g., "Yes" or "No").


In [75]:
# Standardize values in is_intern (e.g., "Yes" or "No").
data['is_intern'] = data['is_intern'].apply(lambda x: 'Yes' if x.lower() == 'yes' else 'No')

In [76]:
data.head()

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,FEMALE,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,FEMALE,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,FEMALE,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,MALE,Data Engineering,2023-01-09,89,No
4,S0005,Student_5,21,FEMALE,Cloud Computing,2023-05-26,65,No


## SAVE CLEANED DATA

In [79]:
cleaned_file_path = ('cleaned_trestle_academy_dataset.csv')
data.to_csv(cleaned_file_path, index='False')