In [5]:
import os                     # For reading environment-variable overrides
from datetime import datetime # For date and time handling

import numpy as np           # For numerical operations
import pandas as pd          # For data manipulation and analysis

In [6]:


# Load the dataset.
# The location can be overridden with the TRESTLE_DATASET_PATH environment
# variable so the notebook is not tied to one machine's directory layout;
# when the variable is unset the original hard-coded path is used unchanged.
file_path = os.environ.get(
    'TRESTLE_DATASET_PATH',
    'C:/Users/HP/Downloads/trestle_academy_dataset.csv',
)
data = pd.read_csv(file_path)

# Preview the leading rows of the dataset (at most 50)
data.head(50)

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,Female,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,Female,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,Data Engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,Cloud Computing,2023-05-26,65,No
5,S0006,Student_6,21,Female,Data Engineering,2023-08-29,74,No
6,S0007,Student_7,25,Male,AI Fundamentals,2023-08-25,64,Yes
7,S0008,Student_8,27,Female,AI Fundamentals,2023-08-09,60,No
8,S0009,Student_9,37,Male,AI Fundamentals,2023-06-03,54,No
9,S0010,Student_10,39,Male,AI Fundamentals,2023-12-17,70,No


In [7]:
# Show the dataset dimensions as a (rows, columns) tuple
data.shape

(1000, 8)

In [8]:
# Count the null entries in every column (isna is the alias of isnull)
missing_values = data.isna().sum()

# Report the per-column totals
print(
    "Missing values per column:\n",
    missing_values,
)


Missing values per column:
 student_id         0
name               0
age                0
gender             0
course             0
enrollment_date    0
final_grade        0
is_intern          0
dtype: int64


In [9]:
# Re-count the null entries per column and print the totals
print(
    "Remaining missing values:\n",
    data.isna().sum(),
)

Remaining missing values:
 student_id         0
name               0
age                0
gender             0
course             0
enrollment_date    0
final_grade        0
is_intern          0
dtype: int64


In [10]:
# Ensure 'age' is an integer
data['age'] = data['age'].astype(int)

# Convert 'enrollment_date' to datetime format.
# errors='coerce' replaces unparseable values with NaT instead of raising,
# which would otherwise go unnoticed because the missing-value checks ran
# before this conversion. Count the values that were non-null before parsing
# but null afterwards and report them so the loss is visible.
raw_enrollment = data['enrollment_date']
parsed_enrollment = pd.to_datetime(raw_enrollment, errors='coerce')
unparseable_dates = parsed_enrollment.isna() & raw_enrollment.notna()
if unparseable_dates.any():
    print(
        f"Warning: {int(unparseable_dates.sum())} 'enrollment_date' value(s) "
        "could not be parsed and were set to NaT"
    )
data['enrollment_date'] = parsed_enrollment

# Check and display the data types after conversion
print("Data types after standardization:\n", data.dtypes)

Data types after standardization:
 student_id                 object
name                       object
age                         int64
gender                     object
course                     object
enrollment_date    datetime64[ns]
final_grade                 int64
is_intern                  object
dtype: object


In [7]:
# Standardize 'course' column to title case.
# str.title() lower-cases every letter after the first one in a word, so an
# acronym such as "AI" in "AI Fundamentals" would come out as "Ai".
# Restore known acronyms after title-casing; extend the mapping if new
# courses with acronyms are added.
course_acronyms = {'Ai': 'AI'}

course_titled = data['course'].str.title()
for mangled, acronym in course_acronyms.items():
    # \b anchors keep the replacement to whole words (e.g. "Aim" is untouched)
    course_titled = course_titled.str.replace(
        rf'\b{mangled}\b', acronym, regex=True
    )
data['course'] = course_titled

# Display the first few rows to verify changes
print(data[['course']].head())

             course
0      Data Science
1      Data Science
2  Data Engineering
3  Data Engineering
4   Cloud Computing


In [12]:
# Filter rows where age is within the range of 18 to 45 (both bounds inclusive).
# .copy() makes the filtered frame an independent object so that later column
# assignments on `data` do not trigger pandas' SettingWithCopyWarning.
data = data[data['age'].between(18, 45)].copy()

# Display the shape of the dataset after filtering
print("Dataset shape after filtering:", data.shape)

# Preview the data to confirm
print(data.head())

Dataset shape after filtering: (1000, 8)
  student_id       name  age  gender            course enrollment_date  \
0      S0001  Student_1   30  Female      Data Science      2023-03-22   
1      S0002  Student_2   33  Female      Data Science      2023-01-29   
2      S0003  Student_3   39  Female  Data Engineering      2023-12-24   
3      S0004  Student_4   18    Male  Data Engineering      2023-01-09   
4      S0005  Student_5   21  Female   Cloud Computing      2023-05-26   

   final_grade is_intern  
0           97        No  
1           64        No  
2           97        No  
3           89       Yes  
4           65        No  


In [13]:
# Normalize 'is_intern': trim surrounding whitespace first, then capitalize
# (first letter upper-case, the rest lower-case)
intern_trimmed = data['is_intern'].str.strip()
data['is_intern'] = intern_trimmed.str.capitalize()

# List the distinct values that remain after normalization
print(
    "Unique values in 'is_intern' column after standardization:",
    data['is_intern'].unique(),
)

# Show the first rows of the column as a sanity check
print(data[['is_intern']].head())

Unique values in 'is_intern' column after standardization: ['No' 'Yes']
  is_intern
0        No
1        No
2        No
3       Yes
4        No


In [14]:
# Destination for the cleaned data (relative to the working directory)
cleaned_file_path = 'cleaned_trestle_academy_dataset.csv'

# Write the frame to CSV, leaving the row index out of the file
data.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"Cleaned dataset saved to {cleaned_file_path}")

Cleaned dataset saved to cleaned_trestle_academy_dataset.csv


In [15]:
# Preview the first five rows of the cleaned frame
data.head(n=5)

Unnamed: 0,student_id,name,age,gender,course,enrollment_date,final_grade,is_intern
0,S0001,Student_1,30,Female,Data Science,2023-03-22,97,No
1,S0002,Student_2,33,Female,Data Science,2023-01-29,64,No
2,S0003,Student_3,39,Female,Data Engineering,2023-12-24,97,No
3,S0004,Student_4,18,Male,Data Engineering,2023-01-09,89,Yes
4,S0005,Student_5,21,Female,Cloud Computing,2023-05-26,65,No
