Cleaning Data Demos

In [95]:
import pandas as pd
import numpy as np

In [96]:
students = pd.read_csv("student_records_dirty.csv")

In [97]:
students.head()

Unnamed: 0,student_id,first_name,last_name,age,major,enrollment_date,course_grade,gpa,tuition_paid
0,17270,Taylor,Chen,23,Bio,2/28/2024,12,3.8,17548.1
1,10860,Olivia,Martinez,73,Computer Science,30-Jan-24,12,2.7,13129.13
2,15390,Priya,Garcia,52,bio,2/10/2024,95,1.9,19051.36
3,15191,Priya,Anderson,61,CS,2/8/2024,3,2.0,17242.61
4,15734,Noah,Wilson,36,Comp Sci,8-Feb-24,-20,3.2,


In [98]:
#completely random info every time you roll
students.sample(5)

Unnamed: 0,student_id,first_name,last_name,age,major,enrollment_date,course_grade,gpa,tuition_paid
255,14199,Alex,Patel,43,Biology,3/7/2024,95,3.8,21642.7
291,17022,Evan,Anderson,69,Bio,5/24/2024,12,3.8,18882.82
337,10784,Ben,Anderson,26,bio,20-Apr-24,ninety,1.9,19474.87
49,17099,Noah,Smith,25,Computer Science,17-Feb-24,52,2.0,21412.0
97,15892,Aisha,Lopez,49,BIO,12-Feb-24,0,,19281.17


In [99]:
students.isna().sum()

student_id          0
first_name          0
last_name           0
age                 0
major               0
enrollment_date     0
course_grade       29
gpa                94
tuition_paid       21
dtype: int64

In [100]:
students["course_grade"] = students["course_grade"].replace("\"N/A\"", np.nan)

In [101]:
students["course_grade"].value_counts(dropna=False)

course_grade
100         36
12          34
NaN         32
 ninety     32
95          27
-20         27
0           27
52          25
64          25
85          25
88          24
3           22
76          19
Name: count, dtype: int64

In [102]:
students = students.drop_duplicates()

In [103]:
students.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
345    False
346    False
347    False
348    False
349    False
Length: 350, dtype: bool

In [104]:
students["major"] = students["major"].str.strip().str.capitalize()

students["major"] = students["major"].replace({
    "Comp sci" : "Computer science",
    "Cs" : "Computer science",
    "Computer science" : "Computer Science",
    "Bio" : "Biology",
    "Psych" : "Psychology"
})

In [105]:
students["major"].value_counts()

major
Biology             117
Psychology          115
Computer science     68
Computer Science     50
Name: count, dtype: int64

In [106]:
students["course_grade"].value_counts()

course_grade
100         36
12          34
 ninety     31
0           27
95          26
-20         26
64          25
52          25
85          24
88          24
3           21
76          19
Name: count, dtype: int64

In [107]:
students["course_grade"] = students["course_grade"].str.strip()
students.loc[students["course_grade"] == "ninety", "course_grade"] = 90

In [108]:
students["course_grade"].value_counts()

course_grade
100    36
12     34
90     31
0      27
95     26
-20    26
64     25
52     25
85     24
88     24
3      21
76     19
Name: count, dtype: int64

In [109]:
students["age"].value_counts(dropna=False)

age
 74    12
 32    11
 33     9
 85     9
 79     8
       ..
 87     2
-12     2
 47     2
 57     1
 27     1
Name: count, Length: 75, dtype: int64

In [110]:
students.loc[students["age"] < 0, "age"] = np.nan
students.loc[students["age"] > 110, "age"] = np.nan


In [111]:
students['age'].describe()

count    345.000000
mean      52.617391
std       20.951321
min       17.000000
25%       33.000000
50%       52.000000
75%       71.000000
max       89.000000
Name: age, dtype: float64

In [112]:
#create a columnb combining first and last names
students["full_name"] = students["last_name"] + ', ' + students["first_name"]
students.head()

Unnamed: 0,student_id,first_name,last_name,age,major,enrollment_date,course_grade,gpa,tuition_paid,full_name
0,17270,Taylor,Chen,23.0,Biology,2/28/2024,12,3.8,17548.1,"Chen, Taylor"
1,10860,Olivia,Martinez,73.0,Computer Science,30-Jan-24,12,2.7,13129.13,"Martinez, Olivia"
2,15390,Priya,Garcia,52.0,Biology,2/10/2024,95,1.9,19051.36,"Garcia, Priya"
3,15191,Priya,Anderson,61.0,Computer science,2/8/2024,3,2.0,17242.61,"Anderson, Priya"
4,15734,Noah,Wilson,36.0,Computer science,8-Feb-24,-20,3.2,,"Wilson, Noah"


In [113]:
students["age_group"] = pd.cut(
    students["age"],
    bins=[0, 25, 65, 120],
    labels=["Young Adults", "Working Adults", "Retired"],
)

In [114]:
students[["age", "age_group"]].value_counts(dropna=False)

age   age_group     
74.0  Retired           12
32.0  Working Adults    11
33.0  Working Adults     9
85.0  Retired            9
79.0  Retired            8
                        ..
64.0  Working Adults     2
86.0  Retired            2
87.0  Retired            2
57.0  Working Adults     1
27.0  Working Adults     1
Name: count, Length: 74, dtype: int64

In [115]:
students["enrollment_date"] = pd.to_datetime(students["enrollment_date"])

ValueError: time data "30-Jan-24" doesn't match format "%m/%d/%Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.