In [None]:
# Solution to the Hands-On Exercise

In [1]:
import pandas as pd

# Part 1: Working with the Tab-Delimited File
# The banner and its underline go out in a single call, one per line.
print("WORKING WITH TAB-DELIMITED FILE", "-------------------------------", sep="\n")

# Read the tab-separated file; the first row supplies the column headers.
historical_df = pd.read_csv("historical_figures.txt", sep="\t")

# Preview the first rows exactly as loaded, before any cleaning is applied.
print("Original data:", historical_df.head(), sep="\n")


WORKING WITH TAB-DELIMITED FILE
-------------------------------
Original data:
    First   name    last_name  birth_Year     Field_of_work  \
0            Ada     Lovelace        1815       Mathematics   
1           Alan       Turing        1912  computer Science   
2         Grace        hopper        1906  computer science   
3           John  von Neumann        1903       mathematics   
4    Katherine        JOHNSON        1918       mathematics   

       Notable_contribution  
0          First programmer  
1            TURING MACHINE  
2            cobol language  
3  Von Neumann architecture  
4         NASA calculations  


In [2]:

# Step 1: Clean column names
# Trim the ends of every header, lower-case it, and turn underscores into
# spaces. Whitespace *inside* a header is not collapsed, so a header such as
# "First   name" keeps its three inner spaces.
normalized_headers = (
    historical_df.columns
    .str.strip()
    .str.lower()
    .str.replace('_', ' ', regex=False)
)
historical_df.columns = normalized_headers
print("\nAfter cleaning column names:")
print(historical_df.columns)



After cleaning column names:
Index(['first   name', 'last name', 'birth year', 'field of work',
       'notable contribution'],
      dtype='object')


In [3]:

# Step 2: Clean the first name column
# The header still carries three inner spaces at this point, so the key is
# spelled once and reused to avoid a mismatched literal.
first_name_col = 'first   name'

# Drop leading and trailing whitespace from every first name.
stripped_first_names = historical_df[first_name_col].str.strip()
historical_df[first_name_col] = stripped_first_names

print("\nAfter cleaning first names:")
print(historical_df[first_name_col])




After cleaning first names:
0          Ada
1         Alan
2        Grace
3         John
4    Katherine
Name: first   name, dtype: object


In [None]:
# Step 3: Clean the last name column
# Step 1 replaced underscores with spaces, so this column is now called
# 'last name'. Indexing with 'last_name' raises KeyError at this point.
# Note: str.title() capitalises every word, so 'von Neumann' becomes
# 'Von Neumann'.
historical_df['last name'] = historical_df['last name'].str.title()
print("\nAfter standardizing last names:")
print(historical_df['last name'])

# Step 4: Rename columns to be more descriptive
# The keys must be the space-separated names produced by Step 1. By default
# rename() silently skips keys that do not exist, which is how the previous
# underscore-spelled keys went unnoticed; errors='raise' turns any future
# mismatch into an immediate KeyError. 'last name' and 'birth year' are
# renamed as well so that every final column uses snake_case.
historical_df = historical_df.rename(
    columns={
        'first   name': 'first_name',
        'last name': 'last_name',
        'birth year': 'birth_year',
        'field of work': 'discipline',
        'notable contribution': 'major_achievement',
    },
    errors='raise',
)
print("\nAfter renaming columns:")
print(historical_df.columns)

# Final cleaned data
print("\nFinal cleaned historical figures data:")
print(historical_df)

# Part 2: Working with the Excel File
print("\n\nWORKING WITH EXCEL FILE")
print("----------------------")

# See what sheets are available, then load the enrollment sheet.
# ExcelFile keeps the workbook open until it is closed, so it is used as a
# context manager here; the Enrollment sheet is read from the same handle
# instead of opening the file a second time.
with pd.ExcelFile("university_data.xlsx") as excel_file:
    print("Available sheets:")
    print(excel_file.sheet_names)

    # Load the enrollment data
    enrollment_df = pd.read_excel(excel_file, sheet_name="Enrollment")

print("\nOriginal enrollment data:")
print(enrollment_df.head())

# Step 1: Clean university names
# Strip surrounding whitespace and standardize to Title Case so the names can
# be compared across sheets.
enrollment_df['university'] = enrollment_df['university'].str.strip().str.title()
print("\nAfter cleaning university names:")
print(enrollment_df['university'])

# Step 2: Clean the "International students" column name
# The source header has three spaces between the words; rename() leaves the
# frame unchanged if that exact header is not present.
enrollment_df = enrollment_df.rename(columns={'International   students': 'international_students'})

# Step 3: Create a new column showing the ratio of international to total students
# Despite the column name, the value is a percentage (the quotient is
# multiplied by 100) rounded to one decimal place.
enrollment_df['international_ratio'] = (enrollment_df['international_students'] / enrollment_df['total_students'] * 100).round(1)
print("\nAdded a new calculated column:")
print(enrollment_df[['university', 'international_students', 'total_students', 'international_ratio']])

# Final cleaned enrollment data
print("\nFinal cleaned enrollment data:")
print(enrollment_df)

# Bonus: Load and clean the tuition data
tuition_df = pd.read_excel("university_data.xlsx", sheet_name="Tuition")
print("\nOriginal tuition data:", tuition_df.head(), sep="\n")

# Old header -> new header, spelled out in full for consistency.
tuition_column_names = {
    'undergrad_tuition': 'undergraduate_tuition',
    'grad_tuition': 'graduate_tuition',
    'Average_financial_aid': 'average_financial_aid',
}

# Normalize the university names the same way as the enrollment sheet
# (trimmed, Title Case) so the join key lines up, then apply the renames.
tuition_df = (
    tuition_df
    .assign(university=tuition_df['university'].str.strip().str.title())
    .rename(columns=tuition_column_names)
)

# Final cleaned tuition data
print("\nFinal cleaned tuition data:", tuition_df, sep="\n")

# Bonus: Merge the two datasets
# Default (inner) join: only universities present in both frames are kept.
print("\nMerging enrollment and tuition data:")
merged_df = enrollment_df.merge(tuition_df, on='university')
print(merged_df.head())

In [None]:
# Part 1: Working with the Tab-Delimited File
# This cell repeats the step-by-step cells above as one consolidated script,
# starting again from the raw file.
print("WORKING WITH TAB-DELIMITED FILE")
print("-------------------------------")

# Load the historical figures data
historical_df = pd.read_csv("historical_figures.txt", sep="\t")
print("Original data:")
print(historical_df.head())

# Step 1: Clean column names
# Strip surrounding whitespace, convert to lowercase, and turn underscores
# into spaces. Inner whitespace is kept, so "First   name" becomes
# 'first   name' and "last_name" becomes 'last name'.
historical_df.columns = [col.strip().lower().replace('_', ' ') for col in historical_df.columns]
print("\nAfter cleaning column names:")
print(historical_df.columns)

# Step 2: Clean the first name column
# Remove leading and trailing whitespace
historical_df['first   name'] = historical_df['first   name'].str.strip()
print("\nAfter cleaning first names:")
print(historical_df['first   name'])

# Step 3: Clean the last name column
# After Step 1 the column is called 'last name'; indexing with 'last_name'
# raises KeyError. str.title() capitalises every word, so 'von Neumann'
# becomes 'Von Neumann'.
historical_df['last name'] = historical_df['last name'].str.title()
print("\nAfter standardizing last names:")
print(historical_df['last name'])

# Step 4: Rename columns to be more descriptive
# Keys must be the space-separated names produced by Step 1. rename() skips
# unknown keys silently by default, so errors='raise' is passed to surface
# any mismatch. 'last name' and 'birth year' are renamed too so that every
# final column uses snake_case.
historical_df = historical_df.rename(
    columns={
        'first   name': 'first_name',
        'last name': 'last_name',
        'birth year': 'birth_year',
        'field of work': 'discipline',
        'notable contribution': 'major_achievement',
    },
    errors='raise',
)
print("\nAfter renaming columns:")
print(historical_df.columns)

# Final cleaned data
print("\nFinal cleaned historical figures data:")
print(historical_df)

# Part 2: Working with the Excel File
print("\n\nWORKING WITH EXCEL FILE")
print("----------------------")

# See what sheets are available, then load the enrollment sheet.
# ExcelFile keeps the workbook open until it is closed, so it is used as a
# context manager here; the Enrollment sheet is read from the same handle
# instead of opening the file a second time.
with pd.ExcelFile("university_data.xlsx") as excel_file:
    print("Available sheets:")
    print(excel_file.sheet_names)

    # Load the enrollment data
    enrollment_df = pd.read_excel(excel_file, sheet_name="Enrollment")

print("\nOriginal enrollment data:")
print(enrollment_df.head())

# Step 1: Clean university names
# Strip surrounding whitespace and standardize to Title Case so the names can
# be compared across sheets.
enrollment_df['university'] = enrollment_df['university'].str.strip().str.title()
print("\nAfter cleaning university names:")
print(enrollment_df['university'])

# Step 2: Clean the "International students" column name
# The source header has three spaces between the words; rename() leaves the
# frame unchanged if that exact header is not present.
enrollment_df = enrollment_df.rename(columns={'International   students': 'international_students'})

# Step 3: Create a new column showing the ratio of international to total students
# Despite the column name, the value is a percentage (the quotient is
# multiplied by 100) rounded to one decimal place.
enrollment_df['international_ratio'] = (enrollment_df['international_students'] / enrollment_df['total_students'] * 100).round(1)
print("\nAdded a new calculated column:")
print(enrollment_df[['university', 'international_students', 'total_students', 'international_ratio']])

# Final cleaned enrollment data
print("\nFinal cleaned enrollment data:")
print(enrollment_df)

# Bonus: Load and clean the tuition data
tuition_df = pd.read_excel("university_data.xlsx", sheet_name="Tuition")
print("\nOriginal tuition data:", tuition_df.head(), sep="\n")

# Old header -> new header, spelled out in full for consistency.
tuition_column_names = {
    'undergrad_tuition': 'undergraduate_tuition',
    'grad_tuition': 'graduate_tuition',
    'Average_financial_aid': 'average_financial_aid',
}

# Normalize the university names the same way as the enrollment sheet
# (trimmed, Title Case) so the join key lines up, then apply the renames.
tuition_df = (
    tuition_df
    .assign(university=tuition_df['university'].str.strip().str.title())
    .rename(columns=tuition_column_names)
)

# Final cleaned tuition data
print("\nFinal cleaned tuition data:", tuition_df, sep="\n")

# Bonus: Merge the two datasets
# Default (inner) join: only universities present in both frames are kept.
print("\nMerging enrollment and tuition data:")
merged_df = enrollment_df.merge(tuition_df, on='university')
print(merged_df.head())