In [69]:
import pandas as pd

# Load both sheets from the workbook in a single pass. Passing a list to
# `sheet_name` makes read_excel return a dict of DataFrames keyed by sheet
# name, so the file is opened and parsed once instead of once per sheet.
workbook_sheets = pd.read_excel(
    'university_data.xlsx',
    sheet_name=['Enrollment', 'Tuition'],
)
enrollment_df = workbook_sheets['Enrollment']
tuition_df = workbook_sheets['Tuition']

# View basic information: (rows, columns) of each sheet
print(enrollment_df.shape)
print(tuition_df.shape)

# Preview the first rows of each sheet
print(enrollment_df.head())
print(tuition_df.head())


(6, 5)
(5, 4)
            University  total_students  undergrad_percent  grad_percent  \
0  Stanford University           16937               65.2          34.8   
1   harvard university           20050               62.8          37.2   
2                MIT             11376               54.5          45.5   
3                 Yale           12060               58.3          41.7   
4   harvard university           20050               62.8          37.2   

   International students  
0                    3984  
1                    5726  
2                    3787  
3                    2841  
4                    5726  
            university  undergrad_tuition  grad_tuition  Average_financial_aid
0  Stanford University              56169         54315                  52030
1   harvard university              54002         51654                  53604
2                MIT                55878         58240                  48264
3                 Yale              59950         4

In [70]:
# Show the column labels of each sheet first, then the dtype of every
# column, enrollment before tuition in both passes.
loaded_frames = (enrollment_df, tuition_df)

for frame in loaded_frames:
    print(frame.columns)

for frame in loaded_frames:
    print(frame.dtypes)


Index(['University', 'total_students', 'undergrad_percent', 'grad_percent',
       'International students'],
      dtype='object')
Index(['university', 'undergrad_tuition', 'grad_tuition',
       'Average_financial_aid'],
      dtype='object')
University                 object
total_students              int64
undergrad_percent         float64
grad_percent              float64
International students      int64
dtype: object
university               object
undergrad_tuition         int64
grad_tuition              int64
Average_financial_aid     int64
dtype: object


In [71]:
# Flag repeated rows in each sheet. DataFrame.duplicated() marks a row True
# only when every column matches an earlier row, so this is an exact-match
# check on the data as it currently stands.
duplicate_checks = (
    ("Duplicate rows in enrollment data:", enrollment_df),
    ("Duplicate rows in tuition data:", tuition_df),
)

for heading, frame in duplicate_checks:
    repeated_mask = frame.duplicated()
    print(repeated_mask)         # per-row boolean flags
    print(heading)
    print(frame[repeated_mask])  # only the rows flagged as repeats


0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool
Duplicate rows in enrollment data:
           University  total_students  undergrad_percent  grad_percent  \
4  harvard university           20050               62.8          37.2   

   International students  
4                    5726  
0    False
1    False
2    False
3    False
4    False
dtype: bool
Duplicate rows in tuition data:
Empty DataFrame
Columns: [university, undergrad_tuition, grad_tuition, Average_financial_aid]
Index: []


In [72]:
# List the distinct values held by every column, enrollment sheet first and
# tuition sheet second, with a "---" separator after each column.
for frame in (enrollment_df, tuition_df):
    for column_name in frame.columns:
        print(f"Column: {column_name}")
        print(frame[column_name].unique())
        print("---")


Column: University
['Stanford University' 'harvard university' 'MIT  ' '  Yale' 'UC Berkeley']
---
Column: total_students
[16937 20050 11376 12060 42501]
---
Column: undergrad_percent
[65.2 62.8 54.5 58.3 71.5]
---
Column: grad_percent
[34.8 37.2 45.5 41.7 28.5]
---
Column: International students
[3984 5726 3787 2841 6763]
---
Column: university
['Stanford University' 'harvard university' 'MIT  ' '  Yale' 'UC Berkeley']
---
Column: undergrad_tuition
[56169 54002 55878 59950 14226]
---
Column: grad_tuition
[54315 51654 58240 45700 14226]
---
Column: Average_financial_aid
[52030 53604 48264 55879 23736]
---


In [73]:
def _to_snake_case(label):
    """Lower-case `label`, strip surrounding whitespace, and turn spaces into underscores."""
    return label.lower().strip().replace(' ', '_')


# Record the labels as they are before renaming
print("Original enrollment columns:", enrollment_df.columns.tolist())
print("Original tuition columns:", tuition_df.columns.tolist())

# Apply the same normalisation to every column label of both sheets
enrollment_df.columns = list(map(_to_snake_case, enrollment_df.columns))
tuition_df.columns = list(map(_to_snake_case, tuition_df.columns))

# Show the labels after renaming
print("Cleaned enrollment columns:", enrollment_df.columns.tolist())
print("Cleaned tuition columns:", tuition_df.columns.tolist())

Original enrollment columns: ['University', 'total_students', 'undergrad_percent', 'grad_percent', 'International students']
Original tuition columns: ['university', 'undergrad_tuition', 'grad_tuition', 'Average_financial_aid']
Cleaned enrollment columns: ['university', 'total_students', 'undergrad_percent', 'grad_percent', 'international_students']
Cleaned tuition columns: ['university', 'undergrad_tuition', 'grad_tuition', 'average_financial_aid']


In [77]:
# Normalize university names in both sheets: trim surrounding whitespace,
# then lower-case, so the same school is spelled identically in each sheet.
for frame in (enrollment_df, tuition_df):
    trimmed_names = frame['university'].str.strip()
    frame['university'] = trimmed_names.str.lower()

# Note what happens to UC Berkeley: lower-casing turns it into 'uc berkeley'.

print(enrollment_df['university'])
print(tuition_df['university'])

0    stanford university
1     harvard university
2                    mit
3                   yale
5            uc berkeley
Name: university, dtype: object
0    stanford university
1     harvard university
2                    mit
3                   yale
4            uc berkeley
Name: university, dtype: object


In [78]:
# Keep only the first row seen for each university name. Any later row with
# the same name is dropped, whatever its other columns contain.
is_repeat = enrollment_df['university'].duplicated(keep='first')
enrollment_df = enrollment_df[~is_repeat]

print(enrollment_df['university'])

0    stanford university
1     harvard university
2                    mit
3                   yale
5            uc berkeley
Name: university, dtype: object


In [79]:
# Combine the two sheets on the cleaned university name. An outer join keeps
# every university that appears in either sheet.
# NOTE(review): no visible cell de-duplicates tuition_df on 'university';
# if repeated names are possible there, consider validate='one_to_one'.
merged_df = enrollment_df.merge(tuition_df, how='outer', on='university')

print(merged_df.head())

            university  total_students  undergrad_percent  grad_percent  \
0   harvard university           20050               62.8          37.2   
1                  mit           11376               54.5          45.5   
2  stanford university           16937               65.2          34.8   
3          uc berkeley           42501               71.5          28.5   
4                 yale           12060               58.3          41.7   

   international_students  undergrad_tuition  grad_tuition  \
0                    5726              54002         51654   
1                    3787              55878         58240   
2                    3984              56169         54315   
3                    6763              14226         14226   
4                    2841              59950         45700   

   average_financial_aid  
0                  53604  
1                  48264  
2                  52030  
3                  23736  
4                  55879  


In [None]:
# Write the merged table to a CSV file, leaving out the DataFrame index
merged_csv_path = 'university_data_merged.csv'
merged_df.to_csv(merged_csv_path, index=False)