In [1]:
from cerberus import Validator
import pandas as pd

In [2]:
# Validate the housing and population rows with Cerberus, then inner-join the
# two datasets on (City, State) to list the cities present in both.

# Schema for the "housing.csv" file: every row needs a string City and State.
housing_schema = {
    'City': {'type': 'string', 'required': True},
    'State': {'type': 'string', 'required': True},
    # Add more validation rules as needed
}

# Schema for the "population.csv" file: same two key columns.
population_schema = {
    'City': {'type': 'string', 'required': True},
    'State': {'type': 'string', 'required': True},
    # Add more validation rules as needed
}

# Cerberus rejects any key that is not declared in the schema unless
# allow_unknown is enabled. The CSV rows carry additional columns (Year, Price,
# population_2022 in the merged output), so without it every row would fail
# validation regardless of its City/State values.
housing_validator = Validator(housing_schema, allow_unknown=True)
population_validator = Validator(population_schema, allow_unknown=True)

# Load data from "housing.csv" and "population.csv" into pandas DataFrames
housing_data = pd.read_csv("housing.csv")
population_data = pd.read_csv("population.csv")


def validate_rows(frame, validator, label):
    """Validate every row of ``frame`` against ``validator``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to check; each row is passed to the validator as a dict keyed by
        column name.
    validator : cerberus.Validator
        Validator holding the schema to apply.
    label : str
        Name used in the summary line printed when rows fail.

    Returns
    -------
    list of bool
        One entry per row, in row order; True where the row passed.
    """
    results = [validator.validate(record) for record in frame.to_dict("records")]
    failed = results.count(False)
    # Stay quiet on success; surface failures instead of discarding them.
    if failed:
        print(f"{label}: {failed} of {len(results)} rows failed schema validation")
    return results


# Per-row pass/fail flags for each dataset
housing_validation_results = validate_rows(housing_data, housing_validator, "housing.csv")
population_validation_results = validate_rows(population_data, population_validator, "population.csv")

# Keep only the (City, State) combinations that exist in both DataFrames
merged_data = housing_data.merge(population_data, on=["City", "State"], how="inner")

# Show the matching cities and states
merged_data

Unnamed: 0,City,State,Year,Price,population_2022
0,New York,NY,2014,402362,8335897
1,New York,NY,2015,416699,8335897
2,New York,NY,2016,435647,8335897
3,New York,NY,2017,458805,8335897
4,New York,NY,2018,480549,8335897
...,...,...,...,...,...
105,Austin,TX,2020,344899,974447
106,Austin,TX,2021,455730,974447
107,Austin,TX,2022,535014,974447
108,Austin,TX,2023,478175,974447


In [3]:
# Find (City, State) combinations that appear in only one of the two datasets.
# housing_data and population_data are the DataFrames loaded (and validated)
# in the previous cell; nothing has modified the CSV files since, so they are
# reused here instead of re-declaring the schemas and re-reading the files.

# Outer merge with indicator=True adds a "_merge" column telling which side
# each row came from: "left_only", "right_only" or "both".
merged_data = housing_data.merge(population_data, on=["City", "State"], how="outer", indicator=True)

# right_only -> key exists in population.csv but has no housing rows
missing_in_housing = merged_data[merged_data['_merge'] == 'right_only']
# left_only -> key exists in housing.csv but has no population row
missing_in_population = merged_data[merged_data['_merge'] == 'left_only']

# A bare expression is rendered only when it is the last statement of a cell,
# so display() (provided by the IPython kernel) is used to show both frames.
print("Missing records in housing.csv:")
display(missing_in_housing)

print("\nMissing records in population.csv:")
display(missing_in_population)


Missing records in housing.csv:
\Missing records in population.csv:


Unnamed: 0,City,State,Year,Price,population_2022,_merge


In [4]:
# Build a housing DataFrame restricted to the (City, State) combinations that
# also appear in the population data.
# NOTE(review): nothing in this cell ranks cities by population; the result is
# the "top 10" only if population.csv itself lists just those cities - confirm.

# Inner merge of housing and population on the shared key columns
merged_data = housing_data.merge(population_data, on=["City", "State"], how="inner")

# Keep the housing rows whose (City, State) pair exists in the population data.
# Comparing the key pairs as MultiIndex values is vectorised and avoids the
# row-by-row tuple construction; the original row labels are preserved.
key_columns = ['City', 'State']
housing_keys = pd.MultiIndex.from_frame(housing_data[key_columns])
population_keys = pd.MultiIndex.from_frame(population_data[key_columns])
matching_housing_data = housing_data[housing_keys.isin(population_keys)]

# Show all rows and columns (these options stay in effect for later cells too)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Show the matching housing data
matching_housing_data

Unnamed: 0,City,State,Year,Price
0,New York,NY,2014,402362
1,Los Angeles,CA,2014,494696
2,Chicago,IL,2014,192162
3,Dallas,TX,2014,169951
4,Houston,TX,2014,168793
5,Philadelphia,PA,2014,204589
6,Phoenix,AZ,2014,202542
7,San Diego,CA,2014,455060
8,San Antonio,TX,2014,156013
9,Austin,TX,2014,240550


In [5]:
# Export the filtered housing rows to CSV, without writing the index column.
# NOTE(review): 'housing.csv' is also the file this notebook reads as its
# housing input, so this write replaces that input in place - confirm that
# overwriting it is intended.
housing = matching_housing_data
housing.to_csv('housing.csv', index=False)

In [40]:
import pandas as pd

# Housing data: CSV path and the DataFrame read from it
housing_csv_file = 'housingsql.csv'
housing_df = pd.read_csv(housing_csv_file)

# Population data: CSV path and the DataFrame read from it
population_csv_file = 'populationsql.csv'
population_df = pd.read_csv(population_csv_file)

# Label the output, then let the notebook render the housing frame
print("Housing DataFrame:")
housing_df

Housing DataFrame:


Unnamed: 0,city,state,year,price
0,New York,NY,2014,402362
1,Los Angeles,CA,2014,494696
2,Chicago,IL,2014,192162
3,Dallas,TX,2014,169951
4,Houston,TX,2014,168793
5,Philadelphia,PA,2014,204589
6,Phoenix,AZ,2014,202542
7,San Diego,CA,2014,455060
8,San Antonio,TX,2014,156013
9,Austin,TX,2014,240550


In [41]:
# Print a heading, then render population_df as the cell's output.

print("Population DataFrame:")
population_df


Population DataFrame:


Unnamed: 0,city,state,population_2022
0,New York,NY,8335897
1,Los Angeles,CA,3822238
2,Chicago,IL,2665039
3,Houston,TX,2302878
4,Phoenix,AZ,1644409
5,Philadelphia,PA,1567258
6,San Antonio,TX,1472909
7,San Diego,CA,1381162
8,Dallas,TX,1299544
9,Austin,TX,974447
