In [3]:
import re

import pandas as pd

In [2]:
# Read the raw listings CSV into the working DataFrame
raw_listings_path = "real-estate-listings.csv"
df = pd.read_csv(raw_listings_path)

In [3]:
# Discard the 'id' column from the listings
df = df.drop(labels='id', axis='columns')

In [4]:
# Columns that should end up holding integers
columns_to_convert = ['bed', 'bath', 'zip_code', 'house_size', 'price']

# For each listed column: parse as numeric (unparseable entries become NaN),
# replace missing values with 0, then cast to int, which drops any decimals.
df = df.assign(**{
    name: pd.to_numeric(df[name], errors='coerce').fillna(0).astype(int)
    for name in columns_to_convert
})

print(df.head())

     status  bed  bath  acre_lot        city        state  zip_code  \
0  for_sale    3     2      0.12    Adjuntas  Puerto Rico       601   
1  for_sale    4     2      0.08    Adjuntas  Puerto Rico       601   
2  for_sale    2     1      0.15  Juana Diaz  Puerto Rico       795   
3  for_sale    4     2      0.10       Ponce  Puerto Rico       731   
4  for_sale    6     2      0.05    Mayaguez  Puerto Rico       680   

   house_size prev_sold_date   price  
0         920            NaN  105000  
1        1527            NaN   80000  
2         748            NaN   67000  
3        1800            NaN  145000  
4           0            NaN   65000  


In [5]:
# Drop listings priced at 0 (missing prices were filled with 0 during the integer conversion)
has_price = df['price'].ne(0)
df = df.loc[has_price]

In [6]:
# Keep only listings that have a value for city, state and acre_lot
required_fields = ['city', 'state', 'acre_lot']
df = df.dropna(axis='index', how='any', subset=required_fields)

In [7]:
# Discard listings whose 'house_size' is 0
df = df[~df['house_size'].eq(0)]

In [8]:
# Keep only listings where 'zip_code', 'bath' and 'bed' are all non-zero
nonzero_fields = ['zip_code', 'bath', 'bed']
df = df[df[nonzero_fields].ne(0).all(axis=1)]

print(df.head())

     status  bed  bath  acre_lot           city        state  zip_code  \
0  for_sale    3     2      0.12       Adjuntas  Puerto Rico       601   
1  for_sale    4     2      0.08       Adjuntas  Puerto Rico       601   
2  for_sale    2     1      0.15     Juana Diaz  Puerto Rico       795   
3  for_sale    4     2      0.10          Ponce  Puerto Rico       731   
5  for_sale    4     3      0.46  San Sebastian  Puerto Rico       612   

   house_size prev_sold_date   price  
0         920            NaN  105000  
1        1527            NaN   80000  
2         748            NaN   67000  
3        1800            NaN  145000  
5        2520            NaN  179000  


In [9]:
# Remove the 'status' and 'prev_sold_date' columns from the working DataFrame
unused_columns = ['status', 'prev_sold_date']
df = df.drop(unused_columns, axis=1)

print(df.head())

   bed  bath  acre_lot           city        state  zip_code  house_size  \
0    3     2      0.12       Adjuntas  Puerto Rico       601         920   
1    4     2      0.08       Adjuntas  Puerto Rico       601        1527   
2    2     1      0.15     Juana Diaz  Puerto Rico       795         748   
3    4     2      0.10          Ponce  Puerto Rico       731        1800   
5    4     3      0.46  San Sebastian  Puerto Rico       612        2520   

    price  
0  105000  
1   80000  
2   67000  
3  145000  
5  179000  


In [10]:
# Lowercase the text in the 'city' and 'state' columns
for text_column in ('city', 'state'):
    df[text_column] = df[text_column].str.lower()

print(df.head())

   bed  bath  acre_lot           city        state  zip_code  house_size  \
0    3     2      0.12       adjuntas  puerto rico       601         920   
1    4     2      0.08       adjuntas  puerto rico       601        1527   
2    2     1      0.15     juana diaz  puerto rico       795         748   
3    4     2      0.10          ponce  puerto rico       731        1800   
5    4     3      0.46  san sebastian  puerto rico       612        2520   

    price  
0  105000  
1   80000  
2   67000  
3  145000  
5  179000  


In [11]:
# Lowercase full names accepted in the 'state' column:
# the 50 states plus the District of Columbia
valid_states = {
    'alabama', 'alaska', 'arizona', 'arkansas',
    'california', 'colorado', 'connecticut',
    'delaware', 'district of columbia',
    'florida',
    'georgia',
    'hawaii',
    'idaho', 'illinois', 'indiana', 'iowa',
    'kansas', 'kentucky',
    'louisiana',
    'maine', 'maryland', 'massachusetts', 'michigan',
    'minnesota', 'mississippi', 'missouri', 'montana',
    'nebraska', 'nevada', 'new hampshire', 'new jersey',
    'new mexico', 'new york', 'north carolina', 'north dakota',
    'ohio', 'oklahoma', 'oregon',
    'pennsylvania',
    'rhode island',
    'south carolina', 'south dakota',
    'tennessee', 'texas',
    'utah',
    'vermont', 'virginia',
    'washington', 'west virginia', 'wisconsin', 'wyoming',
}

# Keep only the listings whose state appears in valid_states
in_valid_state = df['state'].isin(valid_states)
df = df.loc[in_valid_state]

print(df.head())

       bed  bath  acre_lot    city          state  zip_code  house_size  \
24201    2     1      0.34  agawam  massachusetts      1001         676   
24206    3     1      0.46  agawam  massachusetts      1001        1196   
24207    3     3      0.45  agawam  massachusetts      1001        2314   
24208    3     2      0.36  agawam  massachusetts      1001        1276   
24211    4     2      0.11  agawam  massachusetts      1001        1732   

        price  
24201  180000  
24206  239900  
24207  525000  
24208  289900  
24211  275000  


In [12]:
# Matches the whole word "saint" followed by a single space. The leading \b
# keeps names that merely end in "saint" (e.g. "toussaint heights") from
# being rewritten mid-word.
_SAINT_WORD = re.compile(r"\bsaint ")


def convert_saint_to_st(city_name):
    """Abbreviate the word "saint " to "st. " in a city name.

    Matching is case-sensitive and only recognises lowercase "saint", so the
    input is expected to be lowercased already. Only the standalone word is
    replaced, and only when a space follows it.

    Parameters
    ----------
    city_name : object
        The city name. Non-string values (e.g. NaN or None) are accepted.

    Returns
    -------
    object
        The abbreviated name for string input; any non-string input is
        returned unchanged.
    """
    if not isinstance(city_name, str):
        return city_name
    return _SAINT_WORD.sub("st. ", city_name)

# Pass every value in the 'city' column through convert_saint_to_st
df['city'] = df['city'].map(convert_saint_to_st)

In [13]:
# Write the cleaned listings to a new CSV file, leaving out the index
clean_listings_path = 'clean-real-estate-listings.csv'
df.to_csv(clean_listings_path, index=False)

In [4]:
# Generate csv to import into real_estate_listing table in database

# Load the cleaned real estate listings
real_estate_df = pd.read_csv("clean-real-estate-listings.csv")

# Load city data; its 'id' column is renamed so it appears as 'city_id'
# in the merged frame
city_df = pd.read_csv("city-table.csv").rename(columns={'id': 'city_id'})

# Left-merge so every listing is kept, attaching city_id by (city, state).
# validate='many_to_one' makes pandas raise MergeError if city_df contains the
# same (city, state) pair more than once, instead of silently duplicating
# listing rows.
merged_df = pd.merge(
    real_estate_df,
    city_df,
    on=['city', 'state'],
    how='left',
    validate='many_to_one',
)

# Show any listings that did not get a city_id (these will need to be addressed)
print(merged_df[merged_df['city_id'].isnull()])

print(merged_df.head())

Empty DataFrame
Columns: [bed, bath, acre_lot, city, state, zip_code, house_size, price, city_id]
Index: []
   bed  bath  acre_lot    city          state  zip_code  house_size   price  \
0    2     1      0.34  agawam  massachusetts      1001         676  180000   
1    3     1      0.46  agawam  massachusetts      1001        1196  239900   
2    3     3      0.45  agawam  massachusetts      1001        2314  525000   
3    3     2      0.36  agawam  massachusetts      1001        1276  289900   
4    4     2      0.11  agawam  massachusetts      1001        1732  275000   

   city_id  
0    12698  
1    12698  
2    12698  
3    12698  
4    12698  


In [5]:
# Move 'city_id' to the front; the remaining columns keep their existing order
cols = ['city_id', *(name for name in merged_df.columns if name != 'city_id')]
merged_df = merged_df.loc[:, cols]

print(merged_df.head())

   city_id  bed  bath  acre_lot    city          state  zip_code  house_size  \
0    12698    2     1      0.34  agawam  massachusetts      1001         676   
1    12698    3     1      0.46  agawam  massachusetts      1001        1196   
2    12698    3     3      0.45  agawam  massachusetts      1001        2314   
3    12698    3     2      0.36  agawam  massachusetts      1001        1276   
4    12698    4     2      0.11  agawam  massachusetts      1001        1732   

    price  
0  180000  
1  239900  
2  525000  
3  289900  
4  275000  


In [6]:
# The 'city' and 'state' columns are no longer needed, so remove them
merged_df = merged_df.drop(['city', 'state'], axis='columns')

print(merged_df.head())

   city_id  bed  bath  acre_lot  zip_code  house_size   price
0    12698    2     1      0.34      1001         676  180000
1    12698    3     1      0.46      1001        1196  239900
2    12698    3     3      0.45      1001        2314  525000
3    12698    3     2      0.36      1001        1276  289900
4    12698    4     2      0.11      1001        1732  275000


In [7]:
# Write the import-ready DataFrame to a new CSV file, leaving out the index
importable_path = 'importable-real-estate-listings.csv'
merged_df.to_csv(importable_path, index=False)