In [2]:
import pandas as pd
import os

In [17]:
# Get the current working directory
current_directory = os.getcwd()

# Name of CSV file
csv_filename = "real-estate-listings.csv" 

# Combine the current directory path with the CSV file name to get the full file path
dataset_path = os.path.join(current_directory, csv_filename)

df = pd.read_csv(dataset_path)

In [18]:
# Remove the 'id' column
df = df.drop(columns=['id'])

In [19]:
# List of columns to convert to integers
columns_to_convert = ['bed', 'bath', 'zip_code', 'house_size', 'price']

# Convert specified columns to integer, handling non-null values correctly
for column in columns_to_convert:
    # Remove decimals by converting to integers for non-null values
    df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0).astype(int)

print(df.head())

     status  bed  bath  acre_lot        city        state  zip_code  \
0  for_sale    3     2      0.12    Adjuntas  Puerto Rico       601   
1  for_sale    4     2      0.08    Adjuntas  Puerto Rico       601   
2  for_sale    2     1      0.15  Juana Diaz  Puerto Rico       795   
3  for_sale    4     2      0.10       Ponce  Puerto Rico       731   
4  for_sale    6     2      0.05    Mayaguez  Puerto Rico       680   

   house_size prev_sold_date   price  
0         920            NaN  105000  
1        1527            NaN   80000  
2         748            NaN   67000  
3        1800            NaN  145000  
4           0            NaN   65000  


In [20]:
# Remove rows where 'price' is zero (which originally were NaNs and got converted to zero)
df = df[df['price'] != 0]

In [21]:
# Print the number of rows before cleaning
print(f"Original number of rows: {len(df)}")

# Remove rows 
df = df.dropna(subset=['city', 'state', 'acre_lot'])

# Print the number of rows after cleaning
print(f"Number of rows after: {len(df)}")

Original number of rows: 3452504
Number of rows after: 2800402


In [22]:
# Remove rows where 'house_size' is 0
df = df[df['house_size'] != 0]

# Print the number of rows after cleaning
print(f"Number of rows after: {len(df)}")

Number of rows after: 1900994


In [24]:
# Remove rows where 'zip_code' or 'bed' or 'bath' is 0
df = df[(df['zip_code'] != 0) & (df['bath'] != 0) & (df['bed'] != 0)]

# Print the number of rows after cleaning
print(f"Number of rows after: {len(df)}")

print(df.head())

Number of rows after: 1853038
     status  bed  bath  acre_lot           city        state  zip_code  \
0  for_sale    3     2      0.12       Adjuntas  Puerto Rico       601   
1  for_sale    4     2      0.08       Adjuntas  Puerto Rico       601   
2  for_sale    2     1      0.15     Juana Diaz  Puerto Rico       795   
3  for_sale    4     2      0.10          Ponce  Puerto Rico       731   
5  for_sale    4     3      0.46  San Sebastian  Puerto Rico       612   

   house_size prev_sold_date   price  
0         920            NaN  105000  
1        1527            NaN   80000  
2         748            NaN   67000  
3        1800            NaN  145000  
5        2520            NaN  179000  


In [25]:
df = df.drop(columns=['status', 'prev_sold_date'])

print(df.head())

   bed  bath  acre_lot           city        state  zip_code  house_size  \
0    3     2      0.12       Adjuntas  Puerto Rico       601         920   
1    4     2      0.08       Adjuntas  Puerto Rico       601        1527   
2    2     1      0.15     Juana Diaz  Puerto Rico       795         748   
3    4     2      0.10          Ponce  Puerto Rico       731        1800   
5    4     3      0.46  San Sebastian  Puerto Rico       612        2520   

    price  
0  105000  
1   80000  
2   67000  
3  145000  
5  179000  


In [26]:
# Convert 'city' and 'state' columns to lowercase
df['city'] = df['city'].str.lower()
df['state'] = df['state'].str.lower()

print(df.head())

   bed  bath  acre_lot           city        state  zip_code  house_size  \
0    3     2      0.12       adjuntas  puerto rico       601         920   
1    4     2      0.08       adjuntas  puerto rico       601        1527   
2    2     1      0.15     juana diaz  puerto rico       795         748   
3    4     2      0.10          ponce  puerto rico       731        1800   
5    4     3      0.46  san sebastian  puerto rico       612        2520   

    price  
0  105000  
1   80000  
2   67000  
3  145000  
5  179000  


In [27]:
# State abbreviations to full names mapping (Only full names are used here)
valid_states = {
    'alabama', 'alaska', 'arizona', 'arkansas', 'california',
    'colorado', 'connecticut', 'delaware', 'florida', 'georgia',
    'hawaii', 'idaho', 'illinois', 'indiana', 'iowa',
    'kansas', 'kentucky', 'louisiana', 'maine', 'maryland',
    'massachusetts', 'michigan', 'minnesota', 'mississippi',
    'missouri', 'montana', 'nebraska', 'nevada', 'new hampshire',
    'new jersey', 'new mexico', 'new york', 'north carolina',
    'north dakota', 'ohio', 'oklahoma', 'oregon', 'pennsylvania',
    'rhode island', 'south carolina', 'south dakota', 'tennessee',
    'texas', 'utah', 'vermont', 'virginia', 'washington',
    'west virginia', 'wisconsin', 'wyoming'
}

# Filter the DataFrame to include only rows where the state is in the set of valid states
df = df[df['state'].isin(valid_states)]

print(df.head())

Original number of rows: 1804425
Number of rows after filtering: 1804425
       bed  bath  acre_lot    city          state  zip_code  house_size  \
24201    2     1      0.34  agawam  massachusetts      1001         676   
24206    3     1      0.46  agawam  massachusetts      1001        1196   
24207    3     3      0.45  agawam  massachusetts      1001        2314   
24208    3     2      0.36  agawam  massachusetts      1001        1276   
24211    4     2      0.11  agawam  massachusetts      1001        1732   

        price  
24201  180000  
24206  239900  
24207  525000  
24208  289900  
24211  275000  


In [28]:
# Save the filtered DataFrame to a new CSV file
df.to_csv('clean-real-estate-listings.csv', index=False)

In [11]:
# Load the real estate listings data
real_estate_df = pd.read_csv('clean-real-estate-listings.csv')
# Load the city table data
city_df = pd.read_csv('city-table.csv')

# Convert city_df to a set of tuples for faster lookup
city_set = set(zip(city_df['city'], city_df['state']))

# Find entries in real_estate_df that are not in city_set
unique_cities = real_estate_df[~real_estate_df.apply(lambda row: (row['city'], row['state']) in city_set, axis=1)]

# Display unique cities and states
print("Unique city, state pairs in real estate listings that are not in the city table:")
print(unique_cities[['city', 'state']].drop_duplicates())

Unique city, state pairs in real estate listings that are not in the city table:
                      city          state
9                   Pelham  Massachusetts
40                  Hadley  Massachusetts
117       West Springfield  Massachusetts
289             Willington    Connecticut
294           East Windsor    Connecticut
...                    ...            ...
1798777          Belleview        Florida
1798779         Fort McCoy        Florida
1798917  Wilbur By the Sea        Florida
1799631        Deer Island        Florida
1800548           Ferndale        Florida

[1286 rows x 2 columns]


In [10]:
# Get the current working directory
current_directory = os.getcwd()

# Name of CSV file
csv_filename = "clean-real-estate-listings.csv" 

# Combine the current directory path with the CSV file name to get the full file path
dataset_path = os.path.join(current_directory, csv_filename)

chunk_size = 10000  # You can adjust this size based on your memory availability
chunks = pd.read_csv(dataset_path, chunksize=chunk_size)

# Load city data
city_dataset_path = os.path.join(current_directory, "city-table.csv")
city_df = pd.read_csv(city_dataset_path)
city_df.rename(columns={'id': 'city_id'}, inplace=True)

# Normalize city and state columns in city_df
city_df['city'] = city_df['city'].str.strip().str.lower()
city_df['state'] = city_df['state'].str.strip().str.lower()

# Initialize a flag to indicate the first chunk
is_first_chunk = True

# Process each chunk
for chunk in chunks:
    # Normalize city and state columns in each chunk
    chunk['city'] = chunk['city'].str.strip().str.lower()
    chunk['state'] = chunk['state'].str.strip().str.lower()
    
    merged_chunk = pd.merge(chunk, city_df, on=['city', 'state'], how='left')
    
    # Check for any entries that did not get a city_id (these will need to be addressed)
    if merged_chunk['city_id'].isnull().any():
        print("Some entries did not get a city_id. Review the unmatched city and state names.")
    
    merged_chunk.drop(columns=['city', 'state'], inplace=True)
    # Append each processed chunk to a new CSV file
    merged_chunk.to_csv('updated_real-estate-listings.csv', mode='a', index=False, header=is_first_chunk)  # Only write header for first chunk
    is_first_chunk = False  # After the first chunk, set the flag to False

print("Processing complete.")


Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmatched city and state names.
Some entries did not get a city_id. Review the unmat