In [2]:
import pandas as pd
import os

In [3]:
# File name of the listings CSV, looked up in the working directory
csv_filename = "processed-real-estate-listings.csv"

# Current working directory of the running process
current_directory = os.getcwd()

# Full path to the CSV: <working directory>/<file name>
dataset_path = os.path.join(current_directory, csv_filename)

# Load the listings into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Move the "id" column to the front, keeping every other column in its
# existing order.
#
# "id" is only prepended when the frame actually has it. The last cell of this
# notebook writes the id-less frame back to the same CSV that is loaded at the
# top, so on a re-run the column is no longer there; selecting a missing 'id'
# would raise KeyError. When "id" is absent the column order is left as it is.
leading_columns = ['id'] if 'id' in df.columns else []
columns_order = leading_columns + [col for col in df.columns if col != 'id']
df = df[columns_order]

In [4]:
# Remove the 'id' column if it is present.
# errors='ignore' keeps this cell re-runnable: the last cell of this notebook
# saves the frame (without 'id') over the same CSV that is loaded at the top,
# so on a second run the column is already gone and a plain drop would raise
# KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [7]:
# Columns that should end up holding whole numbers
columns_to_convert = ['bed', 'bath', 'zip_code', 'house_size', 'price']


def _as_whole_numbers(values):
    """Parse values as numbers, replace unparseable or missing entries with 0, and cast to int.

    Casting to int drops any fractional part.
    """
    numeric = pd.to_numeric(values, errors='coerce')
    return numeric.fillna(0).astype(int)


# Rewrite each listed column in place with its integer version
for column in columns_to_convert:
    df[column] = _as_whole_numbers(df[column])

print(df.head())

   bed  bath  acre_lot        city        state  zip_code  house_size   price
0    3     2      0.12    Adjuntas  Puerto Rico       601         920  105000
1    4     2      0.08    Adjuntas  Puerto Rico       601        1527   80000
2    2     1      0.15  Juana Diaz  Puerto Rico       795         748   67000
3    4     2      0.10       Ponce  Puerto Rico       731        1800  145000
4    6     2      0.05    Mayaguez  Puerto Rico       680           0   65000


In [8]:
# Keep only listings whose 'price' is not 0. The integer conversion earlier in
# the notebook fills missing/unparseable prices with 0, so those rows are
# removed here (together with any price that was literally 0).
df = df[df['price'].ne(0)]

In [8]:
# Row count going into this step
print(f"Original number of rows: {df.shape[0]}")

# Keep only rows that have a value in every one of 'city', 'state' and 'acre_lot'
required_columns = ['city', 'state', 'acre_lot']
df = df[df[required_columns].notna().all(axis=1)]

# Row count once the incomplete rows are gone
print(f"Number of rows after: {df.shape[0]}")

Original number of rows: 2351483
Number of rows after: 1853038


In [9]:
# Drop listings whose 'house_size' is 0
has_house_size = df['house_size'].ne(0)
df = df[has_house_size]

# Report how many rows remain
print(f"Number of rows after: {df.shape[0]}")

Number of rows after: 1853038


In [10]:
# Keep a row only when none of 'zip_code', 'bath' and 'bed' is 0
any_zero = (df['zip_code'] == 0) | (df['bath'] == 0) | (df['bed'] == 0)
df = df[~any_zero]

# Report how many rows remain
print(f"Number of rows after: {df.shape[0]}")


Number of rows after: 1853038


In [11]:
# Write the cleaned frame to disk as CSV, without the index column.
# NOTE: dataset_path is the same file that was loaded at the top of the
# notebook, so this overwrites the input in place.
df.to_csv(path_or_buf=dataset_path, index=False)