In [140]:
#Imports
import pandas as pd
import os
from sklearn.linear_model import LinearRegression


In [141]:
# Read the CSV file into a DataFrame
allproperties = pd.read_csv('Data/original_data/realtor-data.zip.csv')
#allproperties.dtypes

In [142]:
# Add the 'sold_previously' column based on 'prev_sold_date'
allproperties['sold_previously'] = allproperties['prev_sold_date'].notna().astype(int)
# Drop the 'prev_sold_date' column
allproperties = allproperties.drop(columns=['prev_sold_date', 'status'])
# Fill missing values in 'bed' and 'bath' columns with 1
allproperties['bed'].fillna(1, inplace=True)
allproperties['bath'].fillna(1, inplace=True)
# Drop rows with NaNs in the 'zip_code' column
allproperties.dropna(subset=['zip_code', 'price', 'city'], inplace=True)

In [143]:
#new data directory
output_dir = 'Data/new_data'
os.makedirs(output_dir, exist_ok=True)

# Minimum number of rows required (15,000 in this case)
min_rows = 15000

# Loop through unique states and create separate CSV files
unique_states = allproperties['state'].unique()
for state in unique_states:
    state_df = allproperties[allproperties['state'] == state]

    # Check the number of rows in the DataFrame
    num_rows = len(state_df)

    # Only save the file if it has over 15,000 rows
    if num_rows >= min_rows:
        output_file = os.path.join(output_dir, f'{state}_properties.csv')
        state_df.to_csv(output_file, index=False)
    else:
        print(f"Skipped {state}_properties.csv with {num_rows} rows (less than 15,000).")

Skipped Virgin Islands_properties.csv with 2522 rows (less than 15,000).
Skipped South Carolina_properties.csv with 25 rows (less than 15,000).
Skipped Tennessee_properties.csv with 20 rows (less than 15,000).
Skipped Virginia_properties.csv with 31 rows (less than 15,000).
Skipped Wyoming_properties.csv with 3 rows (less than 15,000).
Skipped Georgia_properties.csv with 50 rows (less than 15,000).
Skipped West Virginia_properties.csv with 3 rows (less than 15,000).
Skipped Delaware_properties.csv with 2129 rows (less than 15,000).


In [151]:
# Directory containing the newly created CSV files
input_dir = 'Data/new_data'

# Function to generate unique IDs
def generate_ids(df):
    state = df['state'].iloc[0][:5].replace(' ', '')  # Get the first 4 letters of the state
    df['ID'] = state + df.groupby('state').cumcount().add(1).astype(str)
    return df.set_index('ID')  # Set the 'ID' column as the index

# Loop through the CSV files
for filename in os.listdir(input_dir):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)
        
        # Add the ID column and set it as the index
        df = generate_ids(df)
        
        # Save the DataFrame back to the CSV file with the ID as the index
        df.to_csv(filepath)

In [145]:
# Loop through the CSV files
for filename in os.listdir(input_dir):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)

        # Check the number of rows in the DataFrame
        num_rows = len(df)

        # Only fill NaN values for files with over 15,000 rows
        if num_rows >= min_rows:
            # Filter data with non-NaN 'house_size'
            valid_data = df.dropna(subset=['house_size'])

            # Separate features (bed, bath) and target (house_size)
            X = valid_data[['bed', 'bath']]
            y = valid_data['house_size']

            # Train a linear regression model
            model = LinearRegression()
            model.fit(X, y)

            # Predict 'house_size' for rows with NaNs
            nan_data = df[pd.isna(df['house_size'])]
            if not nan_data.empty:
                nan_X = nan_data[['bed', 'bath']]
                nan_data['house_size'] = model.predict(nan_X)

                # Round the predicted 'house_size' values to the nearest whole number
                nan_data['house_size'] = nan_data['house_size'].round()

                # Update the original DataFrame with filled 'house_size'
                df.update(nan_data)

                # Save the updated DataFrame back to the same CSV file
                df.to_csv(filepath, index=False)
                print(f'Filled "house_size" for {filename}.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for Connecticut_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for Maine_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for Massachusetts_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for New Hampshire_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for New Jersey_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for New York_properties.csv.
Filled "house_size" for Pennsylvania_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a D

Filled "house_size" for Puerto Rico_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for Rhode Island_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['house_size'] = nan_data['house_size'].round()


Filled "house_size" for Vermont_properties.csv.


In [146]:
# Loop through the CSV files
for filename in os.listdir(input_dir):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)

        # Check the number of rows in the DataFrame
        num_rows = len(df)

        # Only fill NaN values for files with over 15,000 rows
        if num_rows >= min_rows:
            # Filter data with non-NaN 'acre_lot'
            valid_data = df.dropna(subset=['acre_lot'])

            # Separate features ('house_size', 'bed', 'bath') and target ('acre_lot')
            X = valid_data[['house_size', 'bed', 'bath']]
            y = valid_data['acre_lot']

            # Train a linear regression model
            model = LinearRegression()
            model.fit(X, y)

            # Predict 'acre_lot' for rows with NaNs
            nan_data = df[pd.isna(df['acre_lot'])]
            if not nan_data.empty:
                nan_X = nan_data[['house_size', 'bed', 'bath']]
                nan_data['acre_lot'] = model.predict(nan_X)

                # Round the predicted 'acre_lot' values to one decimal place
                nan_data['acre_lot'] = nan_data['acre_lot'].round(1)

                # Update the original DataFrame with filled 'acre_lot'
                df.update(nan_data)

                # Save the updated DataFrame back to the same CSV file
                df.to_csv(filepath, index=False)
                print(f'Filled "acre_lot" for {filename}.')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for Connecticut_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for Maine_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for Massachusetts_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for New Hampshire_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for New Jersey_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for New York_properties.csv.
Filled "acre_lot" for Pennsylvania_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFram

Filled "acre_lot" for Puerto Rico_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for Rhode Island_properties.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = model.predict(nan_X)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nan_data['acre_lot'] = nan_data['acre_lot'].round(1)


Filled "acre_lot" for Vermont_properties.csv.


In [147]:
# Create a text file to store the NaN counts. This was used for understanding the dataset.
# Since the data set has been cleaned, it's no longer needed. Leaving it as a reference of our work.
output_file = 'nan_counts.txt'

with open(output_file, 'w') as f:
    # Loop through the CSV files
    for filename in os.listdir(input_dir):
        if filename.endswith('.csv'):
            # Read the CSV file into a DataFrame
            filepath = os.path.join(input_dir, filename)
            df = pd.read_csv(filepath)

            # Count NaNs for each column in the DataFrame
            nan_counts = df.isna().sum()

            # Get the total number of rows
            total_rows = len(df)

            # Write the NaN counts and total rows for each file to the text file
            f.write(f"NaN counts and total rows for {filename}:\n")
            for column in df.columns:
                f.write(f"{column}: {nan_counts[column]}\n")
            f.write(f"Total Rows: {total_rows}\n")
            f.write('\n')

