In [53]:
#Imports
import pandas as pd
import os

In [54]:
# Load the raw realtor listings, then show the dtype pandas inferred for each column
raw_csv_path = 'Data/original_data/realtor-data.zip.csv'
allproperties = pd.read_csv(raw_csv_path)
allproperties.dtypes

status             object
bed               float64
bath              float64
acre_lot          float64
city               object
state              object
zip_code          float64
house_size        float64
prev_sold_date     object
price             float64
dtype: object

In [55]:
# Clean the listings in one chained expression. Every step returns a new frame,
# so nothing depends on in-place mutation of a column selection (which does not
# propagate to the parent frame when pandas Copy-on-Write is active).
# NOTE: this cell consumes 'prev_sold_date' and 'status'; re-run the load cell
# before re-running this one.
allproperties = (
    allproperties
    # 1 when a previous sale date is recorded, 0 when it is missing
    .assign(sold_previously=lambda frame: frame['prev_sold_date'].notna().astype(int))
    # Drop the raw 'prev_sold_date' column (replaced by the flag) and 'status'
    .drop(columns=['prev_sold_date', 'status'])
    # Fill missing values in 'bed' and 'bath' with 1
    .fillna({'bed': 1, 'bath': 1})
    # Drop rows that have no 'zip_code' or no 'price'
    .dropna(subset=['zip_code', 'price'])
)

In [56]:
#nan_count = allproperties['house_size'].isna().sum()

#print(f"Number of NaNs in 'house_size' column: {nan_count}")

In [57]:
# Destination folder for the per-state CSV files (created if it does not exist)
output_dir = 'Data/new_data'
os.makedirs(output_dir, exist_ok=True)

# A state is exported only when it has at least this many rows
min_rows = 15000

# Split the frame by state in a single pass instead of re-filtering the whole
# frame once per state. sort=False keeps the states in order of first
# appearance. Rows whose 'state' is missing are left out by groupby, so they
# are neither exported nor reported.
for state, state_df in allproperties.groupby('state', sort=False):
    num_rows = len(state_df)

    if num_rows >= min_rows:
        # Large enough: write this state's rows to their own CSV (no index column)
        output_file = os.path.join(output_dir, f'{state}_properties.csv')
        state_df.to_csv(output_file, index=False)
    else:
        # Too small: report it, quoting the threshold from min_rows so the
        # message stays correct if the threshold is changed
        print(f"Skipped {state}_properties.csv with {num_rows} rows (less than {min_rows:,}).")

In [58]:
# Function to generate per-row IDs from the state name
def generate_ids(df):
    """Return a copy of ``df`` indexed by a generated 'ID' column.

    Each ID is the first three characters of the 'state' value in the first
    row, followed by a 1-based running count of the row within its 'state'
    group (e.g. 'Tex1', 'Tex2', ...).

    The frame is expected to hold a single state: the prefix is taken from the
    first row only, so a frame mixing several states would repeat IDs. States
    that share their first three characters also share a prefix, so IDs are
    unique within one frame, not across frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'state' column of strings. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and an index named 'ID'.
        An empty input yields an empty frame with an empty 'ID' index.
    """
    if len(df) == 0:
        # No first row to take the prefix from; return an empty ID-indexed frame
        return df.assign(ID=pd.Series(dtype='object')).set_index('ID')

    prefix = df['state'].iloc[0][:3]
    sequence = df.groupby('state').cumcount().add(1).astype(str)
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    return df.assign(ID=prefix + sequence).set_index('ID')

# Folder holding the per-state CSV files; each one is rewritten in place below
input_dir = 'Data/new_data'

# Visit every CSV in the folder, index it by generated ID, and overwrite the file
for filename in os.listdir(input_dir):
    if not filename.endswith('.csv'):
        continue  # ignore anything that is not a CSV

    filepath = os.path.join(input_dir, filename)

    # Load the file and index it by the generated IDs in one step
    df = generate_ids(pd.read_csv(filepath))

    # to_csv writes the index by default, so 'ID' is stored as the first column
    df.to_csv(filepath)

In [60]:
# Text report listing, for every CSV in input_dir, the NaN count per column
# and the total number of rows
output_file = 'nan_counts.txt'

with open(output_file, 'w') as f:
    for filename in os.listdir(input_dir):
        if not filename.endswith('.csv'):
            continue  # only CSV files are summarised

        filepath = os.path.join(input_dir, filename)
        df = pd.read_csv(filepath)

        # Per-column number of missing values, and the size of the file
        nan_counts = df.isna().sum()
        total_rows = len(df)

        # One section per file: header, one line per column, row total, blank line
        f.write(f"NaN counts and total rows for {filename}:\n")
        for column, missing in nan_counts.items():
            f.write(f"{column}: {missing}\n")
        f.write(f"Total Rows: {total_rows}\n")
        f.write('\n')