# Cleaning Data

In [1]:
import pandas as pd
import numpy as np

# Location of the raw Ames Housing export, and the unit conversion shared by
# both area variables below (1 sq ft = 0.092903 m²).
AMES_CSV_PATH = '/workspaces/COM6005/project/AmesHousing.csv'
SQFT_TO_M2 = 0.092903

# Read the CSV file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the saved output of this cell echoes this read_csv line, which
# is how a pandas warning is reported -- presumably the mixed-types
# DtypeWarning. Confirm the warning is gone after re-running.
df = pd.read_csv(AMES_CSV_PATH, low_memory=False)

# Filter for new sales
df_new = df[df['Sale Type'] == 'New']

# Create a new dataframe with the specified variables. Every column is derived
# from df_new, so the result keeps df_new's row index.
summary_df = pd.DataFrame({
    # V-1: Project locality (Neighborhood)
    'V-1_Project_Locality': df_new['Neighborhood'],

    # V-2: Total floor area (m²): above-ground living area plus total
    # basement area, converted from sq ft.
    'V-2_Total_Floor_Area': (df_new['Gr Liv Area'] + df_new['Total Bsmt SF']) * SQFT_TO_M2,

    # V-3: Lot area (m²), converted from sq ft.
    'V-3_Lot_Area': df_new['Lot Area'] * SQFT_TO_M2,

    # V-7: Duration of construction, approximated as the number of calendar
    # years from the build year to the remodel/addition year, inclusive.
    # NOTE(review): the saved summary statistics show a maximum of 89 for this
    # column, so for some rows the gap is clearly not construction time --
    # confirm this proxy is the intended definition.
    'V-7_Construction_Duration': df_new['Year Remod/Add'] - df_new['Year Built'] + 1,

    # V-9: Actual sales prices
    'V-9_Sale_Price': df_new['SalePrice'],

    # Year Sold
    'Year_Sold': df_new['Yr Sold'],
})

# Generate summary statistics: one row per variable of summary_df.
# The locality column holds labels rather than numbers, so only its count is
# computed; its other statistics are filled with the 'N/A' placeholder.
locality_var = 'V-1_Project_Locality'
numeric_vars = [
    'V-2_Total_Floor_Area',
    'V-3_Lot_Area',
    'V-7_Construction_Duration',
    'V-9_Sale_Price',
    'Year_Sold',
]
all_vars = [locality_var] + numeric_vars

summary_stats = pd.DataFrame({
    'Variable': all_vars,
    'Count': [summary_df[var].count() for var in all_vars],
    # Output column label -> name of the Series method that produces it.
    # The leading 'N/A' is the placeholder for the locality row.
    **{
        label: ['N/A'] + [getattr(summary_df[var], method)() for var in numeric_vars]
        for label, method in (
            ('Mean', 'mean'),
            ('Std', 'std'),
            ('Min', 'min'),
            ('Max', 'max'),
        )
    },
})

# Save the processed data to CSV files (relative paths, so they land in the
# current working directory); the row index is left out of both files.
summary_df.to_csv('AmesHousing_clean.csv', index=False)
summary_stats.to_csv('AmesHousing_clean_summary.csv', index=False)

# Tabulate the figures reported below before printing them.
record_total = len(summary_df)
locality_counts = summary_df['V-1_Project_Locality'].value_counts()
# sort_index() orders the per-year counts by year instead of by frequency.
yearly_counts = summary_df['Year_Sold'].value_counts().sort_index()

# Print summary information, one item per line.
print(
    "\nData Summary (New Sales Only):",
    "=" * 50,
    f"Total number of new sale records: {record_total}",
    "\nNeighborhood Distribution for New Sales:",
    locality_counts,
    "\nSummary Statistics:",
    summary_stats,
    sep="\n",
)

# Print year-wise distribution of new sales
print("\nYear-wise Distribution of New Sales:", yearly_counts, sep="\n")

  df = pd.read_csv('/workspaces/COM6005/project/AmesHousing.csv')



Data Summary (New Sales Only):
Total number of new sale records: 239

Neighborhood Distribution for New Sales:
V-1_Project_Locality
NridgHt    63
Somerst    60
CollgCr    32
Gilbert    27
Timber     15
StoneBr    15
Blmngtn     9
Edwards     6
SawyerW     5
Crawfor     5
OldTown     1
Mitchel     1
Name: count, dtype: int64

Summary Statistics:
                    Variable  Count           Mean           Std         Min  \
0       V-1_Project_Locality    239            N/A           N/A         N/A   
1       V-2_Total_Floor_Area    239       300.4277     97.977733  150.409957   
2               V-3_Lot_Area    239    1022.171671    581.315376     278.709   
3  V-7_Construction_Duration    239       1.853556      5.684224         1.0   
4             V-9_Sale_Price    239  275751.309623  99488.530609    113000.0   
5                  Year_Sold    239    2007.125523      1.159967      2006.0   

           Max  
0          N/A  
1  1091.796056  
2  5935.293961  
3         89.0  
4     