## Data Ingestion

In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load data with explicit encoding
data_raw_path = '../data/raw/AmesHousing.csv'
data = pd.read_csv(data_raw_path, encoding='utf-8')

# Step 2: Check dataset shape
print(f"Dataset shape: {data.shape}")

# Step 3: Display coumn names
print("Columns:", data.columns.tolist())

# Step 4: Basic info (data types, memory usage)
print("\nBasic info:")
data.info()

# Step 5: Preview first few rows
print("\nFirst 5 rows:")
print(data.head())

# Step 6: Check and remove duplicate rows
num_duplicates = data.duplicated().sum()
print(f"\nDuplicate rows: {num_duplicates}")
if num_duplicates > 0:
    data = data.drop_duplicates()

# Step 7: Reset index
data.reset_index(drop=False, inplace=True)

# Step 8 : Final cleaned data
print("\nCleaned data: ", data.shape)
print(data.head())

data_cleaned_path = '../data/processed/ingestion_AmesHousing.csv'
data.to_csv(data_cleaned_path, index=False)


Dataset shape: (2930, 82)
Columns: ['Order', 'PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street', 'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air', 'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars', 'Garage Area', 'Garage Qual', 'Garag

In [39]:
data_cleaned_path = '../data/processed/ingestion_AmesHousing.csv'
data = pd.read_csv(data_cleaned_path, encoding='utf-8')
data.head()

Unnamed: 0,index,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,0,1,526301100,20,RL,141.0,31770,Pave,,IR1,...,0,,,,0,5,2010,WD,Normal,215000
1,1,2,526350040,20,RH,80.0,11622,Pave,,Reg,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,2,3,526351010,20,RL,81.0,14267,Pave,,IR1,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,3,4,526353030,20,RL,93.0,11160,Pave,,Reg,...,0,,,,0,4,2010,WD,Normal,244000
4,4,5,527105010,60,RL,74.0,13830,Pave,,IR1,...,0,,MnPrv,,0,3,2010,WD,Normal,189900
