In [153]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [154]:
# Relative locations of the raw train/test splits and the variable notes table.
train_file_path = "../data/raw/train.csv"
test_file_path = "../data/raw/test.csv"
variable_notes_path = "../data/external/variable_notes.csv"

# Read the three CSV files in one pass, in the same order as the paths above.
train, test, variable_notes = map(
    pd.read_csv,
    (train_file_path, test_file_path, variable_notes_path),
)

# Only the training frame has its "Id" column removed; `test` is left as loaded.
train = train.drop(columns="Id")

In [155]:
from collections import defaultdict

# Column name -> category label, taken from the variable notes table.
# Rows whose category is missing are labelled "Unknown".
category_mapping = (
    variable_notes
    .set_index("Column")["category"]
    .fillna("Unknown")
    .to_dict()
)

# Invert the mapping: category label -> list of column names, in mapping order.
category_columns = defaultdict(list)
for col, cat in category_mapping.items():
    category_columns[cat].append(col)

# Print each category followed by a bulleted list of its columns.
print("\nFeature Categories and Associated Columns:")
for category, cols in category_columns.items():
    bullets = "\n  - ".join(cols)
    print(f"\n{category}:\n  - {bullets}")


Feature Categories and Associated Columns:

basement:
  - BsmtFinSF1
  - BsmtUnfSF
  - BsmtCond
  - BsmtExposure
  - BsmtFinType1
  - BsmtFinType2
  - BsmtFullBath
  - BsmtHalfBath
  - BsmtQual
  - BsmtFinSF2
  - TotalBsmtSF

bath:
  - FullBath
  - HalfBath

exterior:
  - Exterior1st
  - Exterior2nd
  - MasVnrType
  - ExterCond
  - ExterQual
  - MasVnrArea
  - PavedDrive
  - WoodDeckSF

fireplace:
  - Fireplaces

garage:
  - GarageCond
  - GarageFinish
  - GarageQual
  - GarageType
  - GarageYrBlt
  - GarageArea
  - GarageCars

general:
  - MSSubClass
  - HouseStyle
  - MSZoning
  - Functional
  - MiscVal
  - Neighborhood
  - OverallCond
  - OverallQual
  - Utilities
  - YearRemodAdd

high missing:
  - FireplaceQu
  - Alley
  - Fence
  - MiscFeature
  - PoolQC

interior:
  - BldgType
  - CentralAir
  - Heating
  - Electrical
  - HeatingQC
  - 1stFlrSF
  - 2ndFlrSF
  - BedroomAbvGr
  - GrLivArea
  - LowQualFinSF
  - TotRmsAbvGrd

kitchen:
  - KitchenQual
  - KitchenAbvGr

lot:
  - LotC

In [156]:
def display_column_info(df, column, max_values=10):
    """Print the dtype and a preview of the unique values of one dataframe column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is expected to contain ``column``.
    column : str
        Name of the column to describe.
    max_values : int or None, default 10
        Maximum number of unique values to list. When the column has more
        unique values than this, only the first ``max_values`` (in the order
        returned by ``Series.unique()``) are listed and "..." is appended.
        ``None`` lists every unique value.

    Returns
    -------
    None
        Everything is written to stdout. If ``column`` is not in
        ``df.columns`` a "not found" message is printed instead.

    Raises
    ------
    ValueError
        If ``max_values`` is negative.
    """
    # A negative limit would silently slice from the end of the array; reject it.
    if max_values is not None and max_values < 0:
        raise ValueError("max_values must be non-negative or None")

    if column not in df.columns:
        print(f"Column '{column}' not found in dataframe.")
        return

    values = df[column].unique()

    # Build the preview once; the ellipsis marks that values were left out.
    truncated = max_values is not None and len(values) > max_values
    shown = values[:max_values] if truncated else values
    preview = ", ".join(map(str, shown)) + ("..." if truncated else "")

    print(f"\nColumn: {column}\nType: {df[column].dtype}\nUnique Values: {preview}")
    print("-" * 50)

In [158]:
from IPython.display import display, HTML

def display_category_info(df, category):
    """Render an HTML report for the columns registered under ``category``.

    The column list is looked up in the module-level ``category_columns``
    mapping. The report shows, in order: a banner with the upper-cased category
    name, a styled table pairing each column with its correlation to
    'SalePrice' ("--" where no correlation is computed), and one
    ``display_column_info`` printout per column. If ``category`` is not a key of
    ``category_columns``, only a red "not found" message is displayed.
    """
    # Guard clause: unknown category -> show the error message and stop.
    if category not in category_columns:
        display(HTML(f"<p style='color:red; font-weight:bold;'>Category '{category}' not found.</p>"))
        return

    members = category_columns[category]

    # Banner with the category name.
    banner = f"""
    <div style="border: 2px solid #444; padding: 10px; margin-bottom: 10px; background-color: #f2f2f2;">
        <h2 style="margin: 0; color: #444;">Category: {category.upper()}</h2>
    </div>
    """
    display(HTML(banner))

    # One correlation per member column; NaN when the column is not numeric
    # or when the frame has no 'SalePrice' column at all.
    if "SalePrice" in df.columns:
        numeric_names = df.select_dtypes(include=[np.number]).columns
        correlations = [
            df[name].corr(df["SalePrice"]) if name in numeric_names else np.nan
            for name in members
        ]
    else:
        correlations = [np.nan] * len(members)

    # NaN is rendered as "--", everything else with three decimals.
    formatted = ["--" if pd.isnull(value) else f"{value:.3f}" for value in correlations]
    summary = pd.DataFrame({
        "Column": members,
        "Correlation with SalePrice": formatted,
    })

    # CSS rules for the header cells and the body cells of the summary table.
    header_cell_style = {
        'selector': 'th',
        'props': [
            ('background-color', '#f7f7f7'),
            ('color', '#333'),
            ('font-weight', 'bold'),
            ('padding', '5px'),
        ],
    }
    body_cell_style = {
        'selector': 'td',
        'props': [('padding', '5px')],
    }
    styler = summary.style.set_table_styles([header_cell_style, body_cell_style])
    display(styler.set_caption("Columns and Correlations"))

    # Heading that introduces the per-column details.
    details_heading = """
    <h3 style="color:#444; border-bottom:1px solid #ccc; margin-bottom:10px;">
        Detailed Column Information:
    </h3>
    """
    display(HTML(details_heading))

    # Per column: an HTML sub-heading followed by the plain-text printout.
    for name in members:
        column_heading = f"""
        <h4 style="color:#666; margin-bottom:0;">{name}</h4>
        <hr style="border:0; border-bottom:1px solid #ccc; margin:5px 0 10px;">
        """
        display(HTML(column_heading))
        display_column_info(df, name)


In [159]:
# Categories to report on, in display order.
categories_to_display = [
    "basement",
    "bath",
    "exterior",
    "fireplace",
    "garage",
    "general",
    "interior",
    "kitchen",
    "lot",
    "overall",
    "pool",
    "porch",
    "roof",
    "sale_info",
]

# Render one report per category from the training frame.
for cat in categories_to_display:
    display_category_info(train, cat)

Unnamed: 0,Column,Correlation with SalePrice
0,BsmtFinSF1,0.386
1,BsmtUnfSF,0.214
2,BsmtCond,--
3,BsmtExposure,--
4,BsmtFinType1,--
5,BsmtFinType2,--
6,BsmtFullBath,0.227
7,BsmtHalfBath,-0.017
8,BsmtQual,--
9,BsmtFinSF2,-0.011



Column: BsmtFinSF1
Type: int64
Unique Values: 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851...
--------------------------------------------------



Column: BsmtUnfSF
Type: int64
Unique Values: 150, 284, 434, 540, 490, 64, 317, 216, 952, 140...
--------------------------------------------------



Column: BsmtCond
Type: object
Unique Values: TA, Gd, nan, Fa, Po
--------------------------------------------------



Column: BsmtExposure
Type: object
Unique Values: No, Gd, Mn, Av, nan
--------------------------------------------------



Column: BsmtFinType1
Type: object
Unique Values: GLQ, ALQ, Unf, Rec, BLQ, nan, LwQ
--------------------------------------------------



Column: BsmtFinType2
Type: object
Unique Values: Unf, BLQ, nan, ALQ, Rec, LwQ, GLQ
--------------------------------------------------



Column: BsmtFullBath
Type: int64
Unique Values: 1, 0, 2, 3
--------------------------------------------------



Column: BsmtHalfBath
Type: int64
Unique Values: 0, 1, 2
--------------------------------------------------



Column: BsmtQual
Type: object
Unique Values: Gd, TA, Ex, nan, Fa
--------------------------------------------------



Column: BsmtFinSF2
Type: int64
Unique Values: 0, 32, 668, 486, 93, 491, 506, 712, 362, 41...
--------------------------------------------------



Column: TotalBsmtSF
Type: int64
Unique Values: 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,FullBath,0.561
1,HalfBath,0.284



Column: FullBath
Type: int64
Unique Values: 2, 1, 3, 0
--------------------------------------------------



Column: HalfBath
Type: int64
Unique Values: 1, 0, 2
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,Exterior1st,--
1,Exterior2nd,--
2,MasVnrType,--
3,ExterCond,--
4,ExterQual,--
5,MasVnrArea,0.477
6,PavedDrive,--
7,WoodDeckSF,0.324



Column: Exterior1st
Type: object
Unique Values: VinylSd, MetalSd, Wd Sdng, HdBoard, BrkFace, WdShing, CemntBd, Plywood, AsbShng, Stucco...
--------------------------------------------------



Column: Exterior2nd
Type: object
Unique Values: VinylSd, MetalSd, Wd Shng, HdBoard, Plywood, Wd Sdng, CmentBd, BrkFace, Stucco, AsbShng...
--------------------------------------------------



Column: MasVnrType
Type: object
Unique Values: BrkFace, nan, Stone, BrkCmn
--------------------------------------------------



Column: ExterCond
Type: object
Unique Values: TA, Gd, Fa, Po, Ex
--------------------------------------------------



Column: ExterQual
Type: object
Unique Values: Gd, TA, Ex, Fa
--------------------------------------------------



Column: MasVnrArea
Type: float64
Unique Values: 196.0, 0.0, 162.0, 350.0, 186.0, 240.0, 286.0, 306.0, 212.0, 180.0...
--------------------------------------------------



Column: PavedDrive
Type: object
Unique Values: Y, N, P
--------------------------------------------------



Column: WoodDeckSF
Type: int64
Unique Values: 0, 298, 192, 40, 255, 235, 90, 147, 140, 160...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,Fireplaces,0.467



Column: Fireplaces
Type: int64
Unique Values: 0, 1, 2, 3
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,GarageCond,--
1,GarageFinish,--
2,GarageQual,--
3,GarageType,--
4,GarageYrBlt,0.486
5,GarageArea,0.623
6,GarageCars,0.640



Column: GarageCond
Type: object
Unique Values: TA, Fa, nan, Gd, Po, Ex
--------------------------------------------------



Column: GarageFinish
Type: object
Unique Values: RFn, Unf, Fin, nan
--------------------------------------------------



Column: GarageQual
Type: object
Unique Values: TA, Fa, Gd, nan, Ex, Po
--------------------------------------------------



Column: GarageType
Type: object
Unique Values: Attchd, Detchd, BuiltIn, CarPort, nan, Basment, 2Types
--------------------------------------------------



Column: GarageYrBlt
Type: float64
Unique Values: 2003.0, 1976.0, 2001.0, 1998.0, 2000.0, 1993.0, 2004.0, 1973.0, 1931.0, 1939.0...
--------------------------------------------------



Column: GarageArea
Type: int64
Unique Values: 548, 460, 608, 642, 836, 480, 636, 484, 468, 205...
--------------------------------------------------



Column: GarageCars
Type: int64
Unique Values: 2, 3, 1, 0, 4
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,MSSubClass,-0.084
1,HouseStyle,--
2,MSZoning,--
3,Functional,--
4,MiscVal,-0.021
5,Neighborhood,--
6,OverallCond,-0.078
7,OverallQual,0.791
8,Utilities,--
9,YearRemodAdd,0.507



Column: MSSubClass
Type: int64
Unique Values: 60, 20, 70, 50, 190, 45, 90, 120, 30, 85...
--------------------------------------------------



Column: HouseStyle
Type: object
Unique Values: 2Story, 1Story, 1.5Fin, 1.5Unf, SFoyer, SLvl, 2.5Unf, 2.5Fin
--------------------------------------------------



Column: MSZoning
Type: object
Unique Values: RL, RM, C (all), FV, RH
--------------------------------------------------



Column: Functional
Type: object
Unique Values: Typ, Min1, Maj1, Min2, Mod, Maj2, Sev
--------------------------------------------------



Column: MiscVal
Type: int64
Unique Values: 0, 700, 350, 500, 400, 480, 450, 15500, 1200, 800...
--------------------------------------------------



Column: Neighborhood
Type: object
Unique Values: CollgCr, Veenker, Crawfor, NoRidge, Mitchel, Somerst, NWAmes, OldTown, BrkSide, Sawyer...
--------------------------------------------------



Column: OverallCond
Type: int64
Unique Values: 5, 8, 6, 7, 4, 2, 3, 9, 1
--------------------------------------------------



Column: OverallQual
Type: int64
Unique Values: 7, 6, 8, 5, 9, 4, 10, 3, 1, 2
--------------------------------------------------



Column: Utilities
Type: object
Unique Values: AllPub, NoSeWa
--------------------------------------------------



Column: YearRemodAdd
Type: int64
Unique Values: 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 1965...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,BldgType,--
1,CentralAir,--
2,Heating,--
3,Electrical,--
4,HeatingQC,--
5,1stFlrSF,0.606
6,2ndFlrSF,0.319
7,BedroomAbvGr,0.168
8,GrLivArea,0.709
9,LowQualFinSF,-0.026



Column: BldgType
Type: object
Unique Values: 1Fam, 2fmCon, Duplex, TwnhsE, Twnhs
--------------------------------------------------



Column: CentralAir
Type: object
Unique Values: Y, N
--------------------------------------------------



Column: Heating
Type: object
Unique Values: GasA, GasW, Grav, Wall, OthW, Floor
--------------------------------------------------



Column: Electrical
Type: object
Unique Values: SBrkr, FuseF, FuseA, FuseP, Mix, nan
--------------------------------------------------



Column: HeatingQC
Type: object
Unique Values: Ex, Gd, TA, Fa, Po
--------------------------------------------------



Column: 1stFlrSF
Type: int64
Unique Values: 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077...
--------------------------------------------------



Column: 2ndFlrSF
Type: int64
Unique Values: 854, 0, 866, 756, 1053, 566, 983, 752, 1142, 1218...
--------------------------------------------------



Column: BedroomAbvGr
Type: int64
Unique Values: 3, 4, 1, 2, 0, 5, 6, 8
--------------------------------------------------



Column: GrLivArea
Type: int64
Unique Values: 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 1077...
--------------------------------------------------



Column: LowQualFinSF
Type: int64
Unique Values: 0, 360, 513, 234, 528, 572, 144, 392, 371, 390...
--------------------------------------------------



Column: TotRmsAbvGrd
Type: int64
Unique Values: 8, 6, 7, 9, 5, 11, 4, 10, 12, 3...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,KitchenQual,--
1,KitchenAbvGr,-0.136



Column: KitchenQual
Type: object
Unique Values: Gd, TA, Ex, Fa
--------------------------------------------------



Column: KitchenAbvGr
Type: int64
Unique Values: 1, 2, 3, 0
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,LotConfig,--
1,Condition1,--
2,Condition2,--
3,LandContour,--
4,LandSlope,--
5,LotShape,--
6,LotFrontage,0.352
7,LotArea,0.264
8,Street,--



Column: LotConfig
Type: object
Unique Values: Inside, FR2, Corner, CulDSac, FR3
--------------------------------------------------



Column: Condition1
Type: object
Unique Values: Norm, Feedr, PosN, Artery, RRAe, RRNn, RRAn, PosA, RRNe
--------------------------------------------------



Column: Condition2
Type: object
Unique Values: Norm, Artery, RRNn, Feedr, PosN, PosA, RRAn, RRAe
--------------------------------------------------



Column: LandContour
Type: object
Unique Values: Lvl, Bnk, Low, HLS
--------------------------------------------------



Column: LandSlope
Type: object
Unique Values: Gtl, Mod, Sev
--------------------------------------------------



Column: LotShape
Type: object
Unique Values: Reg, IR1, IR2, IR3
--------------------------------------------------



Column: LotFrontage
Type: float64
Unique Values: 65.0, 80.0, 68.0, 60.0, 84.0, 85.0, 75.0, nan, 51.0, 50.0...
--------------------------------------------------



Column: LotArea
Type: int64
Unique Values: 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120, 7420...
--------------------------------------------------



Column: Street
Type: object
Unique Values: Pave, Grvl
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,Foundation,--
1,YearBuilt,0.523



Column: Foundation
Type: object
Unique Values: PConc, CBlock, BrkTil, Wood, Slab, Stone
--------------------------------------------------



Column: YearBuilt
Type: int64
Unique Values: 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1939...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,PoolArea,0.092



Column: PoolArea
Type: int64
Unique Values: 0, 512, 648, 576, 555, 480, 519, 738
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,EnclosedPorch,-0.129
1,OpenPorchSF,0.316
2,ScreenPorch,0.111
3,3SsnPorch,0.045



Column: EnclosedPorch
Type: int64
Unique Values: 0, 272, 228, 205, 176, 87, 172, 102, 37, 144...
--------------------------------------------------



Column: OpenPorchSF
Type: int64
Unique Values: 61, 0, 42, 35, 84, 30, 57, 204, 4, 21...
--------------------------------------------------



Column: ScreenPorch
Type: int64
Unique Values: 0, 176, 198, 291, 252, 99, 184, 168, 130, 142...
--------------------------------------------------



Column: 3SsnPorch
Type: int64
Unique Values: 0, 320, 407, 130, 180, 168, 140, 508, 238, 245...
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,RoofMatl,--
1,RoofStyle,--



Column: RoofMatl
Type: object
Unique Values: CompShg, WdShngl, Metal, WdShake, Membran, Tar&Grv, Roll, ClyTile
--------------------------------------------------



Column: RoofStyle
Type: object
Unique Values: Gable, Hip, Gambrel, Mansard, Flat, Shed
--------------------------------------------------


Unnamed: 0,Column,Correlation with SalePrice
0,MoSold,0.046
1,SaleCondition,--
2,SaleType,--
3,YrSold,-0.029



Column: MoSold
Type: int64
Unique Values: 2, 5, 9, 12, 10, 8, 11, 4, 1, 7...
--------------------------------------------------



Column: SaleCondition
Type: object
Unique Values: Normal, Abnorml, Partial, AdjLand, Alloca, Family
--------------------------------------------------



Column: SaleType
Type: object
Unique Values: WD, New, COD, ConLD, ConLI, CWD, ConLw, Con, Oth
--------------------------------------------------



Column: YrSold
Type: int64
Unique Values: 2008, 2007, 2006, 2009, 2010
--------------------------------------------------
