In [1]:
# Load the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [20]:
# Read csv

data = pd.read_csv("attributes_unclass.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,ACCOUNTNO,Parcelnumb,ACCTTYPE,X.CENTROID,Y.CENTROID,NUMBEROFUNITS,BLTASSTORIES,ROOFTYPE,ROOFCOVER,...,MASONRY.SHED.SQFT,METAL.SHED.SQFT,OPEN.SHED.SQFT,RES.BARN.AVERAGE.SQFT,RES.BARN.GOOD.SQFT,RES.BARN.LOW.SQFT,Solar.Room.SQFT,Wood.Deck.SQFT,WOOD.SHED.SQFT,WOODBURNING.STOVE.COUNT
0,1,R007213,379900000022,Agricultural,2737550.838,1238081.141,1.0,2.0,Cross Gabled or Hip/Gable,Cedar Shingle,...,,,300.0,,,,,,,
1,2,R032396,325700002044,Residential,2596049.988,1351554.456,1.0,2.0,Cross Gabled or Hip/Gable,Cedar Shingle,...,,,,,,,,1528.0,,9.0
2,3,R001798,370136351003,Residential,2590939.797,1264708.817,1.0,1.0,Cross Gabled or Hip/Gable,Seamed Metal - Painted,...,,,,,,,,340.0,,3.0
3,4,R004887,317723102001,Residential,2583598.549,1395667.36,1.0,2.0,Cross Gabled or Hip/Gable,Seamed Metal - Painted,...,,,,,,,,1725.0,,3.0
4,5,R014782,425101000002,Residential,2432848.938,1168297.398,1.0,2.0,Gable or Hip,Corrugated Metal,...,,,32.0,,,,,64.0,90.0,3.0


In [3]:
# CLASSIFYING SIDING AND ROOF MATERIALS AS COMBUSTIBLE OR NON-COMBUSTIBLE

#### ROOFING ####------------------------------------------------------------------------
# Map ROOFCOVER types to classes
roof_classes = {
    'Combustible': ['Cedar Shingle', 
                    'Wood Shingle', 
                    'Other'],
    'Non_Combustible': ['Corrugated Metal', 
                        'Preformed Metal', 
                        'Standing Seam - Painted',
                        'Seamed Metal - Rust Detail',
                        'Fiber Cement Tile',
                        'Composition - Asphalt Roll',
                        'Metal',
                        'Seamed Metal',
                        'Metal Ribbed',
                        'Corrugated',
                        'Seamed Metal - Painted',
                        'Seamed Metal - Galvanized',
                        'Composition - Asphalt Shingle',
                        'Composition Roll',
                        'Metal Tile',
                        'Standing Seam - Rusted',
                        'Composition Shingle',
                        'Standing Seam',
                        'METAL'
                       ],
    'NAN' : [''
            ]
}

# Create a new column "roof_combust" based on the mapping
data['roof_combust'] = 0  # Initialize to 0 (non-combustible)

# Set NaN to specific ROOFCOVER values
data.loc[data['ROOFCOVER'].isin(roof_classes['NAN']), 'roof_combust'] = pd.NA

# Set combustible roof types to 1
data.loc[data['ROOFCOVER'].isin(roof_classes['Combustible']), 'roof_combust'] = 1
    
#### SIDING ####--------------------------------------------------------------------
siding_classes = {
    'Combustible' : ["Logs - Sheared or Milled",
                     "Logs - Full",
                     "Stucco & Wood or Log",
                     "Synthetic Wood (Particle Bd)",
                     "Wood Sheet (Plywood, T1-11)",
                     "Wood Boards (Clapboard, etc)",
                     "Stone & Wood or Log",
                     "Vinyl / Metal",
                     "Other",
                     "High-end Hardwood or Barnwood",
                     "Brick & Wood or Log"
                    ],
    'Non_Combustible' : ['Stucco',
                         'Stucco & Stone',
                         'Brick',
                         'Stone',
                         'Fiber-Cement'
                        ],
    'NAN' : ['',
             'Allowance'
            ]
}
# Create a new column "siding_combust" and initialize to 1 (combustible)
data['siding_combust'] = 1

# Set the value to NaN for rows with specific 'SIDING' values
data.loc[data['SIDING'].isin(siding_classes['NAN']), 'siding_combust'] = pd.NA

# Define conditions for non-combustible siding
non_combustible_conditions = [
    data['SIDING'].isin(siding_classes['Non_Combustible']),
    data['SIDINGCOMMERCIAL'].eq('Metal Siding')
]

# Set the value to 0 for rows meeting non-combustible conditions
data.loc[pd.concat(non_combustible_conditions, axis=1).any(axis=1), 'siding_combust'] = 0

# Set new columns to type `Int64`
data['roof_combust'] = pd.to_numeric(data['roof_combust'], errors='coerce').astype('Int64') # Roof
data['siding_combust'] = pd.to_numeric(data['siding_combust'], errors='coerce').astype('Int64') # Siding

attribute_data = data.copy()
attribute_data.head()

Unnamed: 0.1,Unnamed: 0,ACCOUNTNO,Parcelnumb,ACCTTYPE,X.CENTROID,Y.CENTROID,NUMBEROFUNITS,BLTASSTORIES,ROOFTYPE,ROOFCOVER,...,OPEN.SHED.SQFT,RES.BARN.AVERAGE.SQFT,RES.BARN.GOOD.SQFT,RES.BARN.LOW.SQFT,Solar.Room.SQFT,Wood.Deck.SQFT,WOOD.SHED.SQFT,WOODBURNING.STOVE.COUNT,roof_combust,siding_combust
0,1,R007213,379900000022,Agricultural,2737550.838,1238081.141,1.0,2.0,Cross Gabled or Hip/Gable,Cedar Shingle,...,300.0,,,,,,,,1,1
1,2,R032396,325700002044,Residential,2596049.988,1351554.456,1.0,2.0,Cross Gabled or Hip/Gable,Cedar Shingle,...,,,,,,1528.0,,9.0,1,1
2,3,R001798,370136351003,Residential,2590939.797,1264708.817,1.0,1.0,Cross Gabled or Hip/Gable,Seamed Metal - Painted,...,,,,,,340.0,,3.0,0,1
3,4,R004887,317723102001,Residential,2583598.549,1395667.36,1.0,2.0,Cross Gabled or Hip/Gable,Seamed Metal - Painted,...,,,,,,1725.0,,3.0,0,1
4,5,R014782,425101000002,Residential,2432848.938,1168297.398,1.0,2.0,Gable or Hip,Corrugated Metal,...,32.0,,,,,64.0,90.0,3.0,0,1


In [7]:
columns_to_drop = ['Unnamed: 0',
                                 'ACCOUNTNO',
                                 'SIDING',
                                 'SIDINGCOMMERCIAL',
                                 'ROOFTYPE',
                                 'ROOFCOVER',
                                 'BEDROOMCOUNT',
                                 'BATHCOUNT',
                                 'BLTASSTORIES',
                                 'CONSTRUCTIONSTYLE',
                                 'Detached.Unfin.Area',
                                 'Covered.Porch.SQFT',
                                 'Enclosed.Porch.SQFT',
                                 'Attached.Garage.SQFT',
                                 'Built.In.Garage.SQFT',
                                 'GREENHOUSE.SQFT',
                                 'Solar.Room.SQFT'
                  ]

attribute_data.drop(columns=columns_to_drop, inplace=True)

KeyError: "['Unnamed: 0', 'ACCOUNTNO', 'SIDING', 'SIDINGCOMMERCIAL', 'ROOFTYPE', 'ROOFCOVER', 'BEDROOMCOUNT', 'BATHCOUNT', 'BLTASSTORIES', 'CONSTRUCTIONSTYLE', 'Detached.Unfin.Area', 'Covered.Porch.SQFT', 'Enclosed.Porch.SQFT', 'Attached.Garage.SQFT', 'Built.In.Garage.SQFT', 'GREENHOUSE.SQFT', 'Solar.Room.SQFT'] not found in axis"

In [5]:
attribute_data.head()

Unnamed: 0,Parcelnumb,ACCTTYPE,X.CENTROID,Y.CENTROID,NUMBEROFUNITS,Detached.Garage.SQFT,FINISHED.SHED.SQFT,MASONRY.SHED.SQFT,METAL.SHED.SQFT,OPEN.SHED.SQFT,RES.BARN.AVERAGE.SQFT,RES.BARN.GOOD.SQFT,RES.BARN.LOW.SQFT,Wood.Deck.SQFT,WOOD.SHED.SQFT,WOODBURNING.STOVE.COUNT,roof_combust,siding_combust
0,379900000022,Agricultural,2737550.838,1238081.141,1.0,,,,,300.0,,,,,,,1,1
1,325700002044,Residential,2596049.988,1351554.456,1.0,,,,,,,,,1528.0,,9.0,1,1
2,370136351003,Residential,2590939.797,1264708.817,1.0,99.0,,,,,,,,340.0,,3.0,0,1
3,317723102001,Residential,2583598.549,1395667.36,1.0,,,,,,,,,1725.0,,3.0,0,1
4,425101000002,Residential,2432848.938,1168297.398,1.0,,,,,32.0,,,,64.0,90.0,3.0,0,1


In [6]:
#### MORE DATA CLEANING AND MUTATION ####

# Replace NaN values in 'Wood.Deck.SQFT' with 0
attribute_data['Wood.Deck.SQFT'].fillna(0, inplace=True)

# Replace NaN values in 'WOODBURNING.STOVE.COUNT' with 0
attribute_data['WOODBURNING.STOVE.COUNT'].fillna(0, inplace=True)

# Rename columns for consistency
attribute_data.rename(columns={'roof_combust': 'ROOF.COMBUST'}, inplace=True)
attribute_data.rename(columns={'siding_combust': 'SIDING.COMBUST'}, inplace=True)
attribute_data.rename(columns={'NUMBEROFUNITS': 'NUM.UNITS'}, inplace=True)
attribute_data.rename(columns={'Wood.Deck.SQFT': 'WOOD.DECK.SQFT'}, inplace=True)
attribute_data.rename(columns={'ACCTTYPE': 'PTY.TYPE'}, inplace=True)
attribute_data.rename(columns={'WOODBURNING.STOVE.COUNT' : 'WOOD.STOVE.COUNT'}, inplace = True)

# Function to determine DETACHED.STORAGE based on shed, barn, and garage columns
detached_storage_cols = ['Detached.Garage.SQFT',
                         'FINISHED.SHED.SQFT',
                         'MASONRY.SHED.SQFT',
                         'METAL.SHED.SQFT',
                         'OPEN.SHED.SQFT',
                         'WOOD.SHED.SQFT',
                         'RES.BARN.AVERAGE.SQFT',
                         'RES.BARN.GOOD.SQFT',
                         'RES.BARN.LOW.SQFT'
                        ]

def determine_detached_storage(row):
    if any(row[col] > 0 for col in detached_storage_cols):
        return 1
    else:
        return 0

# Create a new column DETACHED.STORAGE
attribute_data['DETACHED.STORAGE'] = attribute_data.apply(determine_detached_storage, axis=1)

# Drop columns from detached_storage_cols
attribute_data.drop(columns=detached_storage_cols, inplace=True)

attribute_data.head()

Unnamed: 0,Parcelnumb,PTY.TYPE,X.CENTROID,Y.CENTROID,NUM.UNITS,WOOD.DECK.SQFT,WOOD.STOVE.COUNT,ROOF.COMBUST,SIDING.COMBUST,DETACHED.STORAGE
0,379900000022,Agricultural,2737550.838,1238081.141,1.0,0.0,0.0,1,1,1
1,325700002044,Residential,2596049.988,1351554.456,1.0,1528.0,9.0,1,1,0
2,370136351003,Residential,2590939.797,1264708.817,1.0,340.0,3.0,0,1,1
3,317723102001,Residential,2583598.549,1395667.36,1.0,1725.0,3.0,0,1,0
4,425101000002,Residential,2432848.938,1168297.398,1.0,64.0,3.0,0,1,1


In [9]:
attribute_data.to_csv('attributes_pd.csv', index = False)

In [15]:
# ASSIGN RISK SCORES RANDOMLY TO PARCELS CONTAINING STRUCTURES

import pandas as pd
import numpy as np

# Set a seed for reproducability
np.random.seed(42)

# Generate random integers between 1 and 5 for 'RISK.SCORE'
attribute_data['RISK.SCORE'] = np.random.randint(1, 6, size=len(attribute_data))

# Convert 'RISK.SCORE' to ordered categorical data
attribute_data['RISK.SCORE'] = pd.Categorical(attribute_data['RISK.SCORE'],
                                              categories=[1, 2, 3, 4, 5],
                                              ordered=True)

# Print the updated DataFrame
print(attribute_data.head())


     Parcelnumb      PTY.TYPE   X.CENTROID   Y.CENTROID  NUM.UNITS  \
0  379900000022  Agricultural  2737550.838  1238081.141        1.0   
1  325700002044   Residential  2596049.988  1351554.456        1.0   
2  370136351003   Residential  2590939.797  1264708.817        1.0   
3  317723102001   Residential  2583598.549  1395667.360        1.0   
4  425101000002   Residential  2432848.938  1168297.398        1.0   

   WOOD.DECK.SQFT  WOOD.STOVE.COUNT  ROOF.COMBUST  SIDING.COMBUST  \
0             0.0               0.0             1               1   
1          1528.0               9.0             1               1   
2           340.0               3.0             0               1   
3          1725.0               3.0             0               1   
4            64.0               3.0             0               1   

   DETACHED.STORAGE RISK.SCORE  
0                 1          4  
1                 0          5  
2                 1          3  
3                 0          5  

In [16]:
# DROP ROWS WHERE THERE ARE NO STRUCTURES AND SAVE THIS AS A SEPARATE DF FOR TRAINING AND VALIDATION

# Drop rows where 'NUM.UNITS' == 0 and save the resulting DataFrame as a new one
risk_scores_df = attribute_data[attribute_data['NUM.UNITS'] != 0].copy()

# Reset the index if needed
risk_scores_df.reset_index(drop=True, inplace=True)

# Preview resulting df
print(risk_scores_df.head())

     Parcelnumb      PTY.TYPE   X.CENTROID   Y.CENTROID  NUM.UNITS  \
0  379900000022  Agricultural  2737550.838  1238081.141        1.0   
1  325700002044   Residential  2596049.988  1351554.456        1.0   
2  370136351003   Residential  2590939.797  1264708.817        1.0   
3  317723102001   Residential  2583598.549  1395667.360        1.0   
4  425101000002   Residential  2432848.938  1168297.398        1.0   

   WOOD.DECK.SQFT  WOOD.STOVE.COUNT  ROOF.COMBUST  SIDING.COMBUST  \
0             0.0               0.0             1               1   
1          1528.0               9.0             1               1   
2           340.0               3.0             0               1   
3          1725.0               3.0             0               1   
4            64.0               3.0             0               1   

   DETACHED.STORAGE RISK.SCORE  
0                 1          4  
1                 0          5  
2                 1          3  
3                 0          5  

In [18]:
# EXPORT CSV FOR ANALYSIS IN R

risk_scores_df.to_csv("C:/Users/by197116/Documents/Projects/GUC_ParcelLevelRisk/risk_scores_df.csv", index = False)

In [1]:
print(risk_scores_df.dtypes)

NameError: name 'risk_scores_df' is not defined