# Factor analysis of mixed data (FAMD)

In [3]:
import numpy as np
import pandas as pd

# Use plotly for DataFrame.plot() and show every column when a frame is displayed.
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_columns', None)

# Cast to str so these id columns get the object dtype and are picked up as
# categorical by the select_dtypes() call below.
GEO_COLUMNS = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']
# Cast to float so train and test carry the same numeric dtype.
MEASURE_COLUMNS = ['count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage']

X_train = pd.read_csv('../data/train_values.csv', index_col='building_id')
y_train = pd.read_csv('../data/train_labels.csv', index_col='building_id')
X_test = pd.read_csv('../data/test_values.csv', index_col='building_id')

# Apply exactly the same preprocessing to both splits; previously the float
# cast was applied to X_train only, leaving X_test with different dtypes.
for features in (X_train, X_test):
    features[GEO_COLUMNS] = features[GEO_COLUMNS].astype(str)
    features[MEASURE_COLUMNS] = features[MEASURE_COLUMNS].astype(float)
    # Derived feature: product of the area and height percentages.
    features['volume_percentage'] = features['area_percentage'] * features['height_percentage']

# Categorical columns are the object-dtype ones; everything else is numerical.
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()
# Keep the DataFrame's column order (a set difference would give an arbitrary order).
numerical_columns = [c for c in X_train.columns if c not in set(categorical_columns)]

### Draw a balanced subsample (1,000 rows from each target class)

In [4]:
from sklearn.utils import resample

def sub_sample(X, y, num_, random_state=None):
    """Draw a class-balanced subsample with ``num_`` rows per damage grade.

    The features and labels are joined on their index, ``num_`` rows are
    drawn without replacement from each of the damage grades 1, 2 and 3,
    and the draws are stacked in that grade order.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table.
    y : pandas.DataFrame
        Label table containing a ``damage_grade`` column, indexed like ``X``.
    num_ : int
        Number of rows to keep for each damage grade.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed or random generator for the draws. ``None`` (the default)
        keeps the previous unseeded behaviour.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(X_sampled, y_sampled)``; ``y_sampled`` holds only the
        ``damage_grade`` column and shares its index with ``X_sampled``.

    Raises
    ------
    ValueError
        If a damage grade has fewer than ``num_`` rows, because sampling is
        done without replacement.
    """
    data = pd.concat([X, y], axis=1)

    # Share ONE generator across the three draws so that an integer seed does
    # not replay the same random stream for every grade.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Down-sample every grade to the same size (no replacement, so no row is
    # ever duplicated).
    per_grade = [
        data[data['damage_grade'] == grade].sample(
            n=num_, replace=False, random_state=rng
        )
        for grade in (1, 2, 3)
    ]
    data_sampled = pd.concat(per_grade)

    y_sampled = data_sampled[['damage_grade']]
    X_sampled = data_sampled.drop(['damage_grade'], axis=1)

    return (X_sampled, y_sampled)

# Keep 1000 buildings per damage grade; this balanced subset is what FAMD is fitted on.
X_train_sub, y_train_sub = sub_sample(X_train, y_train, 1000)

## Fit FAMD and plot the data in the FAMD subspace

In [5]:
import prince

# Two-component FAMD with a fixed seed, fitted on the balanced subsample.
famd = prince.FAMD(
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    engine="sklearn",
).fit(X_train_sub)

  X = self.scaler_.transform(X.to_numpy())


In [6]:
import plotly.express as px

# Coordinates of the subsample rows on the two fitted FAMD components.
famd_coords = famd.row_coordinates(X_train_sub)
famd_coords.columns = ["C1", "C2"]

# Put the damage grade next to the coordinates so it can colour the markers.
X_train_2d = pd.concat([famd_coords, y_train_sub], axis=1)

fig = px.scatter(
    X_train_2d,
    x='C1',
    y='C2',
    color='damage_grade',
    width=800,
    height=500,
    title="Factor analysis of mixed data",
)
fig.show()

  X = self.scaler_.transform(X.to_numpy())
