In [1]:
!pip install --q catboost
!pip install --q cleanlab

[0m

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import cleanlab

from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

In [3]:
pd.options.mode.chained_assignment = None

# Constants

In [4]:
wb_mapping = {
    0: 'Unknown/not stated',
    1: 'Clay',
    2: 'Mortar + Cement',
    3: 'Mortar + Cement, Clay',
    5: 'Mud + Mortar, Clay',
    7: 'Mud + Mortar, Clay, Cement + Mortar'
}

wm_mapping = {
    0: 'Unknown/not stated',
    1: 'Red Bricks',
    2: 'Stone Bricks',
    3: 'Red Bricks, Stone Bricks'
}

In [5]:
test_number_dict = {
    'two story': 2, 'floor second': 2, 'Floor two': 2, 'There is 2 Floor/Story': 2, 
    'Floor 2': 2, 'two': 2, ' just 2 floor': 2, '2': 2, '2 floor': 2, 'Two Floor': 2, 
    'floor two': 2, 'floor third': 3, 'three Story': 3, 'Three': 3, 'Floor-three': 3, 
    '3.00': 3, 'Floor 3': 3, 'Three floor': 3, ' has 3 Floor': 3, '1': 1, 'one': 1, 
    '1st Story': 1, 'floor one': 1, 'one story': 1, 'Has 1 floor': 1, 'Just 1 floor': 1, 
    'floor 1st': 1, 'Floor 1': 1, 'four Floor': 4, 'Floor 4': 4, 'Fl four': 4, 'Four': 4, 
    'Four Story': 4, 'floor four': 4, ' has Four fl': 4, '4': 4, '6': 6, 'Fl Five': 5, 
    ' Has Five fl': 5, 'fifth': 5, 'five Floor': 5, 'Floor Fifth': 5, '5': 5, '7': 7, 
    '9': 9
}

# Pre-defined Function (Basic pre-processing)

In [6]:
#@title extract_num_floors
def extract_num_floors(floor):
    """
    This function extracts the number of floors from the 'floors_before_eq' column
    and converts it to an integer. If the value is missing or cannot be converted
    to an integer, it returns None.

    Arguments:
    - floor: The value in the 'floors_before_eq' column for a particular building

    Returns:
    - An integer representing the number of floors for the given building, or None
      if the value is missing or cannot be converted to an integer.
    """
    import re

    if floor is None:
        return None
    else:
        floor = str(floor)
        floor = floor.lower()
        if 'one' in floor or 'first' in floor or 'ground' in floor:
            return 1
        elif 'two' in floor or 'second' in floor:
            return 2
        elif 'three' in floor or 'third' in floor:
            return 3
        elif 'four' in floor or 'fourth' in floor:
            return 4
        elif 'five' in floor or 'fifth' in floor:
            return 5
        else:
            floor = re.findall(r'\d+', floor)
            if len(floor) > 0:
                return int(floor[0])
            else:
                return None

In [7]:
#@title standardize_value
def standardize_value(data):
    """
    This function takes a pandas dataframe as input and returns a new pandas dataframe
    where all the categorical values are standardized. For example, if 'RCC' and 'RC' 
    are used interchangeably in the 'foundation_type' column, this function will standardize 
    all the 'RCC' values to 'RC' to avoid any inconsistencies.
    """

    # type_of_foundation
    data['type_of_foundation'] = data['type_of_foundation'].replace('RC', 'Reinforced Concrete')
    data['type_of_foundation'] = data['type_of_foundation'].replace(['Bamboo/TImber', 'Bamboo or Timber', 'Bamboo/Timber'], 'Bamboo/Timber')
    data['type_of_foundation'] = data['type_of_foundation'].replace(['Other', 'Others'], 'Other')
    data['type_of_foundation'] = data['type_of_foundation'].replace(['Cement-Stone/Brick', 'Cement-Stone or Cement-Brick'], 'Cement-Stone/Brick')
    
    # type_of_roof
    data['type_of_roof'] = data['type_of_roof'].replace(['Bamboo/Timber Light roof', 'Bamboo/TImber-Light Roof', 'Bamboo or Timber Light roof'], 'Bamboo/Timber Light roof')
    data['type_of_roof'] = data['type_of_roof'].replace(['Bamboo/Timber Heavy roof ', 'Bamboo/TImber-Heavy Roof', 'Bamboo or Timber Heavy roof'], 'Bamboo/Timber Heavy roof')
    data['type_of_roof'] = data['type_of_roof'].replace(['reinforced cement concrete/rb/rbc', 'rcc/rb/rbc', 'Reinforced Brick Slab/rcc/rbc', 'Reinforced brick concrete/rcc/rbc'], 'rcc/rb/rbc')
    
    # type_of_ground_floor
    data['type_of_ground_floor'] = data['type_of_ground_floor'].replace(['mud', 'Mud'], 'Mud')
    data['type_of_ground_floor'] = data['type_of_ground_floor'].replace(['reinforced concrete', 'Reinforced Concrete', 'RC'], 'Reinforced Concrete ')
    data['type_of_ground_floor'] = data['type_of_ground_floor'].replace(['Brick/Stone', 'brick/stone', 'Brick or Stone'], 'Brick/Stone')
    data['type_of_ground_floor'] = data['type_of_ground_floor'].replace(['TImber', 'Timber', 'Wood', 'Lumber'], 'Wood')

    # type_of_other_floor
    data['type_of_other_floor'] = data['type_of_other_floor'].replace(['TImber/Bamboo-Mud', 'Timber Mud or Bamboo-Mud', 'Wood or Bamboo Mud'], 'Wood or Bamboo Mud')
    data['type_of_other_floor'] = data['type_of_other_floor'].replace(['Timber-Planck', 'Lumber-plank', 'Wood-Plank'], 'Wood-Plank')
    data['type_of_other_floor'] = data['type_of_other_floor'].replace(['rcc/rb/rbc', 'reinforced cement concrete/rb/rbc', 'Reinforced brick concrete/rcc/rbc'], 'rcc/rb/rbc')
    data['type_of_other_floor'] = data['type_of_other_floor'].replace(['Wood or Bamboo Mud', 'Wood-Mud or Bamboo Mud'], 'Wood-Mud or Bamboo Mud ')

    # legal_ownership_status
    data['legal_ownership_status'] = data['legal_ownership_status'].replace(['Private', 'Private Use', 'Prvt', 'Privste'], 'Private')
    data['legal_ownership_status'] = data['legal_ownership_status'].replace(['Public', 'Public Use', 'Public Space'], 'Public')
    data['legal_ownership_status'] = data['legal_ownership_status'].replace(['Institutional', 'Institutional Use', 'Institutionals'], 'Institutional')
    data['legal_ownership_status'] = data['legal_ownership_status'].replace(['Other', 'Unknown', 'Unspecified'], 'Other')

    return data

In [8]:
#@title find_label_issue_train
def find_label_issue_train(data):
    """
    Find label errors in the training data using the Confident Learning framework.

    Arguments:
    - data: A pandas DataFrame containing the training data.

    Returns:
    - A list of indices of examples in the training data that are likely to have label errors.

    References: 
    """
    # Step 1: Apply a machine learning model to the training data to obtain predicted labels.
    # This model should be trained using standard techniques and should not take into account the issue of label errors.
    # Here, we assume that the model is already trained and is available for use.

    # Step 2: Compute the empirical distribution of the predicted labels.
    # This distribution will be used later to estimate the noise distribution.

    # Step 3: Compute the joint distribution of the predicted labels and the true labels.
    # This distribution can be estimated using the empirical distribution of the predicted labels and an estimate of the noise distribution.

    # Step 4: Rank the examples in the training data by their likelihood of being mislabeled.
    # This can be done by computing the conditional probability of the true label given the predicted label.

    # Step 5: Identify examples that are likely to be mislabeled.
    # This can be done by selecting examples with low confidence and high probability of being mislabeled.

    # Step 6: Return the indices of the examples that are likely to be mislabeled.
    # These examples can be further examined to determine if they are indeed mislabeled, and corrected if necessary.
    # Alternatively, they can be removed from the training data to improve the performance of the machine learning model.

    X = data[data.columns.difference(['damage_grade'])]
    y = data['damage_grade']

    numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(sparse=False)

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    X = preprocessor.fit_transform(X)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    model = XGBClassifier(tree_method="hist", enable_categorical=True)
    pred_probs = cross_val_predict(model, X, y, method='predict_proba')

    cl_issue_idx = cleanlab.filter.find_label_issues(y, pred_probs, return_indices_ranked_by='self_confidence')
    
    selected_rows = data.iloc[cl_issue_idx]
    inverted_selection = data[~data.index.isin(cl_issue_idx)]

    return inverted_selection

In [9]:
#@title complete_pipeline
def complete_pipeline(data):
    """
    This function takes a pandas dataframe as input, processes it through
    various steps including missing value imputation, feature engineering,
    and standardization, and returns the processed dataframe ready for
    machine learning modeling. The function is designed to handle both
    training and testing data by distinguishing between them based on the
    presence of the 'damage_grade' column.
    """

    # floors_before_eq (total)
    data['floors_before_eq (total)'] = data['floors_before_eq (total)'].apply(lambda x: extract_num_floors(x))

    # missing value
    data = data[data['floors_before_eq (total)'].notna()]
    data = data[data['plinth_area (ft^2)'].notna()]
    data['technical_solution_proposed'] = data['technical_solution_proposed'].fillna('Unspecified')

    # replace and lambda
    data['plinth_area (ft^2)'] = data['plinth_area (ft^2)'].str.extract('(\d+)').astype(float)
    
    data['no_family_residing'] = data['no_family_residing'].replace('None', 0).astype(float)
    data['wall_binding'] = data['wall_binding'].replace(wb_mapping)
    data['wall_material'] = data['wall_material'].replace(wm_mapping)

    data['type_of_reinforcement_concrete'] = data['type_of_reinforcement_concrete'].apply(lambda x: f'Type: {str(x)}')
    
    # value standardization
    data = standardize_value(data)

    # same group of features, different label
    data = data.groupby(list(data.columns.difference(['damage_grade']))).filter(lambda x: len(set(x['damage_grade'])) == 1)

    # label issue in train data
    data = data.reset_index()
    data.drop(['index'], axis=1, inplace=True)
    data = find_label_issue_train(data)
    
    return data

In [10]:
#@title cross_val_multiclass
def cross_val_multiclass(model, X, y, num_folds=5):
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.metrics import f1_score
    import numpy as np

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    f1_scores = []
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        model.fit(X_train, y_train, verbose=0)
        y_pred = model.predict(X_test)
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    return np.mean(f1_scores), np.std(f1_scores)

In [11]:
#@title categorical_correlation
def categorical_correlation(var1, var2):
    import scipy.stats as stats

    contingency_table = pd.crosstab(var1, var2)
  
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    
    n = contingency_table.sum().sum()
    phi2 = chi2/n
    r, k = contingency_table.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    V = np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))
    
    return {"Chi-squared test": {"test statistic": chi2, "p-value": p}, "Cramer's V": V}

# Basic Data Cleaning

In [12]:
train = pd.read_csv('/kaggle/input/jointsugm/train.csv')
train.drop(['Unnamed: 0'], axis=1, inplace=True)

train = complete_pipeline(train)
train = train.reset_index()
train.drop(['index'], axis=1, inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# Inverse (will be predicted)
train['technical_solution_proposed'] = train['technical_solution_proposed'].replace('Unspecified', np.nan)

### Predict the missing technical_solution_proposed for imputing train

In [14]:
titr = train.copy()

In [15]:
titr = titr[titr['technical_solution_proposed'].notna()]

In [16]:
X_titr = titr[titr.columns.difference(['technical_solution_proposed'])]
y_titr = titr['technical_solution_proposed']

numerical_cols = X_titr.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_titr.select_dtypes(include=['object']).columns.tolist()

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse=False)

titr_preproc = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

titr_preproc = titr_preproc.fit(train[train.columns.difference(['technical_solution_proposed'])])
X_titr = titr_preproc.transform(X_titr)

titr_le = LabelEncoder()
y_titr = titr_le.fit_transform(y_titr)

In [17]:
cb_titr = XGBClassifier()
cb_titr.fit(X_titr, y_titr, verbose=0)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [18]:
train.loc[train['technical_solution_proposed'].isna(), 'technical_solution_proposed'] = titr_le.inverse_transform(cb_titr.predict(titr_preproc.transform(train.loc[train['technical_solution_proposed'].isna(), train.columns.difference(['technical_solution_proposed'])]))).ravel()

# Modeling

In [19]:
X = train[train.columns.difference(['damage_grade'])]
y = train['damage_grade']

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

X = preprocessor.fit_transform(X)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [20]:
# Create train-validation-test split with stratification
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.6, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42, stratify=y_val)

## Manual

### LightGBM

In [21]:
lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train, verbose=0, eval_set=(X_val, y_val))



LGBMClassifier()

In [22]:
y_val_pred = lgbm.predict(X_val)
print(classification_report(y_val, y_val_pred, target_names=['1', '2', '3', '4', '5']))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      4562
           2       0.98      0.99      0.98      3230
           3       1.00      1.00      1.00      5108
           4       0.75      0.53      0.62      6808
           5       0.85      0.94      0.89     18992

    accuracy                           0.88     38700
   macro avg       0.91      0.89      0.90     38700
weighted avg       0.88      0.88      0.88     38700



### CatBoost

In [23]:
catboost = CatBoostClassifier()
catboost.fit(X_train, y_train, verbose=0, eval_set=(X_val, y_val))

<catboost.core.CatBoostClassifier at 0x79fe938fce50>

In [24]:
y_val_pred = catboost.predict(X_val)
print(classification_report(y_val, y_val_pred, target_names=['1', '2', '3', '4', '5']))

              precision    recall  f1-score   support

           1       0.99      0.98      0.99      4562
           2       0.97      0.99      0.98      3230
           3       1.00      1.00      1.00      5108
           4       0.74      0.55      0.63      6808
           5       0.85      0.93      0.89     18992

    accuracy                           0.88     38700
   macro avg       0.91      0.89      0.90     38700
weighted avg       0.88      0.88      0.88     38700



### XGBoost

In [25]:
xgboost = XGBClassifier()
xgboost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=0, reg_alpha=0, ...)

In [26]:
y_val_pred = xgboost.predict(X_val)
print(classification_report(y_val, y_val_pred, target_names=['1', '2', '3', '4', '5']))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99      4562
           2       0.98      0.99      0.98      3230
           3       1.00      1.00      1.00      5108
           4       0.75      0.54      0.63      6808
           5       0.85      0.94      0.89     18992

    accuracy                           0.88     38700
   macro avg       0.91      0.89      0.90     38700
weighted avg       0.88      0.88      0.88     38700



## CV

In [27]:
X_cv, _, y_cv, _ = train_test_split(X, y, train_size=0.8, random_state=42, stratify=y)

### LightGBM

In [28]:
f1_avg, f1_std = cross_val_multiclass(lgbm, X_cv, y_cv)



In [29]:
print(f'Avg: {f1_avg}, Std: {f1_std}')

Avg: 0.8975014586431861, Std: 0.0021159408021302014


### CatBoost

In [30]:
f1_avg, f1_std = cross_val_multiclass(catboost, X_cv, y_cv)

In [31]:
print(f'Avg: {f1_avg}, Std: {f1_std}')

Avg: 0.8978941109293924, Std: 0.0023774648080181946


### XGBoost

In [32]:
f1_avg, f1_std = cross_val_multiclass(xgboost, X_cv, y_cv)

In [33]:
print(f'Avg: {f1_avg}, Std: {f1_std}')

Avg: 0.8994727843053253, Std: 0.0026312125680375826
