In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score,precision_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_excel('Data_08_02_2024.xlsx')
print(df.isna().mean())
print('Old shape',df.shape)

threshold = len(df) * 0.20 # Calculate 20% of the total number of rows
dropped_columns = list(df.columns[df.isna().sum() > threshold])
df = df.dropna(thresh=threshold, axis=1)
print("Columns deleted:", dropped_columns)

print('New shape',df.shape)
df.drop(columns='business_id',inplace=True)

y = df[['hard_closure' ,'soft_closure']]
X = df.drop(columns=['hard_closure' ,'soft_closure'])

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import pandas as pd

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

preprocessor.fit(X)
processed_data = preprocessor.transform(X)

X = pd.DataFrame(processed_data)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print(y['hard_closure'].value_counts(normalize=True)*100)
print(y['soft_closure'].value_counts(normalize=True)*100)

In [31]:
from sklearn.metrics import classification_report,accuracy_score
from sklearn import ensemble,neighbors,tree

from tqdm.notebook import tqdm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# https://scikit-learn.org/stable/modules/multiclass.html
MLA = [
    ensemble.RandomForestClassifier(random_state = 1,n_jobs=-1),
    ensemble.ExtraTreesClassifier(n_jobs=-1),
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
]

MLA_columns = ['MLA_names', 'Hard_Test_Accuracy','Hard_Test_F1','Hard_Test_Precision',
                            'Soft_Test_Accuracy','Soft_Test_F1','Soft_Test_Precision']

classifiers = {}
def apply_models(X_train,y_train,X_test,y_test):
    MLA_compare = pd.DataFrame(columns = MLA_columns)

    row_index = 0
    for alg in tqdm(MLA):
        try:
            MLA_name = alg.__class__.__name__
            MLA_compare.loc[row_index, 'MLA_names'] = MLA_name

            multi_target_forest = MultiOutputClassifier(alg, n_jobs=-1)
            multi_target_forest.fit(X_train, y_train)
            
            y_pred = multi_target_forest.predict(X_test)
           
            classifiers[MLA_name] = multi_target_forest
            ############## Hard closure results ###########
            acc = accuracy_score(y_test['hard_closure'],y_pred[:,0])
            f1 = f1_score(y_test['hard_closure'],y_pred[:,0])
            precision = precision_score(y_test['hard_closure'],y_pred[:,0])
            
            MLA_compare.loc[row_index, 'Hard_Test_Accuracy'] = acc
            MLA_compare.loc[row_index, 'Hard_Test_F1'] = f1
            MLA_compare.loc[row_index, 'Hard_Test_Precision'] = precision
            
            ########## Soft closure results ############
            acc = accuracy_score(y_test['soft_closure'],y_pred[:,1])
            f1 = f1_score(y_test['soft_closure'],y_pred[:,1])
            precision = precision_score(y_test['soft_closure'],y_pred[:,1])
            
            MLA_compare.loc[row_index, 'Soft_Test_Accuracy'] = acc
            MLA_compare.loc[row_index, 'Soft_Test_F1'] = f1
            MLA_compare.loc[row_index, 'Soft_Test_Precision'] = precision
            
            row_index += 1
        except:
            print(type(alg).__name__)
    return MLA_compare

In [32]:
print(apply_models(X_train,y_train,X_test,y_test))

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,MLA_names,Hard_Test_Accuracy,Hard_Test_F1,Hard_Test_Precision,Soft_Test_Accuracy,Soft_Test_F1,Soft_Test_Precision
0,RandomForestClassifier,0.906023,0.268551,0.640449,0.999849,0.999878,0.999755
1,ExtraTreesClassifier,0.900424,0.212919,0.539394,0.972609,0.977959,0.973219
2,DecisionTreeClassifier,0.859791,0.331771,0.321454,1.0,1.0,1.0
3,ExtraTreeClassifier,0.852149,0.271982,0.271982,0.88915,0.910469,0.909413


# Prediction

In [33]:
print(classifiers.keys())

test_data = df.sample(10)

y_test_data = test_data[['hard_closure' ,'soft_closure']]
X_test_data = test_data.drop(columns=['hard_closure' ,'soft_closure'])

X_test_data_processed = preprocessor.transform(X_test_data)
X_test_data_processed = pd.DataFrame(X_test_data_processed)

clf = classifiers['DecisionTreeClassifier']
pred = clf.predict(X_test_data_processed)
newdf = pd.DataFrame(pred,columns=['hard_closure' ,'soft_closure'])
print(newdf)

dict_keys(['RandomForestClassifier', 'ExtraTreesClassifier', 'DecisionTreeClassifier', 'ExtraTreeClassifier'])