In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
# import shap
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
train = pd.read_csv('C:/Users/KIHyuk/Desktop/dacon_data/Data_병원개폐업/train.csv')

In [4]:
#Reading the train and test files
train_prod_df = pd.read_csv('C:/Users/KIHyuk/Desktop/dacon_data/Data_병원개폐업/train.csv')
test_prod_df = pd.read_csv('C:/Users/KIHyuk/Desktop/dacon_data/Data_병원개폐업/test.csv')

# #Removing the comma in the employee1 and 2 columns in the test dataset and replace it with empty space and convert it to float format.
test_prod_df.employee1 = test_prod_df.employee1.astype('str').str.replace(",", "").astype('float')
test_prod_df.employee2 = test_prod_df.employee2.astype('str').str.replace(",", "").astype('float')

# #Converting the employee1 and 2 column as float in the train set as done for the test dataset
train_prod_df.employee1 = train_prod_df.employee1.astype('float')
train_prod_df.employee2 = train_prod_df.employee2.astype('float')
train_prod_df.OC= train_prod_df.OC.astype('str').str.replace(" ","")

# #Combining the train and test dataset
train_test_prod = train_prod_df.append(test_prod_df)

# #Get the object and numeric columns seperately 
factor_columns = train_test_prod.select_dtypes(include = ['object']).columns
numeric_columns = train_test_prod.columns.difference(factor_columns)

# #After analysis realized that the bed counts of these two hospitals may have had wrong entries.
# #Filling up the empty instkind and bedCount for hospital id 430 and 413
train_test_prod.loc[train_test_prod.inst_id == 430, ['instkind']] = 'dental_clinic'
train_test_prod.loc[train_test_prod.inst_id == 430, ['bedCount']] = 0
train_test_prod.loc[train_test_prod.inst_id == 413, ['bedCount']] = -999

# #Fill the empty values in the object columns as "Not sure"
train_test_prod[factor_columns] = train_test_prod[factor_columns].fillna('Not_sure')

# #Fill all the empty values in the numeric columns as -999
train_test_prod[numeric_columns] = train_test_prod[numeric_columns].fillna(-999)

# #Convert all the object columns to numeric since the ML algorithms don't accept object features directly 
fac_le = LabelEncoder()
# Nan => 0, close => 1, open => 2
train_test_prod[factor_columns] = train_test_prod.loc[:,factor_columns].apply(lambda x : fac_le.fit_transform(x))

# #Splitting back data to train prod and test prod
train_prod = train_test_prod.loc[train_test_prod.OC != 0,]
test_prod = train_test_prod.loc[train_test_prod.OC == 0,] 
train_prod['OC'] = train_prod['OC'] - 1 # close=>0, open=1

# #Obtain the submission ID to create the submission file later
sub_id = test_prod.inst_id

# #Get the dependent and independent column
dep = 'OC'
indep = train_prod.columns.difference([dep])

train_prod_X = train_prod[indep]
train_prod_Y = train_prod[dep]
test_prod_X = test_prod[indep]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [7]:
############################################################################
############ Random Forest
############################################################################
estimators = 10
np.random.seed(100)
RF_prod = RandomForestClassifier(n_estimators = estimators)
RF_prod_model = RF_prod.fit(train_prod_X, train_prod_Y)
RF_prod_prediction = RF_prod.predict_proba(test_prod_X)[:,1]
sub_RF = pd.DataFrame({'inst_id' : sub_id , 'OC' : RF_prod_prediction })
sub_RF = sub_RF[['inst_id', 'OC']]

In [8]:
############################################################################
############ Support Vector Machine
############################################################################

from sklearn import svm
svm_classifier = svm.SVC(C=1.0, gamma=0.001,probability=True)

svm_classifier.fit(train_prod_X,train_prod_Y)
svm_classifier_prediction = svm_classifier.predict_proba(test_prod_X)[:,1]
sub_SVM = pd.DataFrame({'inst_id' : sub_id , 'OC' : svm_classifier_prediction })
sub_SVM = sub_RF[['inst_id', 'OC']]

In [9]:
############################################################################
#Ensembling the three models
############################################################################

#Forming the ensemble dataset of the 3 models
ensemble = pd.DataFrame()
ensemble['inst_id'] = sub_SVM['inst_id']
ensemble['SVM'] = sub_SVM['OC']
# ensemble['GBM'] = sub_GBM['OC']
ensemble['RF'] = sub_RF['OC']

# Taking the average of all 3 models
ensemble['ens'] = (ensemble['SVM'] + ensemble['RF'])/2
ensemble['OC'] = (ensemble['ens'] > 0.7).astype('int') #0.7 as the threshold and above that the hospital is closed.

#Printing to see all the hospitals that are classified as closed 
print(ensemble.loc[ensemble['OC'] == 0, ])

ensemble = ensemble.loc[:, ['inst_id', 'OC']]

#ensemble.to_csv('ens_XGB_7_RF_4_GBM_2_39.csv', index = False)

     inst_id  SVM   RF  ens  OC
2          6  0.6  0.6  0.6   0
8         21  0.7  0.7  0.7   0
10        24  0.7  0.7  0.7   0
22        64  0.1  0.1  0.1   0
23        66  0.7  0.7  0.7   0
40       123  0.7  0.7  0.7   0
45       133  0.7  0.7  0.7   0
48       151  0.7  0.7  0.7   0
52       165  0.6  0.6  0.6   0
78       229  0.6  0.6  0.6   0
84       258  0.6  0.6  0.6   0
88       293  0.7  0.7  0.7   0
99       341  0.4  0.4  0.4   0
120      413  0.4  0.4  0.4   0
122      424  0.3  0.3  0.3   0
123      425  0.7  0.7  0.7   0
124      429  0.7  0.7  0.7   0
126      431  0.6  0.6  0.6   0
