In [15]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
np.set_printoptions(suppress=True)
import warnings
warnings.filterwarnings('ignore')

def read_train_data():
    train = pd.read_csv("train.csv")
    return train

def read_test_data():
    test = pd.read_csv("test.csv")
    return test

def check_null_values_train():
    train = read_train_data()
    return train.isnull().sum()


def check_null_values_test():
    test = read_test_data()
    return test.isnull().sum()



def filling_values_train():
    train = read_train_data()
    train["Bed Grade"] = train["Bed Grade"].fillna(int(train["Bed Grade"].mode()))
    train["City_Code_Patient"] = train["City_Code_Patient"].fillna(int(train["City_Code_Patient"].mode()))
    return train


def filling_values_test():
    test = read_test_data()
    test["Bed Grade"] = test["Bed Grade"].fillna(int(test["Bed Grade"].mode()))
    test["City_Code_Patient"] = test["City_Code_Patient"].fillna(int(test["City_Code_Patient"].mode()))
    return test

In [17]:
np.random.seed(42)
pd.set_option('mode.chained_assignment', None)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

def label_encoding():
    train = filling_values_train()
    lee = le.fit(train["Stay"])
    train["Stay"] = lee.transform(train["Stay"])
    return train

def dummy_column():
    test = filling_values_test()
    test["Stay"] = -1
    return test



def concat():
    train = label_encoding()
    test = dummy_column()
    df = pd.DataFrame(columns=train.columns)
    df = pd.concat(train,test,ignore_index=True,axis=0)
    return df

def label_encoding_df():
    df = concat()
    for i in df[['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age']]:
        mod = le.fit(df[i])
        df[i] = mod.transform(df[i])

    return df

In [18]:
def separating_train():
    df = label_encoding_df()
    train = df[df["Stay"]!=-1]
    return train

def separating_test():
    df = label_encoding_df()
    test = df[df["Stay"]==-1]
    return test

def drop_duplicates_test():
    test2 = test.copy()
    test1 = test2.drop(['Stay', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'],axis=1,inplace=False)
    return test1

def drop_duplicates_train():
    train2 = train.copy()
    train1 = train2.drop(['case_id', 'patientid', 'Hospital_region_code', 'Ward_Facility_Code'],axis=1,inplace=False)
    return train1

In [19]:
def get_countid_enocde(train, test, cols, name):
    temp = train.groupby(cols)['case_id'].count().reset_index().rename(columns = {'case_id': name})
    temp2 = test.groupby(cols)['case_id'].count().reset_index().rename(columns = {'case_id': name})
    train = pd.merge(train, temp, how='left', on= cols)
    test = pd.merge(test,temp2, how='left', on= cols)
    train[name] = train[name].astype('float')
    test[name] = test[name].astype('float')
    train[name].fillna(np.median(temp[name]), inplace = True)
    test[name].fillna(np.median(temp2[name]), inplace = True)
    return train, test

train = separating_train()
test = separating_test()
train, test = get_countid_enocde(train, test, ['patientid'], name = 'count_id_patient')
train, test = get_countid_enocde(train, test,
                                 ['patientid', 'Hospital_region_code'], name = 'count_id_patient_hospitalCode')
train, test = get_countid_enocde(train, test,
                                 ['patientid', 'Ward_Facility_Code'], name = 'count_id_patient_wardfacilityCode')

TypeError: concat() takes 1 positional argument but 2 positional arguments (and 2 keyword-only arguments) were given

In [20]:
train1 = drop_duplicates_train()

NameError: name 't3' is not defined

In [3]:
train1

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Available Extra Rooms in Hospital,Department,Ward_Type,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay,count_id_patient,count_id_patient_hospitalCode,count_id_patient_wardfacilityCode
0,8,2,3,3,3,2,2.0,7.0,0,0,2,5,4911.0,0,14.0,4.0,5.0
1,2,2,5,2,3,3,2.0,7.0,1,0,2,5,5954.0,4,14.0,4.0,5.0
2,10,4,1,2,1,3,2.0,7.0,1,0,2,5,4745.0,3,14.0,4.0,2.0
3,26,1,2,2,3,2,2.0,7.0,1,0,2,5,7272.0,4,14.0,6.0,3.0
4,26,1,2,2,3,3,2.0,7.0,1,0,2,5,5558.0,4,14.0,6.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,19,0,7,2,2,3,2.0,8.0,0,1,2,1,4894.0,0,3.0,3.0,2.0
4996,26,1,2,2,2,1,4.0,8.0,1,1,2,1,6987.0,3,3.0,3.0,1.0
4997,32,5,9,3,2,3,2.0,5.0,0,2,4,4,4196.0,5,3.0,2.0,1.0
4998,26,1,2,3,2,2,2.0,5.0,1,2,3,4,4560.0,2,3.0,2.0,1.0


In [4]:
def splitting():
    # do not edit the predefined function name
    train1 = t3.drop_duplicates_train()

    # Separate features (X) and target variable (y) from the 'train1' DataFrame
    # For x select the data without stay column, for  y select only stay column


    # Split the data into training and testing sets using train_test_split()

    # splits the feature DataFrame and target variable Series into training and testing datasets.
    # 80 % of the data is used for training and 20 % for testing, with a random seed of 100 for reproducibility.
    X = train1.drop("Stay",axis=1,inplace=False)
    y = train1["Stay"]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=100)
    #Return the training and testing data splits
    return X_train, X_test, y_train, y_test

In [11]:
def model():
    # do not edit the predefined function name
    X_train, X_test, y_train, y_test = splitting()
    import xgboost
    # Create the XGBoost classifier with specified hyperparameters
    global classifier_xgb
    classifier_xgb = xgboost.XGBClassifier(max_depth=4, learning_rate=0.1, n_estimators=800,
                                           objective='multi:softmax', reg_alpha=0.5, reg_lambda=1.5,
                                           booster='gbtree', n_jobs=4, min_child_weight=2, base_score=0.75)

    # Fit the XGBoost classifier to the training data (x_train and y_train)
    xgb = classifier_xgb.fit(X_train,y_train)

    # Make predictions on the x_test data
    y_pred = xgb.predict(X_test)

    # Calculate the accuracy score on the test data
    from sklearn.metrics import accuracy_score
    acc_score_xgb = accuracy_score(y_test,y_pred).round(2)

    # Return the accuracy score rounded to two decimal places
    return acc_score_xgb

In [26]:
def predict():
    # do not edit the predefined function name
    model()
    test1 = t3.drop_duplicates_test()

    # Make predictions on the test data using the trained XGBoost classifier
    pred_xgb = classifier_xgb.predict(test1.iloc[:, 1:])

    # Create a DataFrame to store the predicted 'Stay' values

    # Add the 'case_id' column from the original test data to the result DataFrame

    # Rearrange the columns in the DataFrame to have 'case_id' as the first column and 'Stay' as the second column

    # Map numeric predictions to their corresponding 'Stay' categories using the .replace() method
    # replace the following data in stay column with 0: '0-10', 1: '11-20', 2: '21-30', 3: '31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71-80', 8: '81-90', 9: '91-100', 10: 'More than 100 Days'
    df = pd.DataFrame()
    df["Stay"] = pred_xgb
    df["case_id"] = test1["case_id"]
    df = df[["case_id","Stay"]]
    df["Stay"] = df["Stay"].replace({0: '0-10', 1: '11-20', 2: '21-30', 3: '31-40', 4: '41-50', 5: '51-60', 6: '61-70', 7: '71-80', 8: '81-90', 9: '91-100', 10: 'More than 100 Days'})
    # Return the DataFrame with 'case_id' and predicted 'Stay' values
    return df

In [33]:
result_xgb = predict()

In [43]:
xbg_grp = result_xgb.groupby("Stay")
xbg = xbg_grp.count()
a = pd.Series(index=xbg.index)
a = xbg["case_id"]

In [44]:
a

Stay
0-10                   58
11-20                 380
21-30                 879
31-40                 235
41-50                  28
51-60                 307
61-70                   7
71-80                  46
81-90                  24
91-100                  6
More than 100 Days     30
Name: case_id, dtype: int64