In [None]:
# importing the library
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load the dataset: training features, training labels and the test features
train, label, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '4910797b-ee55-40a7-8668-10efd5c1b960.csv',
        '0bf8bc6e-30d0-4c50-956a-603fc693d966.csv',
        '702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv',
    )
)

In [None]:
# (rows, columns) of the training features
train.shape

In [None]:
# (rows, columns) of the test features
test.shape

In [None]:
# (rows, columns) of the label table
label.shape

In [None]:
# preview the first rows of the label table
label.head()

In [None]:
# preview the first rows of the training features
train.head()

In [None]:
# attach each row's label to its features by joining the two tables on 'id'
data = pd.merge(train, label, on='id')

In [None]:
# column dtypes and non-null counts of the merged frame
data.info()

In [None]:
# fraction of missing values per column in the training data (top 7)
null_rate = data.isna().mean()
null_rate.sort_values(ascending=False).iloc[:7]

In [None]:
# fraction of missing values per column in the test data (top 7)
null_rates = test.isna().mean()
null_rates.sort_values(ascending=False).iloc[:7]

The `scheme_name` column contains the most null values, so let's check how many distinct values it and the other null-containing columns hold.

In [None]:
# distinct-value count for each of the columns inspected for nulls
inspected_columns = [
    'scheme_name',
    'scheme_management',
    'installer',
    'funder',
    'public_meeting',
    'permit',
    'subvillage',
]
data[inspected_columns].nunique()

There are too many distinct values in `scheme_name`, `installer`, `funder`, and `subvillage`, so we drop those columns. The nulls in the remaining columns are filled with the column mode (`permit`, `public_meeting`) or with `'Other'` (`scheme_management`).

In [None]:
# Fill the remaining nulls. The fill values are computed once from the training
# data and reused for the test set so both frames are imputed the same way.
permit_mode = data['permit'].mode()[0]
public_meeting_mode = data['public_meeting'].mode()[0]

data['permit'] = data['permit'].fillna(permit_mode)
data['public_meeting'] = data['public_meeting'].fillna(public_meeting_mode)
data['scheme_management'] = data['scheme_management'].fillna('Other')

# Fill the test set's OWN columns. Assigning data[...] to test[...] would replace
# the test features with index-aligned values taken from the training rows.
test['permit'] = test['permit'].fillna(permit_mode)
test['public_meeting'] = test['public_meeting'].fillna(public_meeting_mode)
test['scheme_management'] = test['scheme_management'].fillna('Other')

In [None]:
# remove the columns with too many distinct values from both frames
high_cardinality = ['scheme_name', 'installer', 'funder', 'subvillage']
data = data.drop(columns=high_cardinality)
test = test.drop(columns=high_cardinality)

In [None]:
# check the frame after dropping scheme_name, installer, funder and subvillage
data.head()

In [None]:
# number of distinct values in each remaining object-dtype column
data.select_dtypes('object').nunique()

In [None]:
# drop wpt_name and ward from the training and the test frame alike
for frame in (data, test):
    frame.drop(columns=['wpt_name', 'ward'], inplace=True)

In [None]:
# Replace date_recorded by its numeric parts (year, month, day-of-month stored
# as 'date'); the same steps are applied to the training and the test frame.
for frame in (data, test):
    recorded = pd.to_datetime(frame['date_recorded'])
    frame['year'] = recorded.dt.year
    frame['month'] = recorded.dt.month
    frame['date'] = recorded.dt.day
    frame.drop(columns='date_recorded', inplace=True)

In [None]:
# check the frame after replacing date_recorded with year/month/date
data.head()

In [None]:
# Class distribution of the target. The column is passed by keyword: in recent
# seaborn releases the first positional argument is `data`, not `x`.
sns.countplot(x='status_group', data=data)

In [None]:
 # the data is imbalanced, so i will do the data oversampling and undersampling
# one sub-frame per target class
func = data.loc[data['status_group'].eq('functional')]
no_func = data.loc[data['status_group'].eq('non functional')]
repair = data.loc[data['status_group'].eq('functional needs repair')]

# show the size of each class
for subset in (func, no_func, repair):
    print(subset.shape)

In [None]:
# Balance the classes to the size of the 'non functional' group:
# 'functional' is undersampled (without replacement) and
# 'functional needs repair' is oversampled (with replacement).
# The target size is derived from the data instead of being hard-coded, and the
# sampling is seeded so a Restart & Run All reproduces the same rows.
n_per_class = len(no_func)
func = func.sample(n_per_class, random_state=42)
repair = repair.sample(n_per_class, replace=True, random_state=42)
# NOTE(review): oversampling before the train/validation split puts copies of the
# same 'repair' rows on both sides of the split, which inflates validation scores.

In [None]:
# confirm the per-class row counts after resampling
for subset in (func, no_func, repair):
    print(subset['status_group'].value_counts())

In [None]:
# stack the three per-class frames row-wise into a single training frame
# (the original index labels are kept, so sampled duplicates repeat their label)
train_data = pd.concat(objs=[func, no_func, repair], axis=0)

In [None]:
# class counts of the combined training frame
train_data['status_group'].value_counts()

In [None]:
# separate the features from the target and encode the target as integers
label_codes = {'functional': 0, 'non functional': 1, 'functional needs repair': 2}
x = train_data.drop(columns='status_group')
y = train_data['status_group'].map(label_codes)

In [None]:
# display the feature frame
x

In [None]:
# distinct encoded label values (a NaN here would mean an unmapped status_group)
y.unique()

In [None]:
# selecting categorical feature for catboost and label encode (other models)
# Take the names of the object-dtype feature columns. np.where() on the frame
# itself returns one column index per truthy cell rather than the column list.
cat_features = x.select_dtypes('object').columns
print(list(cat_features))

In [None]:
# hold out 20% of the rows for validation (fixed seed)
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = split_parts

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Positional indices of the object-dtype columns, passed to CatBoost as cat_features.
# The builtin `object` is used because the `np.object` alias was removed in NumPy 1.24.
categorical_features_indices = np.where(x.dtypes == object)[0]
categorical_features_indices

In [None]:
# build the CatBoost multiclass model and fit it, evaluating on the validation split
catboost_params = dict(
    iterations=200,
    depth=3,
    loss_function='MultiClass',
    learning_rate=0.05,
    train_dir='crossentropy',
    allow_writing_files=False,
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)
model.fit(
    x_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(x_val, y_val),
    plot=True,
)

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score

In [None]:
# predict the class of every row in the validation split
prediction = model.predict(x_val)

In [None]:
# see the accuracy of this model
# scikit-learn metrics take (y_true, y_pred) in that order
print(accuracy_score(y_val, prediction))
# display names in the order of the encoded labels 0, 1, 2
class_names = ['functional','non-functional','needs repair']
# confusion matrix normalized per true class (each row sums to 1)
disp = plot_confusion_matrix(model, x_val, y_val,
                                 display_labels=class_names,
                                 cmap=plt.cm.Blues,
                                 normalize='true')
plt.show()

From the result, we can see that the CatBoost model reaches less than 70% accuracy, so we will now try other models.

In [None]:
# rebuild the un-encoded features and the integer target for the other models
status_names = ['functional', 'non functional', 'functional needs repair']
x = train_data.drop(columns=['status_group'])
y = train_data['status_group'].map(dict(zip(status_names, range(3))))

In [None]:
# import some machine learning models
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [None]:
# names of the object-dtype feature columns, and those columns in the training split
cat_features = x.select_dtypes(include='object').columns
x_train.loc[:, cat_features]

In [None]:
# replace every categorical column by its integer codes
# (the single encoder is re-fit for each column in turn)
le = LabelEncoder()
x = x.assign(**{column: le.fit_transform(x[column]) for column in cat_features})

In [None]:
# split and scale the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# x is already label-encoded by the preceding cell; encoding it a second time
# after the split has been taken has no effect on x_train/x_test, so it is omitted.
# The scaler is fit on the training split only and then applied to the held-out split.
scaler = MinMaxScaler(feature_range=(0,1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# creating models: (display name, unfitted estimator) pairs
# RandomForest is seeded like the bagging model so its results are reproducible.
models = [
    ('RandomForestClassifier',
     RandomForestClassifier(n_estimators=900, max_depth=20, random_state=1)),
    ('BaggingClassifier',
     BaggingClassifier(DecisionTreeClassifier(max_depth=20),
                       n_estimators=500, bootstrap=True, random_state=1)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=9)),
    ('GaussianNB', GaussianNB()),
    ('SVC', SVC()),
    ('XGBClassifier', XGBClassifier()),
    ('LogisticRegression',
     LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000)),
]

In [None]:
# train every candidate model and report its held-out performance
for name, model in models:
    print(name)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)
    print(confusion_matrix(y_test, prediction))
    print(accuracy_score(y_test, prediction))

    # row-normalized confusion matrix for the fitted model
    disp = plot_confusion_matrix(
        estimator=model,
        X=x_test,
        y_true=y_test,
        display_labels=class_names,
        cmap=plt.cm.Blues,
        normalize='true',
    )
    plt.show()
    print('\n')

From the results above, we can see that the Random Forest classifier achieves the highest accuracy, so we will use it to predict the test data.

In [None]:
# encode and scaling the test data
# The test set must use the SAME category -> code mapping as the training
# features. Re-fitting an encoder on the test column assigns codes from the test
# set's own sorted values, which do not line up with the training codes.
for column in cat_features:
    # train_data still holds the un-encoded values that x was encoded from, so
    # fitting on it reproduces the training-time classes (sorted unique values).
    train_classes = LabelEncoder().fit(train_data[column]).classes_
    # categories that never occurred in training get the code -1
    test[column] = pd.Categorical(test[column], categories=train_classes).codes
test = scaler.transform(test)

In [None]:
# predict the test set with the fitted random forest (first entry of `models`)
predictions = dict(models)['RandomForestClassifier'].predict(test)

In [None]:
# creating new dataset that contain only id and the label
# `test` was turned into a scaled array above, so the file is re-read to get the ids
test = pd.read_csv('702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv')
# assign() builds a new frame, which avoids writing into a slice of `test`
# (the cause of SettingWithCopyWarning)
submission = test[['id']].assign(status_group=predictions)
submission.tail()

In [None]:
# translate the integer predictions back into the original status names
code_to_status = {0: 'functional', 1: 'non functional', 2: 'functional needs repair'}
submission = submission.assign(status_group=submission['status_group'].map(code_to_status))

In [None]:
# preview the submission after mapping the codes back to status names
submission.head()

In [None]:
# write the submission to disk without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)