In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score

In [None]:
# Location of the competency-score CSV under the /kaggle/input directory.
csv_path = "/kaggle/input/hr-competency-scores-for-screening/dataset.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

# Checking for Class Imbalance

In [None]:
# Share of each call_for_interview class, drawn as a pie chart.
palette_color = sns.color_palette('pastel')
class_counts = df['call_for_interview'].value_counts()
plt.pie(
    x=class_counts,
    labels=class_counts.index,
    autopct='%.0f%%',
    shadow=True,
    colors=palette_color,
)

In [None]:
def plots(df, t):
    """Draw a two-panel figure for column ``t`` and show it.

    Left panel: bar plot of ``t`` grouped by ``call_for_interview``, with
    each bar labelled by its value. Right panel: histogram of ``t`` with a
    KDE overlay. The figure title is the column name.

    Parameters
    ----------
    df : data passed to seaborn; must contain ``t`` and ``call_for_interview``.
    t : name of the column to plot.
    """
    fig, (bar_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 5))

    # Left: value of `t` per target class, annotated with the bar heights.
    sns.barplot(data=df, x='call_for_interview', y=t, palette='summer', ax=bar_ax)
    for bars in bar_ax.containers:
        bar_ax.bar_label(bars, color='black', size=10, padding=10)

    # Right: overall distribution of `t`.
    sns.histplot(data=df, x=t, ax=hist_ax, kde=True, color='g')

    fig.suptitle(t)
    plt.show()

# Barplots grouped by the call_for_interview feature

In [None]:
# One bar/histogram figure per column, skipping the last column
# (presumably the call_for_interview target -- confirm column order).
for feature in df.columns[:-1]:
    plots(df, feature)

# Broad overview of numerical data using pairplot

In [None]:
# Pairwise plots of every column except the last, coloured by call_for_interview.
feature_cols = df.columns[:-1]
sns.pairplot(data=df, vars=feature_cols, hue='call_for_interview')

# Checking for outliers

In [None]:
# One box plot per column (all but the last) to spot extreme values.
for column in df.columns[:-1]:
    sns.boxplot(data=df, x=column)
    plt.show()

# Removal of detected outliers (values are capped at the 1.5 × IQR fences, no rows are dropped)

In [None]:
def outliers_removal(df, x):
    """Cap outliers of column ``x`` at the 1.5 * IQR fences.

    Despite the name, no rows are dropped: values below ``Q1 - 1.5 * IQR``
    are raised to that bound and values above ``Q3 + 1.5 * IQR`` are lowered
    to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to treat. It is not modified.
    x : str
        Label of a numeric column in ``df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose column ``x`` is clipped to the fences.
    """
    # Series.quantile skips NaN, so missing values do not turn both fences
    # into NaN (which would silently disable the capping).
    q1, q3 = df[x].quantile([0.25, 0.75])
    iqr = q3 - q1
    mn = q1 - 1.5 * iqr
    mx = q3 + 1.5 * iqr

    # Work on a copy so the caller's frame is left untouched; clip also
    # upcasts an integer column cleanly when the fences are fractional.
    capped = df.copy()
    capped[x] = capped[x].clip(lower=mn, upper=mx)
    return capped

In [None]:
# Cap extreme functional_competency_score values at the 1.5 * IQR fences.
df = outliers_removal(df, 'functional_competency_score')

In [None]:
# Re-draw the box plot to check the column after outlier treatment.
sns.boxplot(data=df, x='functional_competency_score')
plt.show()

# Correlation and splitting data for training and testing

In [None]:
# Kendall rank correlation between all columns, shown as an annotated heatmap.
corr = df.corr(method='kendall')
sns.heatmap(data=corr, annot=True)

In [None]:
# Every column but the last becomes the feature matrix; the last column is
# the label vector (presumably call_for_interview -- confirm column order).
feature_frame = df.iloc[:, :-1]
target_series = df.iloc[:, -1]
x = feature_frame.values
y = target_series.values

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Modelling pipeline

In [None]:
def evaluate(model, name, _round=2):
    """Print the test-set accuracy of a fitted classifier as a percentage.

    Predictions are made on the notebook-level ``x_test`` and compared with
    ``y_test``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : str
        Label printed in front of the score.
    _round : int, default 2
        Number of decimals kept in the printed percentage.
    """
    y_pred = model.predict(x_test)
    # scikit-learn's signature is accuracy_score(y_true, y_pred).
    acc = accuracy_score(y_test, y_pred) * 100
    # Honour the requested precision (previously hard-coded to 2).
    acc = round(acc, _round)
    print("{}: {}%".format(name, acc))

In [None]:
def training(model, name):
    """Fit ``model`` on the training split, print its test accuracy, return it.

    Uses the notebook-level ``x_train`` / ``y_train`` for fitting and
    delegates the scoring and printing to ``evaluate``.
    """
    model.fit(x_train, y_train)
    evaluate(model, name, _round=2)
    return model

# Model definitions and hyperparameter tuning

In [None]:
# Seed shared by every estimator that draws random numbers, so that the
# reported accuracies are reproducible across runs (matches the split seed).
MODEL_SEED = 42

lnr = LogisticRegression()
svc = SVC(C=0.5)
lvc = LinearSVC(C=0.5, random_state=MODEL_SEED)
dtc = DecisionTreeClassifier(max_depth=20, criterion='entropy', random_state=MODEL_SEED)
rfc = RandomForestClassifier(max_depth=20, n_estimators=100, criterion='entropy',
                             random_state=MODEL_SEED)
abc = AdaBoostClassifier(n_estimators=60, learning_rate=0.1, random_state=MODEL_SEED)
xgb = XGBClassifier(n_estimators=1000, max_depth=10, eta=0.1, subsample=0.7,
                    colsample_bytree=0.8, random_state=MODEL_SEED)
knn = KNeighborsClassifier(n_neighbors=10)
gnb = GaussianNB()

# `models` and `name` are parallel lists: name[k] is the display label of models[k].
models = [lnr, svc, lvc, dtc, rfc, abc, xgb, knn, gnb]
name = ['Logistic Regression', 'SVC', 'LinearSVC', 'Decision Tree',
        'Random Forest', 'Ada Boost', 'XGBClassifier',
        'KNN', 'Naive Bayes']

# Model training and performance assessment

In [None]:
# Fit each model in turn, print its accuracy, and keep the fitted estimators.
trained = []
for estimator, label in zip(models, name):
    trained.append(training(estimator, label))
    print()