In [146]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [147]:
# Load the subjects-info CSV. Despite its name, `test_df` holds the whole
# table; the train/test split is made from it later in the notebook.
SUBJECTS_CSV = '/kaggle/input/dataset/data_subjects_info.csv'
test_df = pd.read_csv(SUBJECTS_CSV)

# Preview the first five rows.
test_df.head()

In [148]:
# Dimensions of the loaded table as a (rows, columns) tuple.
test_df.shape

In [149]:
# Data type pandas inferred for each column while parsing the CSV.
test_df.dtypes

In [150]:
# Print the concise summary (index range, per-column non-null counts, dtypes,
# memory usage). `info` is a method: without the call parentheses the cell
# only shows the bound-method repr, not the summary.
test_df.info()

In [151]:
# True if at least one cell anywhere in the table is missing (NaN/None).
test_df.isna().to_numpy().any()

In [152]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_mask = test_df.duplicated()
test_df[duplicate_mask]

In [153]:
# Tally how many rows are flagged as duplicates (True) versus not (False).
is_duplicate = test_df.duplicated()
is_duplicate.value_counts()

In [154]:
# pandas' default summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the table.
test_df.describe()

In [155]:
# Pairwise correlation matrix of the columns (Pearson by default).
# `corr` must be called: the bare attribute only displays the bound-method
# repr and computes nothing.
test_df.corr()

In [156]:
import matplotlib.pyplot as plt # visualisasi data
import seaborn as sns # visualisasi data

# render plot output inline in the notebook
%matplotlib inline 

In [157]:
# Colour-coded view of the pairwise column correlations.
correlations = test_df.corr()
sns.heatmap(correlations)

In [158]:
# Number of rows for each distinct value of the `code` column, most frequent first.
code_counts = test_df['code'].value_counts()
code_counts

In [159]:
# Bar chart of how many rows carry each `code` value.
test_df['code'].value_counts().plot(kind='bar')
plt.tight_layout()
plt.show()

In [160]:
# Same per-`code` row counts, drawn with seaborn (bars in category order
# rather than sorted by frequency).
sns.countplot(x='code', data=test_df)
plt.tight_layout()

In [161]:
# Share of rows per `code` value as a pie chart. Wedge labels are turned off
# and a legend is shown instead; each wedge is annotated with its percentage.
test_df['code'].value_counts().plot(
    kind='pie',
    autopct='%1.1f%%',
    labels=None,
    legend=True,
)
plt.tight_layout()

In [162]:
# Line plot of each attribute against the row index, one panel per column.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# `ax.flat` walks the 2x2 grid row by row, so the panels are filled in the
# order weight, height (top row) then age, gender (bottom row).
for panel, column in zip(ax.flat, ['weight', 'height', 'age', 'gender']):
    test_df[column].plot.line(ax=panel)
    panel.set_title(column)

# Keep titles and tick labels of neighbouring panels from overlapping, as the
# other plotting cells do, and end on show() so no stray repr is displayed.
plt.tight_layout()
plt.show()

In [163]:
# All numeric columns drawn as lines on a single shared axis (x = row index).
test_df.plot.line()
plt.tight_layout()

In [164]:
# One 10-bin histogram per numeric column, laid out on a 6x6 inch figure.
test_df.hist(bins=10, figsize=(6, 6))
plt.tight_layout()

In [165]:
# Box plot of every numeric column side by side on one shared axis.
test_df.boxplot()
# Tighten the layout so the tick labels are not clipped.
plt.tight_layout()

In [166]:
# Box plots of the remaining numeric columns, grouped by the `code` value,
# on an 8x8 inch figure.
test_df.boxplot(figsize=(8, 8), by='code')
plt.tight_layout()

In [167]:
# Scatter of gender against age, with points coloured by `code`.
sns.scatterplot(data=test_df, x='age', y='gender', hue='code')
plt.tight_layout()

In [168]:
# Grid of pairwise relationships between the columns, coloured by `code`
# and drawn with '+' markers.
sns.pairplot(data=test_df, hue='code', markers='+')
plt.tight_layout()

In [169]:
# Distribution of `code` for each gender value; the inner lines mark the quartiles.
sns.violinplot(x='gender', y='code', data=test_df, inner='quartile')
plt.tight_layout()

In [170]:
from sklearn.model_selection import train_test_split # pembagi dataset menjadi training dan testing set
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report # evaluasi performa model

In [171]:
# Feature matrix: every column except the target, `gender`.
X = test_df.drop(columns=['gender'])

# Preview the first five rows.
X.head()

In [172]:
# Target vector: the class label (`gender`) the models will predict.
y = test_df.loc[:, 'gender']

# Preview the first five labels.
y.head()

In [173]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=10
)

# Report the size of each partition (features first, then labels).
print('training dataset', X_train.shape, y_train.shape, sep='\n')
print()
print('testing dataset:', X_test.shape, y_test.shape, sep='\n')

**KNN**

In [174]:
from sklearn.neighbors import KNeighborsClassifier

In [175]:
# Sweep k = 1..14 and record the test-set accuracy obtained for each value.
# NOTE(review): k is compared on the same split later used for the final
# evaluation, so the reported accuracy is optimistic; consider a validation
# split or cross-validation.
k_range = list(range(1, 15))
scores = []
for k in k_range:
    # fit() returns the estimator itself, so configure and train in one step
    model_knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = model_knn.predict(X_test)  # predict the held-out rows
    scores.append(accuracy_score(y_test, y_pred))  # score this k

In [176]:
# Accuracy as a function of k, to pick the neighbourhood size by eye.
plt.plot(k_range, scores)
plt.gca().set(
    xlabel='Value of k for KNN',
    ylabel='Accuracy Score',
    title='Accuracy Scores for Values of k of k-Nearest-Neighbors',
)
plt.tight_layout()
plt.show()

In [177]:
# Final KNN classifier with k=3: configure and train in one step
# (fit() returns the estimator itself).
model_knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model_knn.predict(X_test)

In [178]:
# Fraction of held-out rows whose label was predicted correctly, for the
# predictions currently stored in `y_pred`.
knn_accuracy = accuracy_score(y_test, y_pred)
print(knn_accuracy)

**Confusion Matrix**

In [179]:
# Confusion matrix for the predictions in `y_pred`: rows are the true
# classes, columns are the predicted classes.
knn_confusion = confusion_matrix(y_test, y_pred)
print(knn_confusion)

**Classification Report**

In [180]:
# Per-class precision, recall, F1 and support for the predictions in `y_pred`.
knn_report = classification_report(y_test, y_pred)
print(knn_report)

**Support Vector Classifier**

In [181]:
from sklearn.svm import SVC

In [182]:
# Support vector classifier with the kernel coefficient set to 'scale';
# configure and train in one step (fit() returns the estimator itself).
model_svc = SVC(gamma='scale').fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model_svc.predict(X_test)

In [183]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

In [184]:
# Decision tree classifier. The seed is fixed because scikit-learn permutes
# the candidate features at every split, so without it the fitted tree (and
# the accuracy reported later) can change from one run to the next.
# 10 matches the seed already used for train_test_split.
model_dt = DecisionTreeClassifier(random_state=10)
model_dt.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model_dt.predict(X_test)

In [185]:
# Random Forest Classifier (RFC)

from sklearn.ensemble import RandomForestClassifier

In [186]:
# Random forest of 100 trees. The seed is fixed because the bootstrap samples
# and per-split feature subsets are drawn at random; without it the forest
# and its accuracy differ on every run. 10 matches the seed already used for
# train_test_split.
model_rf = RandomForestClassifier(n_estimators=100, random_state=10)
model_rf.fit(X_train, y_train)

# Predict the held-out rows.
pred_rf = model_rf.predict(X_test)

In [187]:
# Score every fitted classifier on the same held-out split. The order here
# fixes the order of the entries in `accuracy_scores`.
models = [model_knn, model_svc, model_dt, model_rf]
accuracy_scores = []
for model in models:
    y_pred = model.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

print(accuracy_scores)

In [188]:
# Bar chart comparing the held-out accuracy of the four models. The labels
# must stay in the same order as the entries of `accuracy_scores`.
plt.bar(['KNN', 'SVC', 'DT', 'RF'], accuracy_scores)

# Zoom in on the 0.90-1.00 band when every model is above it; otherwise lower
# the floor so a bar at or below 0.90 is not clipped out of view.
lowest_score = min(accuracy_scores)
y_lower = 0.90 if lowest_score > 0.90 else max(0.0, lowest_score - 0.05)
plt.ylim(y_lower, 1.01)

plt.title('Accuracy comparision for various models', fontsize=15, color='r')
plt.xlabel('Models', fontsize=18, color='g')
plt.ylabel('Accuracy Score', fontsize=18, color='g')
plt.tight_layout()
plt.show()