## Overview Data

In [4]:
#import library penting
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
#load dataset
data = pd.read_csv('readyForModeling.csv')
data.head()

Unnamed: 0,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,cuaca
0,24.3,33.2,27.1,90.0,1.9,2.8,4.0,300.0,1.0,4,2
1,24.3,29.8,26.5,91.0,24.3,2.7,3.0,130.0,0.0,8,2
2,24.0,34.0,27.9,86.0,3.4,0.4,3.0,70.0,1.0,8,2
3,24.2,34.2,28.4,82.0,0.0,4.2,3.0,300.0,1.0,8,1
4,24.2,32.2,27.4,88.0,0.0,7.7,4.0,280.0,1.0,8,1


## EDA | Exploratory Data Analysis

Struktur dan karakteristik dari DataFrame Train dan Test

In [6]:
data2 = data.copy()

In [7]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342 entries, 0 to 341
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Tn       342 non-null    float64
 1   Tx       342 non-null    float64
 2   Tavg     342 non-null    float64
 3   RH_avg   342 non-null    float64
 4   RR       342 non-null    float64
 5   ss       342 non-null    float64
 6   ff_x     342 non-null    float64
 7   ddd_x    342 non-null    float64
 8   ff_avg   342 non-null    float64
 9   ddd_car  342 non-null    int64  
 10  cuaca    342 non-null    int64  
dtypes: float64(9), int64(2)
memory usage: 29.5 KB


In [8]:
data2.isnull().sum()

Tn         0
Tx         0
Tavg       0
RH_avg     0
RR         0
ss         0
ff_x       0
ddd_x      0
ff_avg     0
ddd_car    0
cuaca      0
dtype: int64

In [9]:
data2.head()

Unnamed: 0,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car,cuaca
0,24.3,33.2,27.1,90.0,1.9,2.8,4.0,300.0,1.0,4,2
1,24.3,29.8,26.5,91.0,24.3,2.7,3.0,130.0,0.0,8,2
2,24.0,34.0,27.9,86.0,3.4,0.4,3.0,70.0,1.0,8,2
3,24.2,34.2,28.4,82.0,0.0,4.2,3.0,300.0,1.0,8,1
4,24.2,32.2,27.4,88.0,0.0,7.7,4.0,280.0,1.0,8,1


## Splitting

In [10]:
X = data2.drop('cuaca',axis=1)
y = data2['cuaca']

print(X.shape)
print(y.shape)

(342, 10)
(342,)


In [11]:
X

Unnamed: 0,Tn,Tx,Tavg,RH_avg,RR,ss,ff_x,ddd_x,ff_avg,ddd_car
0,24.3,33.2,27.1,90.0,1.9,2.8,4.0,300.0,1.0,4
1,24.3,29.8,26.5,91.0,24.3,2.7,3.0,130.0,0.0,8
2,24.0,34.0,27.9,86.0,3.4,0.4,3.0,70.0,1.0,8
3,24.2,34.2,28.4,82.0,0.0,4.2,3.0,300.0,1.0,8
4,24.2,32.2,27.4,88.0,0.0,7.7,4.0,280.0,1.0,8
...,...,...,...,...,...,...,...,...,...,...
337,24.7,33.0,27.7,89.0,0.0,8.2,3.0,70.0,1.0,8
338,24.2,32.1,27.1,90.0,21.0,4.5,4.0,330.0,1.0,8
339,24.2,32.5,26.6,90.0,1.3,2.9,5.0,170.0,1.0,8
340,23.6,32.8,26.8,92.0,13.1,3.2,4.0,240.0,1.0,8


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((273, 10), (69, 10), (273,), (69,))

## Training

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [15]:
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder())
])

In [16]:
from sklearn.compose import ColumnTransformer

In [104]:
preprocessor = ColumnTransformer([
    ('numeric', numerical_pipeline, ['Tn', 'Tx', 'Tavg', 'RH_avg', 'RR', 'ss', 'ff_x', 'ddd_x', 'ff_avg']),
    ('categoric', categorical_pipeline, ['AmbilCuti'])
])

## Pipeline

In [105]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [106]:
pipeline = Pipeline([
    ('prep', preprocessor),
    ('algo', GaussianNB()),
])

In [107]:
pipeline.fit(X_train, y_train)

In [108]:
accuracies_nbc2 = cross_val_score(pipeline, X = X_test, y = y_test, cv = 4)
print("Nilai Akurasi Naive Bayes Classifier 4-Fold: {:.2f}%".format(accuracies_nbc2.mean() * 100))

Nilai Akurasi Naive Bayes Classifier 4-Fold: 91.07%


In [109]:
list_akurasi = list(accuracies_nbc2 * 100)
list_akurasi

[100.0, 78.57142857142857, 85.71428571428571, 100.0]

In [110]:
list_akurasi2 = []
for i in list_akurasi:
    list_akurasi2.append(round(i, 2))

list_akurasi2

[100.0, 78.57, 85.71, 100.0]

In [111]:
np.array(list_akurasi2).mean() # Hasilnya sama

91.07

In [112]:
# Confusion Matrix

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

y_pred = cross_val_predict(pipeline, X_test, y_test, cv = 4)
conf_mat = confusion_matrix(y_test, y_pred)

In [113]:
conf_mat

array([[28,  4],
       [ 1, 22]], dtype=int64)

In [114]:
(28 + 22) / (28 + 22 + 1 + 4) *100

90.9090909090909

In [115]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.88      0.92        32
           1       0.85      0.96      0.90        23

    accuracy                           0.91        55
   macro avg       0.91      0.92      0.91        55
weighted avg       0.92      0.91      0.91        55



## Testing

In [116]:
data2.columns

Index(['JmlNonAktif', 'AmbilCuti', 'TotalTunggakan', 'IPKAkhir', 'SKSLulus',
       'SemesterTerakhir', 'LabelMahasiswa'],
      dtype='object')

In [117]:
testing = {'JmlNonAktif': [8],
           'AmbilCuti': [0],
           'TotalTunggakan': [0],
           'IPKAkhir': [2.98],
           'SKSLulus': [144],
           'SemesterTerakhir': [10]
          }

testing = pd.DataFrame(testing)
testing

Unnamed: 0,JmlNonAktif,AmbilCuti,TotalTunggakan,IPKAkhir,SKSLulus,SemesterTerakhir
0,8,0,0,2.98,144,10


In [118]:
pred_coba = pipeline.predict(testing)
print('Status Mahasiswa:', pred_coba)

Status Mahasiswa: [1]


## Save Model

In [119]:
import pickle

pickle.dump(pipeline,open('NBC.pkl','wb'))