In [1]:
#For timestamp
from datetime import datetime
import re

import numpy as np
import pandas as pd

#For save and load pickle
import joblib
from joblib import Parallel, delayed

#Library for training
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

## Load Data

In [2]:
# KPU voter data: read the CSV and discard every row that has a missing value.
data = pd.read_csv('../Dataset/data-pemilih-kpu.csv').dropna()
data

Unnamed: 0,nama,jenis_kelamin
0,ERWIN TJAHJONO,Laki-Laki
1,DAVIANDRIE ANDIKA BAHROENY,Laki-Laki
2,ELAN KURNIAWAN,Laki-Laki
3,AYU DWI CAHYANING MUKTI,Perempuan
4,WAHYOEDIN,Laki-Laki
...,...,...
13132,HERMANSYAH,Laki-Laki
13133,SITA.HJ,Perempuan
13134,MASNI TAMBUNAN,Perempuan
13135,MARJANEDI,Laki-Laki


In [3]:
# Brand data: one brand per line of the text file, lower-cased on read.
with open('../Dataset/brand.txt', 'r') as brand_file:
    brand = brand_file.read().lower().splitlines()

# Every brand row is labelled 'Lainnya' so it shares the label column with the voter data.
df_brand = pd.DataFrame({'nama': brand}).assign(jenis_kelamin='Lainnya')
df_brand

Unnamed: 0,nama,jenis_kelamin
0,axe,Lainnya
1,casablanca,Lainnya
2,gatsby splash cologne,Lainnya
3,bellagio,Lainnya
4,eskulin cologne gel,Lainnya
...,...,...
2278,asturo,Lainnya
2279,crayola,Lainnya
2280,boxy,Lainnya
2281,mirage,Lainnya


## Preprocessing

In [4]:
# Stack the voter rows and the brand rows into one frame with a fresh 0..n-1 index.
df_concat = pd.concat([data, df_brand], ignore_index=True)


def _clean_name(raw):
    """Replace every character that is not an ASCII letter with a space, then lower-case."""
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return str(letters_only).lower()


df_concat['nama'] = df_concat['nama'].apply(_clean_name)

# Class balance of the combined data set.
df_concat['jenis_kelamin'].value_counts()

Perempuan    6788
Laki-Laki    6162
Lainnya      2283
Name: jenis_kelamin, dtype: int64

## Train Test Split

In [5]:
# Inputs are the cleaned names; targets are the 'jenis_kelamin' labels.
x, y = df_concat['nama'], df_concat['jenis_kelamin']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Label distribution of the training portion.
y_train.value_counts()

Perempuan    5367
Laki-Laki    4945
Lainnya      1874
Name: jenis_kelamin, dtype: int64

## SVM Model

In [6]:
# Train and evaluate the SVM pipeline:
#   character 1- to 5-gram counts -> term-frequency weighting (IDF disabled) -> SVC.
# The two timestamps bracket fit + predict so the training cost is visible in the output.
print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))

objs = [("vect", CountVectorizer(analyzer='char',ngram_range=(1, 5))),
        ("tfidf", TfidfTransformer(use_idf=False)),
        # C is 10**0.5 and gamma is 10**-0.5 (presumably picked from a log-spaced search — TODO confirm).
        # probability=True enables predict_proba; random_state pins the data shuffling used
        # for those probability estimates so the confidence analysis is reproducible.
        ("svm", SVC(C=3.1622776601683795,gamma=0.31622776601683794,probability=True,random_state=42))]
pipe = Pipeline(objs)
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
accuracy = 100.0 * accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))
# Pass labels together with target_names so each report row is tied to the class it names,
# even if a class happens to be absent from the test split.
print(classification_report(y_test, y_pred, labels=pipe.classes_, target_names=pipe.classes_))

Thursday 11 August 2022 11:13:58
Accuracy:  90.31834591401379
Thursday 11 August 2022 11:16:29
              precision    recall  f1-score   support

     Lainnya       0.86      0.75      0.80       409
   Laki-Laki       0.89      0.92      0.90      1217
   Perempuan       0.92      0.94      0.93      1421

    accuracy                           0.90      3047
   macro avg       0.89      0.87      0.88      3047
weighted avg       0.90      0.90      0.90      3047



In [7]:
# Confidence analysis for the SVM pipeline.
# `acc` holds one row per test sample: the probability of each class, the predicted label,
# the actual label and the highest class probability.
# NOTE: `y_pred` must still hold the SVM predictions from the SVM training cell.
y_proba = pipe.predict_proba(x_test.tolist())
acc = pd.DataFrame(y_proba, columns=pipe.classes_)
acc["pred"] = y_pred
acc["actual"] = y_test.tolist()

# Highest class probability per sample, computed in one vectorised call.
acc["higher_prob"] = y_proba.max(axis=1)

# For each threshold, look at the samples whose top probability is at or below it
# (cumulative buckets; `<=` matches the Random Forest and Naive Bayes analyses) and
# count how many of those predictions were right.
is_correct = acc["pred"] == acc["actual"]
rows = []
for i in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    in_bucket = acc["higher_prob"] <= i
    total = int(in_bucket.sum())
    true_value = int((is_correct & in_bucket).sum())
    false_value = total - true_value
    rows.append({'range': i, 'total_name': total, 'true': true_value, 'false': false_value,
                 # An empty bucket reports 0 rather than dividing by zero.
                 'accuracy': true_value/total if total else 0.0})

# Build the table once instead of concatenating a one-row frame per threshold.
confidence = pd.DataFrame(rows, columns=['range', 'total_name', 'true', 'false', 'accuracy'])
confidence

Unnamed: 0,range,total_name,true,false,accuracy
0,0.1,0,0,0,0.0
1,0.2,0,0,0,0.0
2,0.3,0,0,0,0.0
3,0.4,4,3,1,0.75
4,0.5,44,17,27,0.386364
5,0.6,171,81,90,0.473684
6,0.7,316,169,147,0.53481
7,0.8,503,310,193,0.616302
8,0.9,798,561,237,0.703008
9,1.0,3047,2752,295,0.903183


In [8]:
# Overall ratio of correct predictions to names, both summed over every row of `confidence`.
correct_total = confidence['true'].sum()
names_total = confidence['total_name'].sum()
print("Confidence keseluruhan {0}".format(correct_total/names_total))

Confidence keseluruhan 0.7972557853778415


## Random Forest Model

In [9]:
# Train and evaluate the Random Forest pipeline:
#   character 1- to 5-gram counts -> term-frequency weighting (IDF disabled) -> random forest.
# The two timestamps bracket fit + predict so the training cost is visible in the output.
print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))

objs = [("vect", CountVectorizer(analyzer='char',ngram_range=(1, 5))),
        ("tfidf", TfidfTransformer(use_idf=False)),
        ("forest", RandomForestClassifier(random_state=0))]
pipe_f = Pipeline(objs)
pipe_f.fit(x_train,y_train)
y_pred = pipe_f.predict(x_test)
accuracy = 100.0 * accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))
# Use this pipeline's own classes (not the SVM pipeline's) so the cell does not depend on
# another model's cell having run; labels ties each report row to the class it names.
print(classification_report(y_test, y_pred, labels=pipe_f.classes_, target_names=pipe_f.classes_))

Thursday 11 August 2022 11:21:14
Accuracy:  88.38201509681654
Thursday 11 August 2022 11:21:40
              precision    recall  f1-score   support

     Lainnya       0.82      0.75      0.78       409
   Laki-Laki       0.87      0.90      0.89      1217
   Perempuan       0.91      0.91      0.91      1421

    accuracy                           0.88      3047
   macro avg       0.87      0.85      0.86      3047
weighted avg       0.88      0.88      0.88      3047



In [10]:
# Confidence analysis for the Random Forest pipeline.
# `acc` holds one row per test sample: the probability of each class, the predicted label,
# the actual label and the highest class probability.
# NOTE: `y_pred` must still hold the Random Forest predictions from the Random Forest training cell.
y_proba = pipe_f.predict_proba(x_test.tolist())
# Column names come from the pipeline that produced the probabilities, so their order matches.
acc = pd.DataFrame(y_proba, columns=pipe_f.classes_)
acc["pred"] = y_pred
acc["actual"] = y_test.tolist()

# Highest class probability per sample, computed in one vectorised call.
acc["higher_prob"] = y_proba.max(axis=1)

# For each threshold, look at the samples whose top probability is at or below it
# (cumulative buckets) and count how many of those predictions were right.
is_correct = acc["pred"] == acc["actual"]
rows = []
for i in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    in_bucket = acc["higher_prob"] <= i
    total = int(in_bucket.sum())
    true_value = int((is_correct & in_bucket).sum())
    false_value = total - true_value
    rows.append({'range': i, 'total_name': total, 'true': true_value, 'false': false_value,
                 # An empty bucket reports 0 rather than dividing by zero.
                 'accuracy': true_value/total if total else 0.0})

# Build the table once instead of concatenating a one-row frame per threshold.
confidence = pd.DataFrame(rows, columns=['range', 'total_name', 'true', 'false', 'accuracy'])
confidence

Unnamed: 0,range,total_name,true,false,accuracy
0,0.1,0,0,0,0.0
1,0.2,0,0,0,0.0
2,0.3,0,0,0,0.0
3,0.4,40,18,22,0.45
4,0.5,344,192,152,0.55814
5,0.6,773,512,261,0.662354
6,0.7,1229,913,316,0.74288
7,0.8,1762,1424,338,0.808173
8,0.9,2379,2030,349,0.8533
9,1.0,3047,2693,354,0.88382


In [11]:
# Overall ratio of correct predictions to names, both summed over every row of `confidence`.
correct_total = confidence['true'].sum()
names_total = confidence['total_name'].sum()
print("Confidence keseluruhan {0}".format(correct_total/names_total))

Confidence keseluruhan 0.8128264048464592


## Naive Bayes Model

In [12]:
# Train and evaluate the Naive Bayes pipeline:
#   character 1- to 5-gram counts -> term-frequency weighting (IDF disabled) -> Bernoulli NB.
# The two timestamps bracket fit + predict so the training cost is visible in the output.
print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))

objs = [("vect", CountVectorizer(analyzer='char',ngram_range=(1, 5))),
        ("tfidf", TfidfTransformer(use_idf=False)),
        ("NB", BernoulliNB())]
pipe_nb = Pipeline(objs)
pipe_nb.fit(x_train,y_train)
y_pred = pipe_nb.predict(x_test)
accuracy = 100.0 * accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

print(datetime.now().strftime("%A %d %B %Y %H:%M:%S"))
# Use this pipeline's own classes (not the SVM pipeline's) so the cell does not depend on
# another model's cell having run; labels ties each report row to the class it names.
print(classification_report(y_test, y_pred, labels=pipe_nb.classes_, target_names=pipe_nb.classes_))

Thursday 11 August 2022 11:22:31
Accuracy:  86.24876928126025
Thursday 11 August 2022 11:22:32
              precision    recall  f1-score   support

     Lainnya       0.91      0.55      0.68       409
   Laki-Laki       0.88      0.88      0.88      1217
   Perempuan       0.84      0.94      0.89      1421

    accuracy                           0.86      3047
   macro avg       0.88      0.79      0.82      3047
weighted avg       0.87      0.86      0.86      3047



In [13]:
# Confidence analysis for the Naive Bayes pipeline.
# `acc` holds one row per test sample: the probability of each class, the predicted label,
# the actual label and the highest class probability.
# NOTE: `y_pred` must still hold the Naive Bayes predictions from the Naive Bayes training cell.
y_proba = pipe_nb.predict_proba(x_test.tolist())
# Column names come from the pipeline that produced the probabilities, so their order matches.
acc = pd.DataFrame(y_proba, columns=pipe_nb.classes_)
acc["pred"] = y_pred
acc["actual"] = y_test.tolist()

# Highest class probability per sample, computed in one vectorised call.
acc["higher_prob"] = y_proba.max(axis=1)

# For each threshold, look at the samples whose top probability is at or below it
# (cumulative buckets) and count how many of those predictions were right.
is_correct = acc["pred"] == acc["actual"]
rows = []
for i in [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]:
    in_bucket = acc["higher_prob"] <= i
    total = int(in_bucket.sum())
    true_value = int((is_correct & in_bucket).sum())
    false_value = total - true_value
    rows.append({'range': i, 'total_name': total, 'true': true_value, 'false': false_value,
                 # An empty bucket reports 0 rather than dividing by zero.
                 'accuracy': true_value/total if total else 0.0})

# Build the table once instead of concatenating a one-row frame per threshold.
confidence = pd.DataFrame(rows, columns=['range', 'total_name', 'true', 'false', 'accuracy'])
confidence

Unnamed: 0,range,total_name,true,false,accuracy
0,0.1,0,0,0,0.0
1,0.2,0,0,0,0.0
2,0.3,0,0,0,0.0
3,0.4,0,0,0,0.0
4,0.5,9,3,6,0.333333
5,0.6,53,27,26,0.509434
6,0.7,93,45,48,0.483871
7,0.8,155,72,83,0.464516
8,0.9,246,106,140,0.430894
9,1.0,3047,2628,419,0.862488


In [14]:
# Overall ratio of correct predictions to names, both summed over every row of `confidence`.
correct_total = confidence['true'].sum()
names_total = confidence['total_name'].sum()
print("Confidence keseluruhan {0}".format(correct_total/names_total))

Confidence keseluruhan 0.7996114349153484


## Load the Model

In [None]:
# Path of the serialized model to load. Change the value to point at a different model file.
file = "../Model/gender_prediction_pipeline_3_gender.pkl"
# NOTE: joblib.load unpickles the file, which can execute arbitrary code; only load trusted files.
model = joblib.load(filename=file)

## Retrain the Model

In [None]:
# Template cell: replace the `___` placeholder with a (features, labels) pair before running.
# This rebinds x_train / y_train, replacing the values produced by the train/test split cell.
# NOTE(review): in IPython `___` also names a recent cell output, so an unedited run may not
# fail with a NameError — confirm the placeholder has been replaced.
x_train , y_train = ___ #Fill the value
# `model` is the object loaded in the "Load the Model" cell.
# NOTE(review): presumably a scikit-learn pipeline, in which case fit() retrains from scratch
# rather than continuing from the saved state — TODO confirm.
model.fit(x_train,y_train)

## Export the Model

In [None]:
# Destination path for the serialized model. As written it is the same path the load cell
# reads from, so exporting overwrites that file unless the value is changed.
file = "../Model/gender_prediction_pipeline_3_gender.pkl" #Change the value
# Template: replace the `___` placeholder with the fitted object to persist before running.
joblib.dump(___, file) #Fill the value