# Binar Study Case Regression

## Data
Baldness Probability

## About
The dataset contains people profile and their probabilities to baldness.

## Dataset Dictionary

* **age** = Age of people in data
* **gender** = Male or female
* **job_role** = Job roles of people who in data
* **province** = Data of provinces in Indonesia where people come from
* **salary** = salary each month
* **is_married** = Married status (1 = Yes and 0 = No)
* **is_hereditary** = Is the bald based from hereditary? (1 = Yes and 0 = No)
* **weight** = weight of people body in data
* **height** = height of people body in data
* **shampoo** = Shampoo brand which people used in Indonesia
* **is_smoker** = Is the people a smoker? (1 = Yes and 0 = No)
* **education** = Education level of people
* **stress** = Stress level of people in range 1 (lower) to 10 (highest)
* **bald_prob** = Probability score of bald which occur to people in range 0 to 1. The higher probability score means the * **higher** probability to baldness occurred and vice versa.

In [12]:
# Import Library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import (
    LabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler,
)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [4]:
df = pd.read_csv('/content/Case Study Chapter 3 Topic 3_ Dataset Kebotakan (Not Clean).csv')
df.head()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,provinsi,gaji,is_menikah,is_keturunan,berat,tinggi,sampo,is_merokok,pendidikan,stress,botak_prob
0,27.0,Perempuan,PNS,Bengkulu,7957453.0,1.0,0.0,54.315053,170.428542,Pantone,1.0,S1,5.0,0.605974
1,53.0,Perempuan,PNS,Bandung,7633003.0,1.0,0.0,72.873404,165.530097,Pantone,0.0,S1,7.0,0.53286
2,37.0,Perempuan,Pegawai swasta,Bandung,6637625.0,1.0,0.0,46.321533,154.599388,Moonsilk,0.0,S1,4.0,0.418442
3,36.0,Perempuan,Pengangguran,Palu,3624871.0,1.0,0.0,51.539781,167.340481,Deadbuoy,1.0,SD,9.0,0.80405
4,38.0,Laki-laki,Freelance,Palangkaraya,6031808.0,1.0,0.0,60.726909,165.514773,Merpati,1.0,S2,1.0,0.368371


In [26]:
X = df.drop(columns='botak_prob')
y = df['botak_prob']

botak_train_X, botak_test_X, botak_train_y, botak_test_y = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# Print Training test composition
print(f"Training X shape: {botak_train_X.shape}")
print(f"Testing X shape: {botak_test_X.shape}")

Training X shape: (6333, 13)
Testing X shape: (1584, 13)


In [27]:
mean_minmax_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])
mean_imputer_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])
median_stdscaler_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
numeric_constant_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
])
categorical_constant_labelencoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='TidakTahu')),
    ('encoder', OrdinalEncoder())
])
categorical_constant_onehotencoder_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='TidakTahu')),
    ('encoder', OneHotEncoder())
])
preprocessor_pipeline = ColumnTransformer([
    ('mean_minmax', mean_minmax_pipeline, ['umur']),
    ('mean_imputer', mean_imputer_pipeline, ['stress']),
    ('median_stdscaler', median_stdscaler_pipeline, ['gaji','berat', 'tinggi']),
    ('numeric_constant', numeric_constant_pipeline, ['is_menikah', 'is_keturunan', 'is_merokok']),
    ('categorical_constant_labelencoder', categorical_constant_labelencoder_pipeline, ['pekerjaan', 'provinsi', 'sampo', 'pendidikan']),
    ('categorical_constant_onehotencoder', categorical_constant_onehotencoder_pipeline, ['jenis_kelamin']),
])

In [28]:
# Fit Pipeline
preprocessor_pipeline.fit(botak_train_X)

# Transform Data
botak_train_X_preprocessed = preprocessor_pipeline.transform(botak_train_X)
botak_test_X_preprocessed = preprocessor_pipeline.transform(botak_test_X)

In [29]:
preprocessor_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

In [30]:
# Fit Pipeline
preprocessor_pipeline.fit(botak_train_X)
label_botak_pipeline.fit(botak_train_y.values.reshape(-1, 1))

# Transform Data
botak_train_X_preprocessed = preprocessor_pipeline.transform(botak_train_X)
botak_test_X_preprocessed = preprocessor_pipeline.transform(botak_test_X)
botak_train_y_preprocessed = label_botak_pipeline.transform(botak_train_y.values.reshape(-1, 1)).flatten()

ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Laki-laki'

In [16]:
label_botak_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

In [17]:
with open('botak_preprocessor_pipeline.pkl', 'wb') as file:
    pickle.dump(preprocessor_pipeline, file)

NameError: name 'pickle' is not defined

In [None]:
linreg = LinearRegression(fit_intercept=True)
svm = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)

# Fit Model
linreg.fit(botak_train_X_preprocessed, botak_train_y_preprocessed)
svm.fit(botak_train_X_preprocessed, botak_train_y_preprocessed)

In [None]:
# Predict
linreg_test_y_pred = linreg.predict(botak_test_X_preprocessed)
svr_test_y_pred = svm.predict(botak_test_X_preprocessed)

print(f"R2 Score Linear Regression: {r2_score(botak_test_y_preprocessed, linreg_test_y_pred)}")
print(f"Mean Squared Error Linear Regression: {mean_squared_error(botak_test_y_preprocessed, linreg_test_y_pred)}")
print(f"Mean Absolute Error Linear Regression: {mean_absolute_error(botak_test_y_preprocessed, linreg_test_y_pred)}")
print("=======")
print(f"R2 Score SVR: {r2_score(botak_test_y_preprocessed, svr_test_y_pred)}")
print(f"Mean Squared Error SVR: {mean_squared_error(botak_test_y_preprocessed, svr_test_y_pred)}")
print(f"Mean Absolute Error SVR: {mean_absolute_error(botak_test_y_preprocessed, svr_test_y_pred)}")

In [None]:
with open('botak_linreg.pkl', 'wb') as file:
    pickle.dump(linreg, file)