In [33]:
def sigmoid(z):  # 0~1
    return 1 / (1+np.exp(-z))

In [34]:
def loss_func(x, t):
    delta = 1e-7
    z = np.dot(x, W) + b
    y = sigmoid(z)
    return -np.sum(t * np.log(y+delta) + (1-t)*np.log((1 - y) + delta))

In [35]:
def predict(test_data):
    z = np.dot(test_data, W) + b
    y = sigmoid(z)
    if y >= 0.5:
        result = 1
    else:
        result = 0
        
    return y, result

In [36]:
def numerical_derivative(f, x):
    delta_x = 1e-4
    grad = np.zeros_like(x)

    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])

    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + delta_x
        fx1 = f(x)

        x[idx] = float(tmp_val) - delta_x
        fx2 = f(x)

        grad[idx] = (fx1 - fx2) / (2*delta_x)
        x[idx] = tmp_val
        it.iternext()
    return grad

In [37]:
import numpy as np
from datetime import datetime

x_data = np.array([2, 4, 6, 8, 10, 12, 14, 16, 18, 20]).reshape(-1, 1)
t_data = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1]).reshape(-1, 1)

W = np.random.rand(1, 1)
b = np.random.rand(1)

print("x_data.shape = ", x_data.shape, "\nt_data.shape = ", t_data.shape)
print("W = ", W, ", W.shape = ", W.shape, ", b = ", b, ", b.shape = ", b.shape)

x_data.shape =  (10, 1) 
t_data.shape =  (10, 1)
W =  [[0.72919143]] , W.shape =  (1, 1) , b =  [0.61174039] , b.shape =  (1,)


In [38]:
learning_rate = 1e-2
f = lambda x : loss_func(x_data, t_data)
print("Initial loss value = ", loss_func(x_data, t_data))
for step in range(50001):
    W -= learning_rate * numerical_derivative(f, W)
    b -= learning_rate * numerical_derivative(f, b)
    if (step % 5000 == 0):
        print("step = ", step, "loss value = ", loss_func(x_data, t_data))

Initial loss value =  34.45158634101983
step =  0 loss value =  17.10188341957714
step =  5000 loss value =  0.8456919640701164
step =  10000 loss value =  0.6211750262556567
step =  15000 loss value =  0.5096059683309031
step =  20000 loss value =  0.43786161522349637
step =  25000 loss value =  0.38620850715839294
step =  30000 loss value =  0.3465743941745077
step =  35000 loss value =  0.3148900133656988
step =  40000 loss value =  0.28882318923769584
step =  45000 loss value =  0.2669157182070366
step =  50000 loss value =  0.24819670407825808


In [39]:
test_data = np.array([3.0])
real, logical = predict(test_data)
real, logical

(array([1.41556037e-09]), 0)

In [40]:
test_data = np.array([17.0])
real, logical = predict(test_data)
real, logical

(array([0.99974546]), 1)

In [41]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

dataset = load_breast_cancer()

In [42]:
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [43]:
print(dataset.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [44]:
train = pd.DataFrame(dataset.data, columns = dataset.feature_names)
target = pd.DataFrame(dataset.target, columns = ["cancer"])

In [45]:
train.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [46]:
data = pd.concat([train, target], axis = 1)

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [48]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data[["mean radius"]], data[["cancer"]],
                                                    test_size = 0.25, random_state = 42)

In [49]:
print(X_train)
print(y_train)

     mean radius
287       12.890
512       13.400
402       12.960
446       17.750
210       20.580
..           ...
71         8.888
106       11.640
270       14.290
435       13.980
102       12.180

[426 rows x 1 columns]
     cancer
287       1
512       0
402       1
446       0
210       0
..      ...
71        1
106       1
270       1
435       0
102       1

[426 rows x 1 columns]


In [50]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver = "liblinear")
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [51]:
from sklearn.metrics import accuracy_score

pred = model.predict(X_test)
print("mean radius 만으로 예측한 결과: ", pred)
accuracy_score(y_test, pred)

mean radius 만으로 예측한 결과:  [1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 0 1 1 1 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 0 1 1 0
 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1]


0.916083916083916

In [56]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data['cancer'], random_state = 42)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(f"전체 데이터로 예측한 결과 : {score}")

전체 데이터로 예측한 결과 : 0.958041958041958


In [57]:
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.958041958041958

In [59]:
dataset = load_breast_cancer()
train = pd.DataFrame(dataset.data, columns = dataset.feature_names)
target = pd.DataFrame(dataset.target, columns = ["cancer"])
data = pd.concat([train, target], axis = 1)
data.info()

X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1], data['cancer'], random_state = 42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [64]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver = "liblinear")
model.fit(X_train_scaled, y_train)
score = model.score(X_test_scaled, y_test)
print(f"전체 데이터로 예측한 결과: {score}")

전체 데이터로 예측한 결과: 0.9790209790209791


In [67]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [69]:
feature_names = ["pclass", "age", "sex"]

In [70]:
dfX = df[feature_names].copy()
dfy = df["survived"].copy()

In [71]:
dfX

Unnamed: 0,pclass,age,sex
0,3,22.0,male
1,1,38.0,female
2,3,26.0,female
3,1,35.0,female
4,3,35.0,male
...,...,...,...
886,2,27.0,male
887,1,19.0,female
888,3,,female
889,1,26.0,male


In [73]:
dfy

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [74]:
dfX["sex"] = LabelEncoder().fit_transform(dfX["sex"])

In [75]:
dfX["age"].fillna(dfX["age"].mean(), inplace = True)

In [76]:
dfX.tail()

Unnamed: 0,pclass,age,sex
886,2,27.0,1
887,1,19.0,0
888,3,29.699118,0
889,1,26.0,1
890,3,32.0,1


In [78]:
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size = 0.3, random_state = 1)

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
log_clf.score(X_test, y_test)

0.7761194029850746