# Logistic Regression  

In [3]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the Pima Indians diabetes dataset from the local CSV file.
pima = pd.read_csv('./dataset/pima-indians-diabetes.csv')

# Preview the first five rows (five is also the head() default).
print(pima.head(n=5))

   pregnant  glucose  bp  skin  insulin   bmi  pedigree  age  label
0         6      148  72    35        0  33.6     0.627   50      1
1         1       85  66    29        0  26.6     0.351   31      0
2         8      183  64     0        0  23.3     0.672   32      1
3         1       89  66    23       94  28.1     0.167   21      0
4         0      137  40    35      168  43.1     2.288   33      1


In [6]:
# Predict 'label' from four of the columns: 'pregnant', 'insulin', 'bmi', 'age'.
df = pima.loc[:, ['pregnant', 'insulin', 'bmi', 'age', 'label']]

# Features are every selected column except the target; the target is 'label'.
X = df.drop(columns='label')
y = df.loc[:, 'label']

# Preview the first five feature rows.
print(X.head(n=5))

   pregnant  insulin   bmi  age
0         6        0  33.6   50
1         1        0  26.6   31
2         8        0  23.3   32
3         1       94  28.1   21
4         0      168  43.1   33


In [7]:
# Preview the first five target values.
print(y.head(n=5))

0    1
1    0
2    1
3    0
4    1
Name: label, dtype: int64


In [8]:
# Hold out 30% of the rows for testing; random_state is the seed value that
# makes the shuffle (and so the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the standardisation statistics from the training rows only, and scale them.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)

# Build the model: logistic regression with default settings,
# fitted on the scaled training data.
model = LogisticRegression().fit(X_train, y_train)

# Scale the test rows with the training-set scaler, then predict.
X_test_nor = scaler.transform(X_test)
y_pred = model.predict(X_test_nor)

# Evaluate the model on the held-out rows.
con_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

accuracy: 0.7056277056277056
confusion matrix: [[128  18]
 [ 50  35]]


# K-Nearest Neighbors

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the seeds dataset. header=None tells pandas the file has no header
# row, so the columns are labelled with integers.
df = pd.read_csv('./dataset/seeds_dataset.csv', header=None)

# Preview the first five rows.
print(df.head(n=5))

       0      1       2      3      4      5      6  7
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220  1
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956  1
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825  1
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805  1
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175  1


In [10]:
# Columns 0 through 6 are used as the input features.
X = df[list(range(7))]

# Column 7 is used as the target; subtract 1 from every value in it.
y = df[7].sub(1)

# Preview the first five feature rows.
print(X.head(n=5))

       0      1       2      3      4      5      6
0  15.26  14.84  0.8710  5.763  3.312  2.221  5.220
1  14.88  14.57  0.8811  5.554  3.333  1.018  4.956
2  14.29  14.09  0.9050  5.291  3.337  2.699  4.825
3  13.84  13.94  0.8955  5.324  3.379  2.259  4.805
4  16.14  14.99  0.9034  5.658  3.562  1.355  5.175


In [11]:
# Preview the first five target values after the shift by one.
print(y.head(n=5))

0    0
1    0
2    0
3    0
4    0
Name: 7, dtype: int64


In [12]:
# Split data into training data (80%) and testing data (20%).
# random_state fixes the shuffle seed so the split, and therefore every metric
# printed at the end of this cell, is the same on each run. Without it the
# reported accuracy changes whenever the cell is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Fit the scaler on the training rows only, then standardise them.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Build the model with k=5 neighbours. The value is stated explicitly so the
# code matches the stated assumption (5 is also the scikit-learn default).
model = neighbors.KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Scale the test rows with the training-set scaler before predicting.
X_test = scaler.transform(X_test)
y_pred = model.predict(X_test)

# Evaluate the model.
# normalize=False makes accuracy_score return the number of correctly
# classified samples instead of the fraction.
# NOTE: confusion_matrix is not imported by this section's import cell; it is
# in scope from the import cell of the logistic regression section.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 40
accuracy: 0.9523809523809523
confusion matrix: [[12  0  1]
 [ 0 13  0]
 [ 1  0 15]]
