# Logistic Regression
Source: [Logistic Regression](https://towardsdatascience.com/real-world-implementation-of-logistic-regression-5136cefb8125)

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from path import Path

In [6]:
data = Path("full_numeric_table.csv")
df = pd.read_csv(data)
df.head()

Unnamed: 0,patient_no,gender_numeric,age,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level,diabetes_numeric
0,1,0,80.0,5,0,1,0,25.19,6.6,140,0
1,3,1,28.0,2,0,0,0,27.32,5.7,158,0
2,4,0,36.0,3,0,0,1,23.45,5.0,155,0
3,5,1,76.0,5,1,1,1,20.14,4.8,155,0
4,6,0,20.0,1,0,0,0,27.32,6.6,85,0


In [7]:
X = df.drop(columns="diabetes_numeric")
y = df["diabetes_numeric"]

In [8]:
X.describe()

Unnamed: 0,patient_no,gender_numeric,age,age_range,hypertension_numeric,heart_disease_numeric,smoking_history_numeric,bmi,hba1c_level,blood_glucose_level
count,14092.0,14092.0,14092.0,14092.0,14092.0,14092.0,14092.0,14092.0,14092.0,14092.0
mean,27869.059679,0.430457,53.04019,3.834445,0.171445,0.091825,0.418819,30.110906,6.158487,163.220551
std,29966.219709,0.495158,18.794395,1.464959,0.376911,0.288789,0.493383,7.245121,1.285076,56.821153
min,1.0,0.0,0.16,0.0,0.0,0.0,0.0,10.77,3.5,80.0
25%,5482.75,0.0,40.0,3.0,0.0,0.0,0.0,26.09,5.7,130.0
50%,10894.5,0.0,55.0,4.0,0.0,0.0,0.0,27.8,6.1,155.0
75%,49159.25,1.0,68.0,5.0,0.0,0.0,1.0,33.6,6.6,200.0
max,99980.0,1.0,80.0,5.0,1.0,1.0,1.0,88.72,9.0,300.0


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
print(f"Labels: {y[:10]}")
print(f"Data: {X[:10]}")

Labels: 0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: diabetes_numeric, dtype: int64
Data:    patient_no  gender_numeric   age  age_range  hypertension_numeric  \
0           1               0  80.0          5                     0   
1           3               1  28.0          2                     0   
2           4               0  36.0          3                     0   
3           5               1  76.0          5                     1   
4           6               0  20.0          1                     0   
5           9               1  42.0          3                     0   
6          10               0  32.0          2                     0   
7          11               0  53.0          4                     0   
8          12               0  54.0          4                     0   
9          13               0  78.0          5                     0   

   heart_disease_numeric  smoking_history_numeric    bmi  hba1c_level  \
0            

In [13]:
#Create a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=1000,
                               random_state=1)

In [14]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=1)

In [15]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                  intercept_scaling=1, l1_ratio=None, max_iter=1000,
                  multi_class='warn',n_jobs=None, penalty='l2',
                  random_state=1, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(max_iter=1000, multi_class='warn', random_state=1)

In [17]:
#predictions = classifier.predict(X_test)
#pd.DataFrame({"Prediction": predictions, "Actual": y_test})
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,0,0
2,1,1
3,0,0
4,0,0
5,1,1
6,1,1
7,0,0
8,0,0
9,1,1


In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

0.9602611410729492

In [19]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9588418961112688
Testing Data Score: 0.9602611410729492


In [21]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
12193,1,1
2377,0,0
7364,1,1
3006,0,0
3830,0,0
...,...,...
9943,1,1
9198,1,1
10927,1,1
4116,0,0


In [23]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9602611410729492


In [25]:
from sklearn.metrics import confusion_matrix, classification_report
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1715   50]
 [  90 1668]]


In [26]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.97      0.96      1765
           1       0.97      0.95      0.96      1758

    accuracy                           0.96      3523
   macro avg       0.96      0.96      0.96      3523
weighted avg       0.96      0.96      0.96      3523

