In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the dataset from a CSV file in the working directory, then preview
# the first two rows to check the column layout.
df = pd.read_csv('PimaIndians.csv')
df.head(n=2)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,1,89,66,23,94,28.1,0.167,21,negative
1,0,137,40,35,168,43.1,2.288,33,positive


In [3]:
# Display every column except the first one (positional slice, all rows).
df.iloc[:, slice(1, None)]

Unnamed: 0,glucose,diastolic,triceps,insulin,bmi,family,age,test
0,89,66,23,94,28.1,0.167,21,negative
1,137,40,35,168,43.1,2.288,33,positive
2,78,50,32,88,31.0,0.248,26,positive
3,197,70,45,543,30.5,0.158,53,positive
4,189,60,23,846,30.1,0.398,59,positive
...,...,...,...,...,...,...,...,...
387,181,88,44,510,43.3,0.222,26,positive
388,128,88,39,110,36.5,1.057,37,positive
389,88,58,26,16,28.4,0.766,22,negative
390,101,76,48,180,32.9,0.171,63,negative


In [4]:
# Features are all columns but the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [5]:
# Hold out 25% of the rows for testing. The seed is pinned to the same value
# used by the later splits in this notebook, so the baseline split (and every
# number derived from it) is reproducible on a fresh kernel and directly
# comparable with the feature-elimination rounds.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [6]:
# Unfitted scaler instance; it is fitted on training features in later cells
# and the same object is reused to transform the matching test features.
scaler = StandardScaler()

In [7]:
# Logistic regression classifier built with no arguments (library defaults);
# this single instance is refit in each later modelling cell.
lr = LogisticRegression()

### Fit the scaler on the training features and transform

In [8]:
# Fit the scaler on the training split only and scale that split in one call,
# so no information from the test split enters the scaling parameters.
X_train_std = scaler.fit_transform(X_train)

### Fit the logistic regression model on the scaled training data

In [9]:
# Train the classifier on the scaled training features; the fitted estimator
# is the cell's displayed value.
lr.fit(X=X_train_std, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Scale the test features

In [10]:
# Transform only (no refit): the test split is scaled with the parameters
# the scaler learned from the training split.
X_test_std = scaler.transform(X_test)

### Predict diabetes presence on the scaled test set

In [11]:
# Predicted labels for the scaled test features, compared against y_test below.
y_pred = lr.predict(X_test_std)

### Calculate accuracy metrics and feature coefficients

In [12]:
# Share of test rows whose predicted label matches the true label.
baseline_accuracy = accuracy_score(y_test, y_pred)
print("{0:.1%} accuracy on test set.".format(baseline_accuracy))

# Absolute coefficient per feature, rounded to two decimals. Because the
# inputs were standardized, the magnitudes can be compared across features.
coef_magnitudes = np.abs(lr.coef_[0]).round(2)
print(dict(zip(X.columns, coef_magnitudes)))

78.6% accuracy on test set.
{'pregnant': 0.16, 'glucose': 1.12, 'diastolic': 0.09, 'triceps': 0.13, 'insulin': 0.06, 'bmi': 0.27, 'family': 0.43, 'age': 0.5}


### Manual Recursive Feature Elimination

In [13]:
# First elimination round: keep seven named feature columns
# ('diastolic' is left out).
X = df.loc[:, [
    'pregnant',
    'glucose',
    'triceps',
    'insulin',
    'bmi',
    'family',
    'age',
]]

In [14]:
# Seeded 75/25 split of the reduced feature set against the same target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

### Scale the features and fit the logistic regression model

In [15]:
# Refit the shared scaler on the new training split, then refit the shared
# classifier on the scaled result (the fitted estimator is displayed).
X_train_scaled = scaler.fit_transform(X_train)
lr.fit(X_train_scaled, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Calculate the accuracy on the test set and print the coefficients

In [16]:
# Scale the test split with the already-fitted scaler, predict, and score.
X_test_scaled = scaler.transform(X_test)
test_predictions = lr.predict(X_test_scaled)
acc = accuracy_score(y_test, test_predictions)
print("{0:.1%} accuracy on test set.".format(acc))

# Absolute coefficient per remaining feature, rounded to two decimals.
print(dict(zip(X.columns, np.abs(lr.coef_[0]).round(2))))

80.6% accuracy on test set.
{'pregnant': 0.05, 'glucose': 1.24, 'triceps': 0.24, 'insulin': 0.2, 'bmi': 0.39, 'family': 0.34, 'age': 0.35}


### Remove the 2 features with the lowest model coefficients

In [17]:
# Second elimination round: 'pregnant' and 'insulin' are dropped, leaving
# five named feature columns.
X = df.loc[:, ['glucose', 'triceps', 'bmi', 'family', 'age']]

In [18]:
# Same seeded 75/25 split, now over the five remaining features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [19]:
# Refit scaler and classifier on the five-feature training split.
lr.fit(
    scaler.fit_transform(X_train),
    y_train,
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Test accuracy for the five-feature model; the test split is transformed
# with the scaler fitted on the matching training split.
acc = accuracy_score(
    y_test,
    lr.predict(scaler.transform(X_test)),
)
print("{0:.1%} accuracy on test set.".format(acc))

# Rounded absolute coefficients, keyed by feature name.
abs_coefs = np.abs(lr.coef_[0]).round(2)
print(dict(zip(X.columns, abs_coefs)))

79.6% accuracy on test set.
{'glucose': 1.13, 'triceps': 0.25, 'bmi': 0.34, 'family': 0.34, 'age': 0.37}


### Keep only the feature with the highest coefficient

In [21]:
# Reduce the feature set to glucose alone (kept as a one-column DataFrame),
# repeat the seeded 75/25 split, then refit the scaler and the classifier.
X = df.loc[:, ['glucose']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
lr.fit(scaler.fit_transform(X_train), y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
# Score the single-feature model on the held-out split.
single_feature_predictions = lr.predict(scaler.transform(X_test))
acc = accuracy_score(y_test, single_feature_predictions)
print("{0:.1%} accuracy on test set.".format(acc))

# Only one coefficient remains; it is reported the same way as before.
print(dict(zip(X.columns, np.abs(lr.coef_[0]).round(2))))

75.5% accuracy on test set.
{'glucose': 1.28}
