Predicting the incidence of diabetes in people between 21 and 81 years old using a logistic regression algorithm

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
# Apply the seaborn theme first. Every sns.set() call re-applies the full
# theme to matplotlib's rcParams (font sizes included), so an earlier
# sns.set(style="white") would simply be overwritten by this one.
sns.set(style="whitegrid", color_codes=True)
# Set the base font size after the theme so seaborn does not reset it.
plt.rc("font", size=14)

In [None]:
# Load the dataset and discard every row that contains a missing value.
data = pd.read_csv('diabetes.csv').dropna()
print(data.shape)  # (rows, columns) left after the drop
data.head()  # preview of the first rows (shown as the cell output)

In [None]:
# Number of rows for each distinct value of the 'Outcome' column.
data['Outcome'].value_counts()

In [None]:
# Bar chart of how many rows carry each 'Outcome' value.
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` -- confirm against the installed version.
sns.countplot(data=data, x='Outcome', palette='hls')
plt.show()

In [None]:
# Class balance: share of each label among the rows labelled 0 or 1.
outcome = data['Outcome']
count_no = int((outcome == 0).sum())   # rows with Outcome == 0
count_yes = int((outcome == 1).sum())  # rows with Outcome == 1
total = count_no + count_yes

pct_of_no = count_no / total
pct_of_yes = count_yes / total

print('Percentage of healthy people: ', pct_of_no*100)
print('Percentage of sick people: ', pct_of_yes*100)

In [None]:
# Descriptive statistics for the columns of the loaded data.
data.describe()

In [None]:
# Mean of every column, computed separately for each 'Outcome' value.
data.groupby('Outcome').mean()

In [None]:
# Visualization
%matplotlib inline
data.Glucose.hist()
plt.title("Histogram of Glucose")
plt.xlabel("Glucose")
plt.ylabel("Frequency")

In [12]:
# Columns used as model inputs; 'Outcome' is the prediction target.
cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
x = data.loc[:, cols]  # feature matrix
y = data.Outcome       # target labels

In [13]:
# Hold out 30% of the rows for evaluation.
# random_state pins the shuffle so the split (and every metric computed from
# it) is reproducible between runs; stratify=y keeps the class proportions
# of 'Outcome' the same in the training and test sets.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Fit a logistic-regression classifier on the training split, then predict
# labels for the held-out rows.
# max_iter is raised above the default of 100 because the features are fed
# in unscaled, which makes the solver prone to stopping before it converges.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)

In [None]:
# Confusion matrix of held-out labels versus model predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix: \n', cm)

In [None]:
# Per-class precision, recall, F-score and support on the held-out rows.
prfs = precision_recall_fscore_support(y_test, y_pred)
headings = ('Precision: \n', 'Recall: \n', 'F-Score: \n', 'Support: \n')
# The returned tuple is ordered (precision, recall, f-score, support),
# so it pairs one-to-one with the headings above.
for heading, per_class in zip(headings, prfs):
    print(heading, per_class)