Use the UCI diabetes data set and the Pima Indians Diabetes data set to perform the
following:

a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard
Deviation, Skewness and Kurtosis

b. Bivariate analysis: Linear and logistic regression
modeling

c. Multiple Regression analysis

d. Also compare the results of the above analysis for
the two data sets


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report

from scipy.stats import skew, kurtosis
import statsmodels.api as sm   #OLS
from sklearn.linear_model import LinearRegression, LogisticRegression

# Read both diabetes data sets from CSV files in the working directory.
uci, pima = (pd.read_csv(csv_name) for csv_name in ("uci.csv", "pima.csv"))

In [4]:
# Preview the UCI frame: a bare trailing expression is rendered as the cell's output.
uci


Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,Male,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,Male,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,Male,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,Male,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1
516,48,Female,1,1,1,1,1,0,0,1,1,1,1,0,0,0,1
517,58,Female,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1
518,32,Female,0,0,0,1,0,0,1,1,0,1,0,0,1,0,0


In [5]:
# Preview the Pima frame: a bare trailing expression is rendered as the cell's output.
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [6]:
# Univariate analysis

# describe() reports count, mean, std, min, quartiles (the 50% row is the
# median) and max for every numeric column.
uci_statistics = uci.describe()
# numeric_only=True is required here: `uci` contains the text column "Gender".
# Without it pandas 1.5 emits a FutureWarning and pandas >= 2.0 fails outright
# when skew()/kurtosis() reach a non-numeric column.
uci_skewness = uci.skew(numeric_only=True)
uci_kurtosis = uci.kurtosis(numeric_only=True)

pima_statistics = pima.describe()
# Passed for symmetry with the UCI calls; it only restricts the computation to
# numeric columns.
pima_skewness = pima.skew(numeric_only=True)
pima_kurtosis = pima.kurtosis(numeric_only=True)

# Print all of the variables above, each under its own heading.
for heading, result in (
    ("UCI summary statistics", uci_statistics),
    ("UCI skewness", uci_skewness),
    ("UCI kurtosis", uci_kurtosis),
    ("PIMA summary statistics", pima_statistics),
    ("PIMA skewness", pima_skewness),
    ("PIMA kurtosis", pima_kurtosis),
):
    print(f"{heading}\n{result}\n")


In [7]:
# Bivariate analysis
# For UCI: predict "class" from the "sudden weight loss" and "Obesity" columns.
uci_X = uci[["sudden weight loss", "Obesity"]]
uci_y = uci["class"]

# Split dataset: 20 % held out for testing, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    uci_X, uci_y, test_size=0.2, random_state=42
)

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the root explicitly; this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Linear Regression RMSE:", rmse)

# Logistic Regression (`model` and `y_pred` are deliberately rebound, as before)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)


# For PIMA: predict "Outcome" from the "BloodPressure" and "BMI" columns.
pima_X = pima[["BloodPressure", "BMI"]]
pima_y = pima["Outcome"]

# Split dataset: 20 % held out for testing, fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    pima_X, pima_y, test_size=0.2, random_state=42
)

# Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# RMSE is the square root of the MSE. The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so take the root explicitly; this works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Linear Regression RMSE:", rmse)

# Logistic Regression (`model` and `y_pred` are deliberately rebound, as before)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)


Linear Regression RMSE: 0.44507664478143916
Logistic Regression Accuracy: 0.6634615384615384
Linear Regression RMSE: 0.46703602274686407
Logistic Regression Accuracy: 0.6558441558441559


In [8]:
# Multiple regression analysis
# Ordinary least squares (OLS) regression estimates the relationship between
# one or more independent variables and a dependent variable. It is used here
# as the regression strategy for several predictors at once.

# UCI: regress "class" on three predictor columns plus an intercept term.
y_uci = uci["class"]
x_uci = sm.add_constant(uci[["Polyuria", "Polydipsia", "visual blurring"]])
model_uci = sm.OLS(endog=y_uci, exog=x_uci).fit()
uci_summary = model_uci.summary()

# PIMA: regress "Outcome" on three predictor columns plus an intercept term.
y_pima = pima["Outcome"]
x_pima = sm.add_constant(pima[["Glucose", "Insulin", "DiabetesPedigreeFunction"]])
model_pima = sm.OLS(endog=y_pima, exog=x_pima).fit()
pima_summary = model_pima.summary()

# Emit both regression tables, UCI first, one after the other.
print(uci_summary, pima_summary, sep="\n")

                            OLS Regression Results                            
Dep. Variable:                  class   R-squared:                       0.541
Model:                            OLS   Adj. R-squared:                  0.539
Method:                 Least Squares   F-statistic:                     203.1
Date:                Mon, 06 Nov 2023   Prob (F-statistic):           5.94e-87
Time:                        07:21:56   Log-Likelihood:                -160.49
No. Observations:                 520   AIC:                             329.0
Df Residuals:                     516   BIC:                             346.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               0.2292      0.023     