In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries in the ../input directory.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the liver patient CSV into the DataFrame used by the rest of the notebook.
data=pd.read_csv('../input/indian_liver_patient.csv')

In [None]:
# Count patients per diagnosis class and per gender.
# value_counts() orders its result by frequency, so each count is looked up
# by its label instead of by position; positional unpacking would silently
# attach the wrong label whenever the majority class is not the expected one.
dataset_counts = data['Dataset'].value_counts()
gender_counts = data['Gender'].value_counts()

# Dataset == 1 is the liver-disease class and Dataset == 2 the healthy class
# (the same coding used for the plot labels in plot_data).
disease = dataset_counts.get(1, 0)
no_disease = dataset_counts.get(2, 0)
male = gender_counts.get('Male', 0)
female = gender_counts.get('Female', 0)

info=['Diognised with Liver Disease', 'Not Diognised with Liver Disease', 'Male', 'Female']
count=[disease, no_disease, male, female]

# Small summary table: one row per category with its patient count.
df_patient=pd.DataFrame({'Patient Info': info, 'Count': count})

In [None]:
# Column dtypes and non-null counts; the quickest way to spot missing values.
data.info()

In [None]:
# First five records, transposed so each column of the frame becomes a row.
data.head().T

In [None]:
# Display the per-class / per-gender count table built above.
df_patient

In [None]:
# Summary statistics of the numeric columns, one row per column.
data.describe().T

In [None]:
# Fill missing albumin/globulin ratios with the column median.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# selected column is chained assignment, which newer pandas versions warn
# about and which does not modify `data` once copy-on-write is enabled.
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(
    data['Albumin_and_Globulin_Ratio'].median()
)

In [None]:
# Keep a handle on the class label column.
target = data['Dataset']

# One-hot encode gender. The dtype is pinned so the indicator columns are
# numeric on every pandas version (pandas >= 2.0 returns booleans by default).
sex = pd.get_dummies(data['Gender'], dtype=np.uint8)

# Put the indicators at the front of the frame. 'Male' is inserted at position
# 0 first and 'Female' afterwards, so the final order is Female, Male, ...
data.insert(loc=0, column='Male', value=sex['Male'])
data.insert(loc=0, column='Female', value=sex['Female'])

# The text column is redundant once the indicators exist.
data = data.drop(['Gender'], axis=1)

In [None]:
# Feature columns to plot: every column except the class label and the two
# gender indicators. The list is built straight from data.columns (not via a
# set) so the features keep the frame's column order and the subplots appear
# in the same position on every run.
cols = [col for col in data.columns if col not in ('Dataset', 'Male', 'Female')]

In [None]:
def plot_data(cols, data, plot_type):
    """Plot every column in ``cols`` on its own subplot of one large figure.

    Parameters
    ----------
    cols : sequence of str
        Names of the columns of ``data`` to plot, one subplot per column.
    data : pandas.DataFrame
        Frame holding the columns in ``cols`` plus a ``'Dataset'`` column.
        Rows with ``Dataset == 1`` are drawn as "Liver disease" and rows with
        ``Dataset == 2`` as "Healthy liver".
    plot_type : str
        ``'hist'``   - overlaid distributions of both classes;
        ``'cdf'``    - cumulative histogram plus empirical CDF of the
        ``Dataset == 1`` rows;
        ``'swarm'``, ``'box'``, ``'violin'`` - the matching seaborn plot of
        the column grouped by ``'Dataset'``.
        Any other value leaves the subplots empty.

    Returns
    -------
    int
        Always ``0``.
    """
    grid_cols = 3
    # Keep the 3x3 layout for up to nine columns and add rows beyond that,
    # so a longer column list does not overflow the grid.
    grid_rows = max(3, (len(cols) + grid_cols - 1) // grid_cols)

    fig = plt.figure(figsize = (25,25))

    is_disease = data["Dataset"] == 1
    is_healthy = data["Dataset"] == 2

    # Plot types that share the call shape f(x='Dataset', y=<column>, data=data).
    grouped_plotters = {
        'swarm': sns.swarmplot,
        'box': sns.boxplot,
        'violin': sns.violinplot,
    }

    for idx, val in enumerate(cols):

        ax = fig.add_subplot(grid_rows, grid_cols, idx + 1)

        if plot_type == 'hist':
            # distplot is deprecated in recent seaborn releases; it is kept
            # here so the figures look the same as before.
            sns.distplot(data.loc[is_disease, val], color="blue", label="Liver disease", ax=ax)
            sns.distplot(data.loc[is_healthy, val], color="orange", label="Healthy liver", ax=ax)
            ax.legend()
            ax.set_xlabel(val)
            ax.set_ylabel("Frequency")

        elif plot_type == 'cdf':
            values = data.loc[is_disease, val]
            # `density` is the current name of matplotlib's former `normed` flag.
            ax.hist(values, bins=50, fc=(0,1,0,0.5), label="Liver disease",
                    density=True, cumulative=True)
            sorted_data = np.sort(values)
            # Empirical CDF running from 0 to 1; the max() guards the
            # single-observation case against a division by zero.
            y = np.arange(len(sorted_data)) / float(max(len(sorted_data) - 1, 1))
            ax.plot(sorted_data, y, color='red')
            ax.set_title('CDF of liver disease ' + str(val))

        elif plot_type in grouped_plotters:
            grouped_plotters[plot_type](x='Dataset', y=val, data=data, ax=ax)

    return 0

In [None]:
# Overlaid per-feature distributions of the two Dataset classes.
plot_data(cols, data, 'hist')

In [None]:
# Swarm plot of every feature, grouped by the Dataset class.
plot_data(cols, data, 'swarm')

In [None]:
# Box plot of every feature, grouped by the Dataset class.
plot_data(cols, data, 'box')

In [None]:
# Cumulative distribution of total bilirubin for the Dataset == 1 rows:
# a normalised cumulative histogram with the empirical CDF drawn on top.
disease_bilirubin = data[data["Dataset"] == 1].Total_Bilirubin

# `density` is the current name of matplotlib's former `normed` flag, which
# recent matplotlib releases no longer accept.
plt.hist(disease_bilirubin, bins=50, fc=(0,1,0,0.5), label='Liver disease',
         density=True, cumulative=True)

# Empirical CDF: sorted observations against evenly spaced values in [0, 1].
sorted_data = np.sort(disease_bilirubin)
y = np.arange(len(sorted_data))/float(len(sorted_data)-1)
plt.plot(sorted_data,y,color='red')
plt.title('CDF of liver dicease bilirubin')
plt.show()

In [None]:
# Split the patients by diagnosis. Dataset == 1 marks liver disease and
# Dataset == 2 a healthy liver (the coding used for the labels in plot_data),
# so the variable names now match the rows they hold.
data_disease = data[data["Dataset"] == 1]
data_healthy = data[data["Dataset"] == 2]

# Per-column quantiles of the healthy group.
# NOTE(review): 0.0025 is not the mirror image of 0.975; if a symmetric 95%
# range was intended the lower quantile should be 0.025 - confirm.
Q0 = data_healthy.quantile(0.0025)
Q1 = data_healthy.quantile(0.25)
Q2 = data_healthy.quantile(0.50)
Q3 = data_healthy.quantile(0.75)
Q4 = data_healthy.quantile(0.975)

Q=[Q0, Q1, Q2, Q3, Q4]

Q

In [None]:
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Work on a purely floating-point frame from here on.
data=data.astype(np.float64)
# Created here and reused by the classification pipeline further down.
scaler=MinMaxScaler()

# Elbow analysis: fit k-means for k = 1..11 and record the inertia of each fit.
# NOTE(review): the clustering runs on the unscaled frame including the
# 'Dataset' label column - confirm that this is intended.
ks = range(1, 12)
inertias = []

for k in ks:
    # random_state makes the curve reproducible between runs; n_init is set
    # explicitly because its default differs between scikit-learn versions.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)

    # Fit model to samples
    model.fit(data.values)

    # Inertia = within-cluster sum of squared distances for this k
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()


In [None]:
# Create feature and target arrays.
# The target is the 'Dataset' label and the features are every other column:
# leaving the label inside X would leak the answer to the classifier, and y
# has to be defined here rather than picked up from an earlier plotting cell.
y = data['Dataset'].astype(int).values
X = data.drop(['Dataset'], axis=1).values

reg=KNeighborsClassifier(n_neighbors=4)

# Scaler + classifier in one estimator, so the scaler is fitted on the
# training split only and then applied to the test split.
pipeline = make_pipeline(scaler,reg)

# Split into training and test set, keeping the class proportions in both
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, stratify=y)

# Fit the whole pipeline (not the bare classifier) so k-NN distances are
# computed on scaled features
pipeline.fit(X_train, y_train)

# Print the accuracy on the held-out test set
print(pipeline.score(X_test, y_test))

In [None]:
# 10-fold cross-validated accuracy. The scaler+classifier pipeline is scored
# rather than the bare classifier, so every fold scales its features using
# statistics from its own training part before the k-NN distances are computed.
cvscores_10 = cross_val_score(pipeline, X, y, cv = 10)
print(np.mean(cvscores_10))

In [None]:
# Pairwise correlations between all numeric (and boolean) columns.
num_cols = data.select_dtypes(include=[np.number, 'bool']).columns
cor = data[num_cols].corr()

# NOTE(review): threshold is not applied anywhere in this cell - confirm
# whether the pairs were meant to be filtered by |correlation| >= threshold.
threshold = 0.7

# Collect [correlation, i, j] for every column pair above the diagonal,
# skipping undefined (NaN) correlations.
corlist = []

for i in range(0,len(num_cols)):
    for j in range(i+1,len(num_cols)):
        if pd.notna(cor.iloc[i,j]):
            corlist.append([cor.iloc[i,j],i,j])

#Sort higher correlations first
sort_corlist = sorted(corlist,key=lambda x: -abs(x[0]))

#Print the correlation of each feature with the 'Dataset' label
for x,i,j in sort_corlist:
    if num_cols[i] != 'Dataset' and num_cols[j] == 'Dataset':
        print (num_cols[i], num_cols[j], x)

In [None]:
# Walk the correlation list (strongest first) and keep every pair in which
# neither column is the 'Dataset' label: the first column name goes to
# x_plot, the second to y_plot. Each kept pair is printed with its correlation.
x_plot = []
y_plot = []
for corr_value, first_idx, second_idx in sort_corlist:
    first_name = num_cols[first_idx]
    second_name = num_cols[second_idx]
    if 'Dataset' in (first_name, second_name):
        continue
    print (first_name,second_name,corr_value)
    x_plot.append(first_name)
    y_plot.append(second_name)

In [None]:
# One scatter plot per feature pair in x_plot / y_plot, coloured by class.
# The grid keeps seven plots per row and gets as many rows as the number of
# pairs requires, so a long pair list cannot run past the available axes.
n_plots = len(x_plot)
grid_cols = 7
grid_rows = max(1, (n_plots + grid_cols - 1) // grid_cols)

# 20 x 15 inches for a seven-row grid; the height scales with the row count.
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=grid_rows, ncols=grid_cols,
                         figsize=(20, 15 * grid_rows / 7.0), squeeze=False)
axes=axes.flatten()
for i in range(n_plots):
    sns.scatterplot(data=data, x=x_plot[i], y=y_plot[i], ax=axes[i], hue='Dataset')

# Hide the grid cells that did not receive a plot.
for unused_ax in axes[n_plots:]:
    unused_ax.set_visible(False)



In [None]:
from sklearn.preprocessing import MinMaxScaler

# Cast the frame to float and rescale every column with a fresh MinMaxScaler,
# writing the scaled values back into `data`.
data = data.astype(np.float64)
scaler = MinMaxScaler()
all_columns = data.columns
data[all_columns] = scaler.fit_transform(data[all_columns])

# Preview the rescaled frame.
data.head()

In [None]:
# Histogram of total bilirubin, drawn twice on the same axes. The log1p
# transform between the two calls is disabled, so both calls plot the same
# values.
bilirubin = data['Total_Bilirubin']
plt.hist(bilirubin)

#data['Total_Bilirubin'] = np.log1p(data['Total_Bilirubin'])
plt.hist(bilirubin)

