# Data Processing

In [None]:
#importing the required libraries
import pandas as pd #Data-Extraction,Manupilation
import numpy as np #Numerical-task
import matplotlib.pyplot as plt #Data-Visulization
import seaborn as sns#Data-Visulization
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('kidney_disease.csv')#Reading the data from a CSV file in the working directory
df.head()#Previewing the first five rows (head() does not show the entire data)


In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
columns=pd.read_csv('data_description.txt',sep='-')#Data-Description(for variables)
columns=columns.reset_index()#Indexing

In [None]:
columns.columns=['cols','abb_col_names'] #Columns Renaming
columns

In [None]:
df.head()

In [None]:
columns['abb_col_names'].values

In [None]:
df.columns=columns['abb_col_names'].values #Data Renaming(Columns)


In [None]:
df.head()

In [None]:
df.dtypes#For Knowing the data-types

In [None]:
def convert_dtype(df,feature):
    """Convert column `feature` of `df` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN (errors='coerce').
    The frame is modified directly; nothing is returned.
    """
    numeric_values=pd.to_numeric(df[feature],errors='coerce')
    df[feature]=numeric_values

In [None]:
features=['packed cell volume','white blood cell count','red blood cell count']

for feature in features:#Converting the required variables to numeric
    convert_dtype(df,feature)

In [None]:
df.dtypes

In [None]:
#The id column carries no information for the analysis, so it is dropped.
#Rebinding df (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run.
df=df.drop(columns='id',errors='ignore')

In [None]:
df.tail()

# Data Cleaning

In [None]:
df.head()

In [None]:
#Differentiating between Categorical and Numerical Values
def extract_cat_num(df):
    """Split the columns of `df` into categorical and numerical names.

    A column counts as categorical when its dtype is 'object'; every other
    column is reported as numerical. Column order is preserved.

    Returns:
        (cat_col, num_col): two lists of column names.
    """
    cat_col,num_col=[],[]
    for col in df.columns:
        bucket=cat_col if df[col].dtype=='object' else num_col
        bucket.append(col)
    return cat_col,num_col

In [None]:
cat_col,num_col=extract_cat_num(df)

In [None]:
cat_col

In [None]:
num_col

In [None]:
#findng unique categories
for col in cat_col:
    print('{} has {} values'.format(col,df[col].unique()))
    print('\n')

In [None]:
#Replacing mislabelled categories (stray tab characters) with the correct labels.
#Every result is assigned back to its column: calling replace(..., inplace=True) on
#df[col] works on a temporary Series and, with pandas Copy-on-Write, leaves df unchanged.
df['diabetes mellitus']=df['diabetes mellitus'].replace(to_replace={'\tno':'no','\tyes':'yes'})

df['coronary artery disease']=df['coronary artery disease'].replace(to_replace='\tno',value='no')

df['class']=df['class'].replace(to_replace='ckd\t',value='ckd')

In [None]:
for col in cat_col:
    print('{} has {} values'.format(col,df[col].unique()))
    print('\n')

# Analysis Distribution of Data

In [None]:
len(num_col)

In [None]:
#Distribution of numerical columns in form of subplots
n_plot_cols=3
n_plot_rows=-(-len(num_col)//n_plot_cols)#ceiling division: enough rows for every column (14 columns -> 5 rows)
plt.figure(figsize=(30,20))

for i,feature in enumerate(num_col):
    plt.subplot(n_plot_rows,n_plot_cols,i+1)#subplot positions are 1-based
    df[feature].hist()
    plt.title(feature)

# Label Distribution of Categorical Data

In [None]:
#ckd,notckd

In [None]:
len(cat_col)

In [None]:
#Count plot for every categorical column, laid out on a grid sized from len(cat_col)
n_plot_cols=3
n_plot_rows=-(-len(cat_col)//n_plot_cols)#ceiling division: enough rows for every column
plt.figure(figsize=(20,30))

for i,feature in enumerate(cat_col):
  plt.subplot(n_plot_rows,n_plot_cols,i+1)
  #pass the column as x=...: recent seaborn binds the first positional argument to `data`, not `x`
  sns.countplot(x=df[feature])

In [None]:
#Class balance of the target; x= is explicit because recent seaborn binds the first positional argument to `data`
sns.countplot(x=df['class'])

# Checking correlation in data

In [None]:
plt.figure(figsize=(10,8))
#Correlation is only defined for numeric columns; df still holds object (string) columns here,
#and DataFrame.corr() raises on those in pandas >= 2.0, so restrict to numeric dtypes first.
corr_matrix=df.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix,annot=True)#Visualizing the correlation values

In [None]:
#Grouping on basis of RBC's
df.groupby(['red blood cells','class'])['red blood cell count'].agg(['count','mean','median','min','max'])

In [None]:
!pip install plotly

In [None]:
#impact on target feature
import plotly.express as px

In [None]:
df.columns

In [None]:
px.violin(df,y='red blood cell count',x='class',color='class')

In [None]:
#Relationship between Hemoglobin and packed cell volume
df.columns

In [None]:
px.scatter(df,x='haemoglobin',y='packed cell volume')

# Linear Trend can be seen in above graph

# RBC count in chronic as well as in non-chronic

In [None]:
#Kernel density estimate of 'red blood cell count', drawn separately for each value of 'class'
grid=sns.FacetGrid(df,hue='class',aspect=2)#hue='class' gives every class its own colour
grid.map(sns.kdeplot,'red blood cell count')#draw the density of this column on the grid
grid.add_legend()

# Automate the Analysis

In [None]:
#generating the graph for every columns by taking the whole class as parameters
def violin(col):
    """Show a violin plot (with inner box plot) of `col` from df, split and coloured by 'class'."""
    return px.violin(df,y=col,x='class',color='class',box=True).show()

In [None]:
def scatters(col1,col2):
    """Show a scatter plot of df with `col1` on x and `col2` on y, coloured by 'class'."""
    return px.scatter(df,x=col1,y=col2,color='class').show()

In [None]:
def kde_plot(feature):
    """Draw the kernel density estimate of `feature` from df, one curve per 'class' value."""
    facet=sns.FacetGrid(df,hue='class',aspect=2)
    facet.map(sns.kdeplot,feature)
    facet.add_legend()

In [None]:
kde_plot('red blood cell count')

# Performing EDA on Data

In [None]:
df.columns

In [None]:
kde_plot('haemoglobin')

# From the above graph we can conclude that a person who doesn't have chronic kidney disease has a higher range of haemoglobin

In [None]:
scatters('red blood cell count','packed cell volume')

In [None]:
scatters('red blood cell count','haemoglobin')

In [None]:
scatters( 'packed cell volume','haemoglobin')

According to the above graph, we can say that if a person has a high haemoglobin level with increasing packed cell volume, then that person has a high chance of having chronic disease.

In [None]:
violin('red blood cell count')

In [None]:
violin('packed cell volume')

If a person's packed cell volume is between 35 and 56 (see the violin plot above), they have a lower chance of having chronic kidney disease.

In [None]:
scatters('red blood cell count','albumin')

An albumin level above 0 affects the chronic disease.

# Treatment of Missing Values

In [None]:
#Checking for missiing values
df.isna().sum().sort_values(ascending=False)

In [None]:
#Category counts before imputation; x= is explicit because recent seaborn binds the first positional argument to `data`
sns.countplot(x=df['red blood cells'])

# Random Value Imputation

In [None]:
#filling the missing values using Random values in the dataset

In [None]:
data=df.copy()

In [None]:
data.head()

In [None]:
data['red blood cells'].dropna().sample()#draws a single random value from the non-null entries of the column

In [None]:
data['red blood cells'].isnull().sum()#number of null values in Reb blood cell

In [None]:
data[data['red blood cells'].isnull()].index#index must equal

In [None]:
#filling all null values in RBC's randomly through dataset
#Draw exactly as many observed values as there are missing ones (computed, not hard-coded),
#with a fixed seed so that re-running the notebook gives the same imputation.
n_missing_rbc=data['red blood cells'].isnull().sum()
random_sample=data['red blood cells'].dropna().sample(n_missing_rbc,random_state=42)
random_sample

In [None]:
random_sample.index=data[data['red blood cells'].isnull()].index

In [None]:
random_sample.index

In [None]:
random_sample

In [None]:
data.loc[data['red blood cells'].isnull(),'red blood cells']=random_sample

In [None]:
data.head()

In [None]:
data['red blood cells'].isnull().sum()

The number of null values in 'red blood cells' is now 0, so the missing values have been filled.

In [None]:
#Category counts after imputation, to compare against the distribution before imputation
#x= is explicit because recent seaborn binds the first positional argument to `data`
sns.countplot(x=data['red blood cells'])

In [None]:
def Random_value_Imputation(feature,frame=None,random_state=None):
    """Fill the missing values of one column with values drawn at random from its observed values.

    Parameters:
        feature: name of the column to impute.
        frame: DataFrame to modify in place; defaults to the notebook-level `data`.
        random_state: optional seed passed to Series.sample for reproducible draws.

    Raises:
        ValueError: if the column has missing values but no observed value to draw from.

    The frame is modified in place; nothing is returned.
    """
    if frame is None:
        frame=data
    missing_mask=frame[feature].isnull()
    n_missing=int(missing_mask.sum())
    if n_missing==0:
        return#nothing to impute
    observed=frame[feature].dropna()
    if observed.empty:
        raise ValueError('cannot impute {!r}: the column has no observed values'.format(feature))
    #sample with replacement only when there are more gaps than observed values,
    #otherwise sampling without replacement would raise
    random_sample=observed.sample(n_missing,replace=n_missing>len(observed),random_state=random_state)
    #assign by position so the sampled values land on the missing rows regardless of their original index
    frame.loc[missing_mask,feature]=random_sample.to_numpy()

In [None]:
data[num_col].isnull().sum()#checkinig missing values on numerical values

In [None]:
for col in num_col:
    Random_value_Imputation(col)
    

In [None]:
data[num_col].isnull().sum()

In [None]:
data[cat_col].isnull().sum()#checkinig missing values on categorical values

In [None]:
Random_value_Imputation(' pus cell')

In [None]:
#As there are less number of missing value in other variables we can replace them by mode
data['pus cell clumps'].mode()[0]

In [None]:
#replacing every missing values by its mode
def impute_mode(feature):
    """Fill the missing values of column `feature` in `data` with the column's most frequent value."""
    most_frequent=data[feature].mode()[0]#mode() returns a Series; take its first entry
    filled=data[feature].fillna(most_frequent)
    data[feature]=filled

In [None]:
for col in cat_col:
    impute_mode(col)

In [None]:
data[cat_col].isnull().sum()

# All the null values are replaced by randomly sampled values or by the mode


# Feature Engineering


In [None]:
data.head()

# Feature Encoding

In [None]:
#indentifying the categories
for col in cat_col:
    print('{} has {} categories'.format(col,data[col].nunique()))

In [None]:
##Label Encoding

##LabelEncoder numbers the categories of a column in sorted (alphabetical) order, e.g.
##abnormal--0
##normal--1

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le=LabelEncoder()

In [None]:
#Converting every categorical (string) column to integer codes.
#A separate encoder is fitted and kept per column: refitting one shared encoder would
#overwrite its classes_ each time, losing the code -> label mapping for all but the last column.
label_encoders={}
for col in cat_col:
    encoder=LabelEncoder()
    data[col]=encoder.fit_transform(data[col])
    label_encoders[col]=encoder#encoder.classes_ lists the original labels in code order

In [None]:
data.head()

As you can see in the data above, all categorical values have now been converted to numerical values.

In [None]:
#Selecting the best features for the model
#Scoring every feature against the target with the chi-squared test (a higher score means a stronger dependence)
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
#Independent Columns
ind_col=[col for col in data.columns if col!='class']
dep_col='class'#Dependent Columns

In [None]:
X=data[ind_col]
y=data[dep_col]

In [None]:
X.head()

In [None]:
y

In [None]:
#Fit a chi-squared feature scorer on X against y.
#In the cells that follow only the fitted scores_ are read; the k=20 selection itself is not applied via transform.
ordered_rank_features=SelectKBest(score_func=chi2,k=20)
ordered_feature=ordered_rank_features.fit(X,y)#the fitted selector, holding one chi-squared score per column of X

In [None]:
ordered_feature

In [None]:
ordered_feature.scores_

In [None]:
datascores=pd.DataFrame(ordered_feature.scores_,columns=['Score'])#making it more user friendly

In [None]:
datascores

In [None]:
dfcols=pd.DataFrame(X.columns)
dfcols

In [None]:
#Concatenation
features_rank=pd.concat([dfcols,datascores],axis=1)
features_rank

In [None]:
features_rank.columns=['features','Score']
features_rank

In [None]:
features_rank.nlargest(10,'Score')#TOP 10 FEATURES FOR BUILDING THE ML MODEL

In [None]:
selected_columns=features_rank.nlargest(10,'Score')['features'].values

In [None]:
selected_columns

These are the selected columns that will help us determine whether a person has ckd or not.

In [None]:
X_new=data[selected_columns]

In [None]:
X_new.head()

In [None]:
len(X_new)

In [None]:
X_new.shape