In [36]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import os.path


## File modification
# This aims to standardize the two files so they can be preprocessed in the same way.

def hf_add_definitions(input_file_name, output_file_name, definition) :
    input_file = open(input_file_name,"r")
    text = definition + "\n" + input_file.read()
    input_file.close()
    
    output_file = open(output_file_name, "w")
    output_file.write(text)
    output_file.close()
    
    
def hf_remove_characters(text, characters) :
    
    for character in characters : 
        text = text.replace(character,'')
    
    return text

def hf_remove_id(input_file_name) :
    input_file = open(input_file_name,"r")
    text = ""
    
    for line in input_file : 
        text += line[line.index(',') + 1:]
        
    input_file.close()
    
    return text

def hf_prepare_kidney_file(input_file_name, output_file_name) :
    text = hf_remove_id(input_file_name) #This removes the id column
    text = hf_remove_characters(text, ["\t", "?", ' ']) #This removes tabs, question marks, and spaces from the file
    
    output_file = open(output_file_name, "w")
    output_file.write(text)
    output_file.close()

unprepared_banknote_file_name = "data_banknote_authentication.txt"
banknote_file_name = "data_banknote_authentication_with_def.csv"
if not(os.path.exists(banknote_file_name)) :
    hf_add_definitions(unprepared_banknote_file_name, banknote_file_name, "variance,skewness,curtosis,entropy,class") #This adds a definition to the file

unprepared_kidney_file_name = "archive/kidney_disease.csv"
kidney_file_name = "archive/kidney_disease_cleaned.csv"
if not(os.path.exists(kidney_file_name)) :
    hf_prepare_kidney_file(unprepared_kidney_file_name, kidney_file_name) #This removes tabs, question marks, and spaces from the file
    

## Import the files into pandas dataframes

def import_file(file_name, separator) :    
    pd_data = pd.read_csv(file_name, sep = separator)
    return pd_data
    
banknote_pd_data = import_file(banknote_file_name, ",")    
kidney_pd_data = import_file(kidney_file_name, ",")


## Clean dataframes
# Add missing values
# Center and reduce columns

def hf_get_mean_value(column) :
    #Help function that returns the mean value of the column.
    #For non numeric data types, returns the most frequent value.
    if is_numeric_dtype(column) :
        return column.mean()
    else :
        values = pd.value_counts(column)
        return values.idxmax()
        
def drop_column(pd_data):  #Alina CIOCARLAN
    c = pd_data.columns
    empty = (pd_data.isnull().sum() / len(pd_data)) * 100

    for i in range(len(empty)):
        if empty[i] > 30: #drop column if percentage of missing values is above 30%
            c_to_drop = c[i]
            pd_data = pd_data.drop(c_to_drop, 1)
    return pd_data

        
def clean_dataframe(pd_data) :
    pd_data = drop_column(pd_data)
    
    column_names = pd_data.columns
    number_of_columns = column_names.size
    means = []
    
    for col_name in column_names :
        means.append(hf_get_mean_value(pd_data[col_name]))
       
    # We have the means of each column of the dataset
    # Now we find the cells that are not filled, and replace them with the mean value of the column.
    null_data = np.where(pd.isnull(pd_data))
    for i in range(len(null_data[0])) :
        row = null_data[0][i]
        col_id = null_data[1][i]
        col = column_names[col_id]
        pd_data.at[row,col] = means[col_id]
    
    # Our cells are all filled now.
    # We can center and reduce the values of the numeric columns
    L=list(pd_data.columns)[:-1] #function applied on all numerical values except the last class
    pd_data[L] = pd_data[L].apply(lambda x : (x - x.mean()) / np.sqrt(x.var() + 10**-9) if is_numeric_dtype(x) else x) #The value 10**-9 is a safety to ensure we don't divide by 0.
    
    return pd_data

kidney_pd_data = clean_dataframe(kidney_pd_data)
banknote_pd_data = clean_dataframe(banknote_pd_data)



In [37]:
print(kidney_pd_data.head())
print(kidney_pd_data['classification'])

        age        bp        sg        al        su        pc         pcc  \
0 -0.205207  0.262010  0.482742 -0.013321 -0.437249    normal  notpresent   
1 -2.620528 -1.964120  0.482742  2.344580 -0.437249    normal  notpresent   
2  0.619537  0.262010 -1.379639  0.772646  2.476824    normal  notpresent   
3 -0.205207 -0.480033 -2.310829  2.344580 -0.437249  abnormal     present   
4 -0.028476  0.262010 -1.379639  0.772646 -0.437249    normal  notpresent   

           ba       bgr        bu  ...      hemo       pcv        wc  htn  \
0  notpresent -0.361535 -0.434723  ...  1.057946  0.627586 -0.240218  yes   
1  notpresent  0.000000 -0.799939  ... -0.451531 -0.108513 -0.953592   no   
2  notpresent  3.676836 -0.089797  ... -1.077412 -0.967295 -0.359114   no   
3  notpresent -0.415023 -0.028928  ... -0.488348 -0.844612 -0.676169  yes   
4  notpresent -0.562116 -0.637621  ... -0.341082 -0.476562 -0.438377   no   

    dm  cad appet   pe  ane classification  
0  yes   no  good   no   no  

Il y a beaucoup trop de colonnes donc il va falloir faire une PCA. Et il faut aussi transformer tous les features categorical en discrete.

In [38]:
print(banknote_pd_data.head())

   variance  skewness  curtosis   entropy  class
0  1.121397  1.149036 -0.975614  0.354432      0
1  1.446538  1.064065 -0.894710 -0.128721      0
2  1.207369 -0.777069  0.122174  0.617848      0
3  1.063355  1.295005 -1.254940 -1.143612      0
4 -0.036758 -1.086642  0.736462  0.096552      0


Convert categorical data to discrete data

In [54]:
from sklearn.preprocessing import LabelEncoder


def label_encoding(pd_df): #Alina CIOCARLAN
    
    c=pd_df.columns
    types=list(pd_df.dtypes)
    for i in range(len(c)):
        if types[i] == 'O': #there are only 2 types here, if it's of type 'O', we have to transform it into numerical
            lab_encod = LabelEncoder()
            pd_df[c[i]] = lab_encod.fit_transform(pd_df[c[i]])
         
label_encoding(kidney_pd_data)

In [56]:
print(kidney_pd_data.head())
print(kidney_pd_data['classification'])

        age        bp        sg        al        su  pc  pcc  ba       bgr  \
0 -0.205207  0.262010  0.482742 -0.013321 -0.437249   1    0   0 -0.361535   
1 -2.620528 -1.964120  0.482742  2.344580 -0.437249   1    0   0  0.000000   
2  0.619537  0.262010 -1.379639  0.772646  2.476824   1    0   0  3.676836   
3 -0.205207 -0.480033 -2.310829  2.344580 -0.437249   0    1   0 -0.415023   
4 -0.028476  0.262010 -1.379639  0.772646 -0.437249   1    0   0 -0.562116   

         bu  ...      hemo       pcv        wc  htn  dm  cad  appet  pe  ane  \
0 -0.434723  ...  1.057946  0.627586 -0.240218    1   1    0      0   0    0   
1 -0.799939  ... -0.451531 -0.108513 -0.953592    0   0    0      0   0    0   
2 -0.089797  ... -1.077412 -0.967295 -0.359114    0   1    0      1   0    1   
3 -0.028928  ... -0.488348 -0.844612 -0.676169    1   0    0      1   1    1   
4 -0.637621  ... -0.341082 -0.476562 -0.438377    0   0    0      0   0    0   

   classification  
0               0  
1         

In [93]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def PCA_on_df(df,seuil): #Alina CIOCARLAN
    pca=PCA()
    pca.fit(df)
    df=pca.transform(df)
    cumulative_pca=pca.explained_variance_ratio_.cumsum()
    n=0
    for e in cumulative_pca:
        if e<seuil:
            n+=1
    df = df[:, :n]
    return pd.DataFrame(df)

In [94]:
L=list(kidney_pd_data.columns)[:-1]
df_proj=PCA_on_df(kidney_pd_data[L],0.9)
print(df_proj.head())

          0         1         2         3         4         5         6  \
0 -1.118114  0.018732  0.210450  0.197741  0.283720 -0.121380  0.154483   
1 -0.667419  0.637700 -1.118999 -0.630814 -2.931895 -1.916005 -0.923634   
2  3.027171 -3.132784  1.028278  0.792919 -0.522101 -1.306855 -0.842645   
3  2.629734  1.229987 -1.975479  1.207339 -1.001311 -1.016230 -1.266714   
4  0.311846  0.112979 -0.897809 -0.487577  0.198846 -0.651995 -0.498655   

          7         8         9  
0 -0.029340  0.692544  0.214566  
1 -0.827027  0.950454  0.624493  
2 -0.119248 -0.908876  0.017875  
3  1.286312  1.808054  0.442168  
4  1.312279  0.514110  0.087847  


In [97]:
L=list(banknote_pd_data.columns)[:-1]
df_proj_bis=PCA_on_df(banknote_pd_data[L],0.97)
print(df_proj_bis.head())

['variance', 'skewness', 'curtosis', 'entropy']
          0         1         2
0 -1.472268 -1.163492 -0.167193
1 -1.638843 -1.093413  0.371052
2  0.513732 -1.324397  0.508926
3 -2.310110 -0.202441  0.540584
4  1.192981  0.024645  0.421036


In [106]:
from sklearn.model_selection import train_test_split

# split dataset kidney disease
dataX=df_proj
dataY=kidney_pd_data['classification']
train_ratio = 0.75

# we split the dataset in 2 : one part for training/validation set (we'll do K-fold validation right after),
# the other for test set
def data_split(dataX,dataY,train_ratio,validation_ratio,test_ratio): #Alina CIOCARLAN
    
    x_train, x_test, y_train, y_test = train_test_split(dataX,dataY , test_size=1 - train_ratio)
 
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test=data_split(dataX,dataY,train_ratio)

kf = KFold(n_splits=10) #10 is a common number on large datasets (validate on 10% on dataset each time)
# we now have the index of the train/validation test (through kf.split(X))
# we just have to train our model on those splits

(40, 10)
