<a href="https://colab.research.google.com/github/alonziv1/Machine-Learning/blob/main/prepare_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import**

In [None]:
import pandas as pd
import numpy as np
import math
from numpy import nan
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

#**Main**

In [None]:
def prepare_data(data, training_data):
    """Run the full preparation pipeline on `data`, using `training_data` as reference.

    Both inputs are copied first, so the frames passed by the caller are not
    modified. Imputation values and the scaler are learned from the training
    copy and applied to the `data` copy.

    Args:
        data: DataFrame to prepare.
        training_data: DataFrame the imputers and the scaler are fitted on.

    Returns:
        The prepared version of `data`, with its columns in sorted order.
    """
    # Work on private copies with a fresh index.
    frame = select_features(data.copy().reset_index())
    train_frame = select_features(training_data.copy().reset_index())

    frame = transform_features(frame)
    train_frame = transform_features(train_frame)

    # Each imputer is fitted on the training frame and applied to both frames.
    imputation_steps = (
        mean_imputate_features,
        median_imputate_features,
        most_freq_imputate_features,
    )
    for impute in imputation_steps:
        frame, train_frame = impute(frame, train_frame)

    train_frame = select_features_after(train_frame)
    frame = select_features_after(frame)

    frame = normalize_features(frame, train_frame)

    return frame[sorted(frame.columns)]


#**select_features**

According to our analysis, we chose a subset of the features.

In [None]:
def select_features(data):
  """Return a new frame holding only the columns the pipeline works with.

  The columns come back in the order listed below; a column missing from
  `data` raises a KeyError.
  """
  kept_columns = [
      'PCR_01', 'PCR_07', 'PCR_04', 'PCR_08', 'PCR_10', 'PCR_05',
      'sport_activity', 'sugar_levels',
      'symptoms', 'blood_type', 'sex',
      'covid', 'spread', 'risk',
  ]
  return data.loc[:, kept_columns]

In [None]:
def select_features_after(data):
  """Return a copy of `data` without the columns discarded after transformation.

  Every listed column must be present; a missing one raises a KeyError.
  """
  discarded = ['low_appetite', 'sex', 'A-', 'AB+', 'B+', 'B-', 'O+', 'O-']
  return data.drop(labels=discarded, axis=1)

#**transform_features**

In [None]:
def transform_features(data):
  """Turn the categorical/text columns of `data` into numeric features.

  Steps, in order: map string/boolean values to 0/1, one-hot encode
  'blood_type', expand 'symptoms' into one indicator column per symptom,
  then map to 0/1 again so the newly created columns are covered too.

  Note: the first mapping step modifies `data` in place.

  Returns:
      A new DataFrame with the transformed features.
  """
  string_to_numeric(data)

  encoded = one_hot_encoding(data)
  expanded = add_symptoms_features(encoded, get_symptoms(encoded))

  # Second pass: columns added above (e.g. the dummies) may still hold
  # boolean values.
  string_to_numeric(expanded)
  return expanded

##**string_to_numeric**

In [None]:
def string_to_numeric(data):
  """Replace known categorical values with 0/1, modifying `data` in place.

  Mappings are applied one after another, across every column:
  "High"/"Low" -> 1/0, "F"/"M" -> 1/0, True/False -> 1/0.
  Returns None.
  """
  value_maps = (
      {"High": 1, "Low": 0},
      {"F": 1, "M": 0},
      {True: 1, False: 0},
  )
  for value_map in value_maps:
    data.replace(value_map, inplace=True)
  

##**one_hot_encoding**

In [None]:
def one_hot_encoding(data):
  """Replace the 'blood_type' column with one indicator column per blood type.

  The indicator columns are appended after the existing columns and are named
  after the distinct values found in 'blood_type'. `data` itself is left
  untouched; a new DataFrame is returned.
  """
  indicators = pd.get_dummies(data['blood_type'])
  return data.join(indicators).drop(columns=['blood_type'])

##**get_symptoms**

In [None]:
def get_symptoms(data):
  """Collect the distinct symptom names appearing in the 'symptoms' column.

  Each string entry is treated as a ';'-separated list of symptoms;
  non-string entries (such as NaN) are skipped.

  Returns:
      The distinct symptom names, in order of first appearance.
  """
  symptom_lists = (
      entry.split(";")
      for entry in data['symptoms'].unique()
      if type(entry) is str
  )
  all_symptoms = list(itertools.chain.from_iterable(symptom_lists))
  return pd.Series(all_symptoms).unique()

##**add_symptoms_features**

In [None]:
def add_symptoms_features(data, unique_symptoms):
  """Expand the 'symptoms' column into one 0/1 indicator column per symptom.

  Args:
      data: DataFrame with a 'symptoms' column whose string entries are
          ';'-separated symptom names. Non-string entries (e.g. NaN) are
          treated as "no symptoms".
      unique_symptoms: iterable of symptom names; one float column is added
          for each, in this order, after the existing columns.

  Returns:
      A new DataFrame without the 'symptoms' column and with the indicator
      columns appended. `data` itself is not modified.
  """
  # Split every row once. Matching is done on whole ';'-separated tokens, so
  # a symptom never matches merely because it is a substring of another one.
  tokens_per_row = [
      set(entry.split(";")) if isinstance(entry, str) else ()
      for entry in data['symptoms']
  ]

  # Build the indicators on data's own index so the join below lines up
  # row-for-row whatever index the caller uses. Building whole columns also
  # avoids chained assignment (`df[col][row] = 1`), which does not write
  # through when pandas Copy-on-Write is enabled.
  indicators = pd.DataFrame(
      {
          symptom: [float(symptom in tokens) for tokens in tokens_per_row]
          for symptom in unique_symptoms
      },
      index=data.index,
  )

  return data.join(indicators).drop(columns=['symptoms'])

##**imputate_features**

In [None]:
def mean_imputate_features(data, training_data):
    """Fill missing values in the listed columns with the training-set mean.

    The imputer is fitted on `training_data` only and then applied to both
    frames. Both frames are modified in place and also returned.

    NOTE(review): 'sport_activity' is listed here and is also the only column
    handled by median_imputate_features; when this function runs first, no
    missing values remain for the median step. Confirm which strategy is
    intended for that column.

    Returns:
        The tuple (data, training_data).
    """
    columns = [
        'PCR_01', 'PCR_07', 'PCR_04', 'PCR_05', 'PCR_08', 'PCR_10',
        'sugar_levels', 'sport_activity',
        'shortness_of_breath', 'sore_throat',
    ]
    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    mean_imputer.fit(training_data[columns])

    training_data[columns] = mean_imputer.transform(training_data[columns])
    data[columns] = mean_imputer.transform(data[columns])

    return data, training_data

In [None]:
def median_imputate_features(data, training_data):
    """Fill missing 'sport_activity' values with the training-set median.

    The imputer is fitted on `training_data` only and then applied to both
    frames. Both frames are modified in place and also returned.

    NOTE(review): mean_imputate_features also lists 'sport_activity'; if it
    runs before this function, there is nothing left to fill here. Confirm
    which strategy is intended for that column.

    Returns:
        The tuple (data, training_data).
    """
    columns = ['sport_activity']
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    median_imputer.fit(training_data[columns])

    training_data[columns] = median_imputer.transform(training_data[columns])
    data[columns] = median_imputer.transform(data[columns])

    return data, training_data

In [None]:
def most_freq_imputate_features(data, training_data):
    """Fill missing 'sex' and 'A+' values with the training-set mode.

    The imputer is fitted on `training_data` only and then applied to both
    frames. Both frames are modified in place and also returned.

    Returns:
        The tuple (data, training_data).
    """
    columns = ['sex', 'A+']
    mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    mode_imputer.fit(training_data[columns])

    training_data[columns] = mode_imputer.transform(training_data[columns])
    data[columns] = mode_imputer.transform(data[columns])

    return data, training_data

##**normalize_features**

We use min-max scaling, since it performed better than standard scaling for all features.

In [None]:
def normalize_features(data, training_data):
  """Min-max scale every column of `data` using ranges learned from `training_data`.

  Args:
      data: DataFrame to scale. It is modified in place and also returned.
      training_data: DataFrame the per-column minimum and maximum are taken
          from. It must contain every column of `data`; a missing column
          raises a KeyError. Columns not present in `data` are ignored.

  Returns:
      `data`, with every column replaced by its scaled float values.
  """
  from sklearn import preprocessing

  # Fit on the training columns in data's own order: the scaler pairs columns
  # with its fitted statistics by position/name, and the two frames are not
  # guaranteed to list their columns in the same order.
  scaler = preprocessing.MinMaxScaler().fit(training_data[data.columns])

  scaled_data = scaler.transform(data)

  # Replace each column outright rather than writing through
  # `data.loc[:, :]`, which tries to keep the existing int/bool dtypes and
  # so cannot hold the scaled float values cleanly.
  for position, column in enumerate(data.columns):
    data[column] = scaled_data[:, position]

  return data
