# Data Preparation Notebook

***

## Dependencies

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

from pandas import DataFrame
from pandas import Series
from numpy import ndarray

## Feature Selection

### Stoppiglia's Method

In [57]:
def cos_similarity(
    feature: Series,
    target: Series) -> ndarray:
    """docstring"""
    
    # Preparing inputs for calculation
    feature_vec = np.array(feature)
    target_vec = np.array(target)
    
    # Cossine_similarity using two vectors
    vecs_sqr_inner_prod = np.dot(
        feature_vec.T, target_vec) ** 2
    
    feature_mod = np.dot(feature_vec.T, feature_vec)
    target_mod = np.dot(target_vec.T, target_vec)
    
    mod_prod = feature_mod * target_mod
    similatiry = vecs_sqr_inner_prod / mod_prod
    
    return similatiry

In [58]:
def complement_projection(
    space_vec: ndarray, to_project: ndarray) -> ndarray:
    """docstring"""

    complement_vec = np.dot(space_vec, to_project) * space_vec
    projection = to_project - complement_vec
    return projection

In [94]:
def feature_selection(
    data: DataFrame, target: Series) -> DataFrame:
    """docstring"""
    
    working_data = data.copy()
    
    ord_features = []
    features = list(data.columns)
    
    while features:
        similarity = {
            col: cos_similarity(working_data[col], target)
            for col in features
        }
        
        sim_data = DataFrame.from_dict(similarity, orient='index', columns=['sim'])
        relevant = sim_data.sort_values('sim').iloc[-1].name
        ord_features.append(relevant)
        features.remove(relevant)
        
        for feature in features:
            working_data[feature] = complement_projection(
                working_data[relevant].values ,working_data[feature].values)
        
        target = Series(
            complement_projection(
                working_data[relevant].values, target.values)
        )
    
    return ord_features

In [101]:
raw_data = pd.read_csv('data/life-expectancy-data.csv')
data = raw_data.dropna()
print(f'dataset size: {len(clean_data.index)}')

target = clean_data['Life expectancy ']

# Removing non-feature columns
df = clean_data.drop(
    columns=['Country', 'Status', 'Life expectancy '])

dataset size: 1649


In [102]:
len(raw_data.index)

2938

In [96]:
feat_seq = feature_selection(df, target)
feat_seq

  vecs_sqr_inner_prod = np.dot(
  mod_prod = feature_mod * target_mod
  similatiry = vecs_sqr_inner_prod / mod_prod
  projection = to_project - complement_vec


['Year',
 'infant deaths',
 'Measles ',
 'Schooling',
 'Income composition of resources',
 ' thinness 5-9 years',
 ' thinness  1-19 years',
 'Population',
 'GDP',
 ' HIV/AIDS',
 'Diphtheria ',
 'Total expenditure',
 'Polio',
 'under-five deaths ',
 ' BMI ',
 'Hepatitis B',
 'percentage expenditure',
 'Alcohol',
 'Adult Mortality']

In [98]:
ord_data = df.copy()[feat_seq]
ord_data['Life expectancy '] = target
ord_data.to_csv("data/curated_data.csv", index=False)

In [99]:
n = pd.read_csv("data/curated_data.csv")
n.head(5)

Unnamed: 0,Year,infant deaths,Measles,Schooling,Income composition of resources,thinness 5-9 years,thinness 1-19 years,Population,GDP,HIV/AIDS,Diphtheria,Total expenditure,Polio,under-five deaths,BMI,Hepatitis B,percentage expenditure,Alcohol,Adult Mortality,Life expectancy
0,2015,62,1154,10.1,0.479,17.3,17.2,33736494.0,584.25921,0.1,65.0,8.16,6.0,83,19.1,65.0,71.279624,0.01,263.0,65.0
1,2014,64,492,10.0,0.476,17.5,17.5,327582.0,612.696514,0.1,62.0,8.18,58.0,86,18.6,62.0,73.523582,0.01,271.0,59.9
2,2013,66,430,9.9,0.47,17.7,17.7,31731688.0,631.744976,0.1,64.0,8.13,62.0,89,18.1,64.0,73.219243,0.01,268.0,59.9
3,2012,69,2787,9.8,0.463,18.0,17.9,3696958.0,669.959,0.1,67.0,8.52,67.0,93,17.6,67.0,78.184215,0.01,272.0,59.5
4,2011,71,3013,9.5,0.454,18.2,18.2,2978599.0,63.537231,0.1,68.0,7.87,68.0,97,17.2,68.0,7.097109,0.01,275.0,59.2
