In [81]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import os 



%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
# Load the raw CSV; header=0 makes the first row of the file the column names.
data_original = pd.read_csv('ElectionsData.csv', header=0)
# Show the first rows as this cell's output.
data_original.head()

Unnamed: 0,Vote,Occupation_Satisfaction,Avg_monthly_expense_when_under_age_21,Avg_lottary_expanses,Age_group,Avg_monthly_expense_on_pets_or_plants,Looking_at_poles_results,Avg_environmental_importance,Married,Gender,...,Last_school_grades,Most_Important_Issue,Number_of_differnt_parties_voted_for,Political_interest_Total_Score,Number_of_valued_Kneset_members,Main_transportation,Occupation,Financial_agenda_matters,Num_of_kids_born_last_10_years,Overall_happiness_score
0,Violets,1.0,113.445801,1688.51019,Below_30,1477.809692,Yes,202.042787,Yes,Male,...,60.0,Financial,12.0,683.76044,1.0,Public_or_other,Industry_or_other,No,1.0,392.803101
1,Browns,2.0,161.720841,696.069582,Below_30,2969.865888,Yes,88.9423,Yes,Male,...,60.0,Financial,9.0,793.017823,2.0,Car,Industry_or_other,No,0.0,41.028616
2,Greens,6.0,524.947117,50048.85388,30-45,30803.81297,No,789.796962,Yes,Female,...,80.0,Healthcare,10.0,253.102383,1.0,Motorcycle_or_truck,Services_or_Retail,No,0.0,635.000942
3,Whites,1.0,521.454363,50265.18536,Below_30,30396.55725,No,565.727946,Yes,Male,...,,Social,8.0,278.904297,1.0,Car,Services_or_Retail,No,0.0,502.278182
4,Browns,6.0,210.879361,7793.195795,30-45,5021.415879,No,423.589896,Yes,Female,...,70.0,Social,11.0,605.564038,4.0,Car,Public_Sector,No,2.0,40.469037


In [83]:
# Work on a copy so that later edits to `data` do not touch `data_original`.
data = data_original.copy()

# Converting categorical columns to int (section 2)
def convert_to_categorial(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with every object-dtype column integer-encoded.

    Each object column is converted to a pandas categorical and replaced by
    its category codes (0 .. n_categories - 1) stored as nullable ``Int64``.
    Missing values stay missing (``<NA>``). Columns of any other dtype are
    copied through unchanged, and column order is preserved.

    The input frame is never modified.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``data``.
    """
    # Copy up front so the caller's frame is left untouched.
    converted = data.copy()
    object_cols = converted.columns[converted.dtypes == 'object']
    for col in object_cols:
        # `cat.codes` numbers categories by position and uses -1 for missing.
        codes = converted[col].astype('category').cat.codes
        # Turn the -1 sentinel back into a proper missing value.
        converted[col] = codes.astype('Int64').mask(codes == -1)
    return converted

# Rebind `data` to the integer-encoded frame, then list the column dtypes
# as this cell's output.
data = convert_to_categorial(data)
data.dtypes

Vote                                               Int64
Occupation_Satisfaction                          float64
Avg_monthly_expense_when_under_age_21            float64
Avg_lottary_expanses                             float64
Age_group                                          Int64
Avg_monthly_expense_on_pets_or_plants            float64
Looking_at_poles_results                           Int64
Avg_environmental_importance                     float64
Married                                            Int64
Gender                                             Int64
Voting_Time                                        Int64
Financial_balance_score_(0-1)                    float64
%Of_Household_Income                             float64
Yearly_IncomeK                                   float64
Avg_size_per_room                                float64
Garden_sqr_meter_per_person_in_residancy_area    float64
Avg_Residancy_Altitude                           float64
Yearly_ExpensesK               

In [84]:
# Cleansing

# removing garbage values

def remove_negative(data: pd.DataFrame) -> pd.DataFrame:
    """Blank out negative entries in the float64 columns of ``data``.

    Every value below zero in a float64 column is replaced by NaN; columns of
    other dtypes are not examined. The frame is modified in place and the
    same object is returned.
    """
    float_cols = data.columns[data.dtypes == 'float64']
    values = data[float_cols]
    # Keep everything that is not strictly negative (NaN stays NaN).
    data[float_cols] = values.where(~(values < 0))
    return data

# Outlier removing

def remove_outlier(data: pd.DataFrame, z_threshold) -> pd.DataFrame:
    """Blank out float64 entries whose |z-score| exceeds ``z_threshold``.

    The z-score of each value is computed against the mean and standard
    deviation of its own column. Values whose absolute z-score is greater
    than ``z_threshold`` are replaced by NaN. Only float64 columns are
    examined. The frame is modified in place and the same object is returned.
    """
    float_cols = data.columns[data.dtypes == 'float64']
    values = data[float_cols]
    centered = values.sub(values.mean(), axis='columns')
    z_magnitude = centered.div(values.std(), axis='columns').abs()
    # A NaN z-score (missing value or zero spread) never exceeds the
    # threshold, so such entries are kept as they are.
    data[float_cols] = values.where(~(z_magnitude > z_threshold))
    return data

# Blank out negative float values first, then float values whose z-score
# magnitude is above 3; the final bare `data` displays the resulting frame.
data = remove_negative(data)
data = remove_outlier(data, 3)
data

Unnamed: 0,Vote,Occupation_Satisfaction,Avg_monthly_expense_when_under_age_21,Avg_lottary_expanses,Age_group,Avg_monthly_expense_on_pets_or_plants,Looking_at_poles_results,Avg_environmental_importance,Married,Gender,...,Last_school_grades,Most_Important_Issue,Number_of_differnt_parties_voted_for,Political_interest_Total_Score,Number_of_valued_Kneset_members,Main_transportation,Occupation,Financial_agenda_matters,Num_of_kids_born_last_10_years,Overall_happiness_score
0,10,1.0,113.445801,1688.510190,2,1477.809692,1,202.042787,1,1,...,60.0,2,12.0,683.760440,1.0,3,1,0,1.0,392.803101
1,1,2.0,161.720841,696.069582,2,2969.865888,1,88.942300,1,1,...,60.0,2,9.0,793.017823,2.0,0,1,0,0.0,41.028616
2,2,6.0,524.947117,50048.853880,0,30803.812970,0,789.796962,1,0,...,80.0,4,10.0,253.102383,1.0,2,3,0,0.0,635.000942
3,11,1.0,521.454363,50265.185360,2,30396.557250,0,565.727946,1,1,...,,7,8.0,278.904297,1.0,0,3,0,0.0,502.278182
4,1,6.0,210.879361,7793.195795,0,5021.415879,0,423.589896,1,0,...,70.0,7,11.0,605.564038,4.0,0,2,0,2.0,40.469037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7,9.0,124.023053,26638.227290,0,1760.420762,0,1060.517225,0,0,...,80.0,4,10.0,321.858827,1.0,2,2,1,3.0,646.016048
9996,0,1.0,1146.385393,245768.367300,0,146414.291700,0,826.909059,0,0,...,100.0,0,16.0,409.212213,1.0,3,0,0,1.0,198.653823
9997,9,7.0,173.952313,20598.709440,1,3430.140470,0,690.466486,1,0,...,90.0,1,12.0,152.322399,3.0,1,1,1,0.0,494.710087
9998,5,10.0,307.277614,41875.937280,1,10603.484990,0,547.345514,1,0,...,30.0,5,5.0,286.161536,2.0,2,1,1,0.0,237.762825


In [86]:

#Imputation

# filling using median for numeric values and most common for nominal values
def imputation(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values: column median for float64, column mode for Int64.

    Each statistic is computed per column over that column's observed
    values, so a row with a gap in one column still contributes its values
    in the other columns. A column with no observed values at all is left
    unfilled. Columns of other dtypes are not touched.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose float64 and Int64 columns should be imputed.

    Returns
    -------
    pd.DataFrame
        ``data`` itself, after filling.
    """
    # Median - numeric. `median` already skips NaN column by column, so no
    # row-wise dropna is applied first.
    numeric_feat = data.columns[data.dtypes == 'float64']
    if len(numeric_feat):
        medians = data[numeric_feat].median().dropna()
        data[numeric_feat] = data[numeric_feat].fillna(medians)

    # Most common - nominal. `mode` also ignores missing values by default;
    # its first row holds one most-frequent value per column.
    nominal_feat = data.columns[data.dtypes == 'Int64']
    if len(nominal_feat):
        modes = data[nominal_feat].mode()
        if not modes.empty:  # empty when every nominal column is all-missing
            data[nominal_feat] = data[nominal_feat].fillna(modes.iloc[0].dropna())
    return data

# Print the total number of missing cells before imputation.
print(data.isnull().values.sum())
data = imputation(data)
# The same count after imputation is shown as the cell result.
data.isnull().values.sum()

7728


  # Remove the CWD from sys.path while we load stuff.


0