In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
import os 
import sklearn
from data_preperation import *


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Load the raw data set from ElectionsData.csv (path relative to the working directory);
# header=0 takes the column names from the first row of the file.
data_original = pd.read_csv('ElectionsData.csv', header=0)
# Show the first rows as a quick sanity check of the loaded frame.
data_original.head()

In [None]:
# Column groups used by the preprocessing steps of this notebook.
# NOTE(review): the spellings below (e.g. 'differnt', 'lottary', 'Kneset') presumably mirror
# the CSV headers exactly, so they must not be "corrected" here - confirm against the data file.

# Numeric columns treated as discrete.
discrete_features = [
    'Occupation_Satisfaction',
    'Last_school_grades',
    'Number_of_differnt_parties_voted_for',
    'Number_of_valued_Kneset_members',
    'Num_of_kids_born_last_10_years',
]

# Numeric columns treated as continuous.
continuous_features = [
    'Avg_monthly_expense_when_under_age_21',
    'Avg_lottary_expanses',
    'Avg_monthly_expense_on_pets_or_plants',
    'Avg_environmental_importance',
    'Financial_balance_score_(0-1)',
    '%Of_Household_Income',
    'Yearly_IncomeK',
    'Avg_size_per_room',
    'Garden_sqr_meter_per_person_in_residancy_area',
    'Avg_Residancy_Altitude',
    'Yearly_ExpensesK',
    '%Time_invested_in_work',
    'Avg_education_importance',
    'Avg_Satisfaction_with_previous_vote',
    'Avg_monthly_household_cost',
    'Phone_minutes_10_years',
    'Avg_government_satisfaction',
    'Weighted_education_rank',
    '%_satisfaction_financial_policy',
    'Avg_monthly_income_all_years',
    'Political_interest_Total_Score',
    'Overall_happiness_score',
]

# Nominal columns that are not one-hot encoded.
nominal_features = [
    'Vote',
    'Will_vote_only_large_party',
    'Age_group',
    'Voting_Time',
]

# Nominal columns that are handed to the one-hot conversion step.
onehot_nominal_features = [
    'Most_Important_Issue',
    'Main_transportation',
    'Occupation',
]

# Two-valued columns.
binary_features = [
    'Gender',
    'Looking_at_poles_results',
    'Married',
    'Financial_agenda_matters',
]

# Convenience unions of the groups above (order: as listed in the unpacking).
numerical_features = [*discrete_features, *continuous_features]
total_nominal_features = [*nominal_features, *onehot_nominal_features, *binary_features]

In [None]:
# Numeric columns grouped by the distribution family assigned to them in this notebook.

# Columns labelled as uniformly distributed.
uniform_features = [
    'Occupation_Satisfaction',
    'Financial_balance_score_(0-1)',
    '%Of_Household_Income',
    'Yearly_IncomeK',
    'Avg_government_satisfaction',
    '%_satisfaction_financial_policy',
    'Garden_sqr_meter_per_person_in_residancy_area',
    'Yearly_ExpensesK',
    '%Time_invested_in_work',
]

# Columns labelled as normally distributed.
normal_features = [
    'Number_of_differnt_parties_voted_for',
    'Number_of_valued_Kneset_members',
    'Avg_environmental_importance',
    'Avg_education_importance',
    'Avg_monthly_household_cost',
    'Weighted_education_rank',
    'Overall_happiness_score',
    'Avg_size_per_room',
    'Avg_Residancy_Altitude',
]

# Columns with no distribution family assigned.
unknown_features = [
    'Last_school_grades',
    'Num_of_kids_born_last_10_years',
    'Avg_monthly_expense_when_under_age_21',
    'Avg_lottary_expanses',
    'Avg_monthly_expense_on_pets_or_plants',
    'Avg_Satisfaction_with_previous_vote',
    'Phone_minutes_10_years',
    'Avg_monthly_income_all_years',
    'Political_interest_Total_Score',
]

# Columns known to contain negative values.
# NOTE(review): the integers are presumably the number of negative entries seen per column - TODO confirm.
features_with_negative = {
    'Avg_monthly_expense_when_under_age_21': 147,
    'Avg_lottary_expanses': 152,
    'Avg_monthly_income_all_years': 152,
}

In [None]:
# Work on a copy so that the steps below operate on `data` and leave the freshly
# loaded `data_original` available for comparison.
data = data_original.copy()



In [None]:
# Hand the columns listed in `onehot_nominal_features` to the project helper
# `convert_to_onehot` and keep its result as the new working frame.
# NOTE(review): presumably replaces each listed column with indicator columns - confirm in data_preperation.
data = convert_to_onehot(data, onehot_nominal_features)
# Display the resulting frame.
data

In [None]:
# Run the project helper `convert_to_categorical` over the whole frame and keep its result.
# NOTE(review): which columns it touches and the encoding it produces are defined in
# data_preperation, not visible here - confirm before relying on the dtypes.
data = convert_to_categorical(data)
# Display the resulting frame.
data

In [None]:
# The negative values look similar to the distribution of the rest of the data, so they
# are flipped to positive (absolute value) here instead of being removed.
for feature in features_with_negative:
    # Histogram of the column as loaded, negatives included.
    plt.figure()
    data[feature].hist()
    plt.title(feature)

    # Assign through a single .loc call so the write lands in `data` itself.
    # The previous chained form (data.loc[:, feature][mask] = ...) assigned into a
    # temporary object: it triggers SettingWithCopyWarning and, with pandas
    # Copy-on-Write enabled, leaves `data` unchanged.
    is_negative = data[feature] < 0
    data.loc[is_negative, feature] = data.loc[is_negative, feature].abs()

    # Histogram of the same column after the flip.
    plt.figure()
    data[feature].hist()
    plt.title('After negating the negative values')
# Notice - this does not change the corr between the features

In [None]:
# Split `data` into train / validation / test feature-label pairs using the project
# helper `split_data`, requesting test_size=0.15 and val_size=0.15.
# NOTE(review): presumably both sizes are fractions of the full data set (leaving ~70% for
# training) - confirm in data_preperation.
X_train, y_train, X_val, y_val, X_test, y_test = split_data(data, test_size=0.15, val_size=0.15)

In [None]:
def _count_negative(frame):
    """Return the number of cells in ``frame`` that are strictly below zero.

    The whole frame is inspected. NaN cells compare as False, so they are simply
    not counted; rows that contain a missing value elsewhere still contribute
    their negative cells (the earlier ``dropna()``-based count discarded such
    rows entirely and therefore under-reported).
    """
    # Older NumPy versions warn ("invalid value encountered") when comparing NaN.
    with np.errstate(invalid='ignore'):
        return int((frame.values < 0).sum())


# Re-attach the label to the training features so rows stay aligned while cleaning.
XY_train = insert_label_to_data(X_train, y_train)

# Report the negative-value count on either side of the project helper `remove_negative`.
print(f'Number of negative values: {_count_negative(XY_train)}')
XY_train = remove_negative(XY_train)  # TODO check if removing is good or maybe abs
print(f'Number of negative values: {_count_negative(XY_train)}')

# Report the NaN count on either side of outlier removal.
# NOTE(review): the second argument (3) is presumably a z-score / std-dev threshold and
# removed outliers presumably become NaN - confirm in data_preperation.
print(f'Number of nan before: {XY_train.isnull().values.sum()}')
outlier = Outlier(XY_train)
XY_train = outlier.remove_outlier(XY_train, 3)
print(f'Number of nan after: {XY_train.isnull().values.sum()}')


In [None]:

# Build the project's Imputation helper from the training frame and let it fill the
# training set, printing the missing-cell count on either side of the step.
# NOTE(review): presumably the helper learns its fill values from XY_train - confirm in data_preperation.
print(f'Number of nan before: {XY_train.isna().to_numpy().sum()}')
imputation = Imputation(XY_train)
XY_train = imputation.impute_train(XY_train)
print(f'Number of nan after: {XY_train.isna().to_numpy().sum()}')


In [None]:
# Fill the held-out sets with the `imputation` object that was built from the training
# frame, printing the combined test + validation missing-cell count before and after.
missing_in_test = X_test.isna().to_numpy().sum()
missing_in_val = X_val.isna().to_numpy().sum()
print(f'Number of nan before: {missing_in_test + missing_in_val}')

X_test, X_val = imputation.impute_test_val(X_test, X_val)

missing_in_test = X_test.isna().to_numpy().sum()
missing_in_val = X_val.isna().to_numpy().sum()
print(f'Number of nan after: {missing_in_test + missing_in_val}')

In [None]:
# Pearson correlation between every pair of columns in the training frame
# (this only captures linear relationships), drawn as an annotated heatmap.
import seaborn as sns

cor = XY_train.corr()

# Very large canvas so the per-cell annotations stay legible for the full matrix.
fig, ax = plt.subplots(figsize=(50, 50))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Keep only coefficients whose magnitude exceeds 0.2 (everything else becomes NaN),
# then write the thinned matrix to disk for inspection outside the notebook.
cor = cor.where(cor.abs() > 0.2)
cor.to_csv('correlation_matrix.csv')

In [None]:
# Evaluate the cleaned and imputed training data against the validation split with the
# project helper `test_accuracy` and print whatever it returns.
# NOTE(review): the local module name `test` collides with the standard library's `test`
# package; the import only resolves to the project file while its directory comes first on sys.path.
from test import test_accuracy
# Separate the label back out of the combined training frame before evaluation.
X_train, y_train = split_label_from_data(XY_train)
print(test_accuracy(X_train, y_train, X_val, y_val))

In [None]:
# Test for the SFS algorithm: run the project's SFS helper with a small random forest,
# passing it both the training and the validation splits.
# NOTE(review): SFS presumably stands for sequential forward selection - confirm in feature_selection.
from feature_selection import SFS
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: an unseeded RandomForestClassifier is re-randomised on every run, so the
# selected feature set would otherwise change between executions of this cell.
forest = RandomForestClassifier(n_estimators=3, random_state=0)

selected_features = SFS(forest, X_train, y_train, X_val, y_val)
selected_features