# Feature Extraction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import r2_score

In [None]:
# Load the pickled happiness dataset. Later cells rely on it providing the
# 'Country' and 'Year' columns, the score columns ('Score', 'Economy', ...)
# and the 'Ind N' indicator columns.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file here.
df = pd.read_pickle('data/happiness_data.pkl')

In [None]:
# Split by survey year: 2015-2016 rows form the training set, 2017 the test set.
years_train = [2015, 2016]
years_test = [2017]

# Select the rows of each split and renumber them from 0.
train_features = df[df['Year'].isin(years_train)].reset_index(drop=True)
test_features = df[df['Year'].isin(years_test)].reset_index(drop=True)

# Identifier and score columns are removed; whatever remains is used as the
# model input matrix in the cells below.
non_feature_columns = [
    'Country', 'Year', 'Score', 'Low', 'High', 'Economy', 'Family',
    'Health', 'Freedom', 'Trust', 'Generosity', 'Dystopia',
]
df_train = train_features.drop(columns=non_feature_columns)
df_test = test_features.drop(columns=non_feature_columns)

In [None]:
# Pull every target column out of the train and test splits, each as its own
# single-column DataFrame (the column keeps its original name).
target_columns = ['Score', 'Economy', 'Family', 'Health',
                  'Freedom', 'Trust', 'Generosity', 'Dystopia']

(score_train, economy_train, family_train, health_train,
 freedom_train, trust_train, generosity_train, dystopia_train) = (
    train_features[name].to_frame() for name in target_columns
)

(score_test, economy_test, family_test, health_test,
 freedom_test, trust_test, generosity_test, dystopia_test) = (
    test_features[name].to_frame() for name in target_columns
)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every input column to the [0, 1] range spanned by the training data.
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-column min/max from the training split only.
rescaled_df = scaler.fit_transform(df_train)
df_train = pd.DataFrame(rescaled_df, columns=df_train.columns)

# Apply the scaling learned on the training split to the test split.
# Re-fitting the scaler on the test rows would put the two splits on different
# scales and leak test-set statistics into preprocessing. As a consequence,
# test values outside the training range may fall outside [0, 1].
rescaled_df = scaler.transform(df_test)
df_test = pd.DataFrame(rescaled_df, columns=df_test.columns)

In [None]:
from sklearn.feature_selection import RFE
from sklearn import linear_model

In [None]:
# Rank the input columns for predicting 'Economy' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, economy_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Economy model
# (presumably picked from the ranking printed above -- TODO confirm).
col = ['Ind 8','Ind 20','Ind 19','Ind 23','Ind 16','Ind 6','Ind 30','Ind 14','Ind 29','Ind 27','Ind 21','Ind 31']
df_economy_train = df_train[col]
df_economy_test = df_test[col]

# Persist the reduced train/test matrices.
df_economy_train.to_pickle('data/economy_train.pkl')
df_economy_test.to_pickle('data/economy_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Family' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, family_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Family model
# (presumably picked from the ranking printed above -- TODO confirm).
col = ['Ind 8','Ind 20','Ind 19','Ind 23','Ind 6','Ind 16','Ind 30']
df_family_train = df_train[col]
df_family_test = df_test[col]

# Persist the reduced train/test matrices.
df_family_train.to_pickle('data/family_train.pkl')
df_family_test.to_pickle('data/family_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Health' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, health_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Health model
# (presumably picked from the ranking printed above -- TODO confirm).
col = ['Ind 23', 'Ind 7', 'Ind 24', 'Ind 18', 'Ind 5', 'Ind 15', 'Ind 20', 'Ind 25', 'Ind 33', 'Ind 6']
df_health_train = df_train[col]
df_health_test = df_test[col]

# Persist the reduced train/test matrices.
df_health_train.to_pickle('data/health_train.pkl')
df_health_test.to_pickle('data/health_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Freedom' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, freedom_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Freedom model
# (presumably picked from the ranking printed above -- TODO confirm).
# NOTE(review): the list used to contain 'Ind 32' twice, which produced two
# identically named columns in the saved frames. The repeat is dropped here;
# if a different indicator was intended in that slot, add it back explicitly.
col = ['Ind 3', 'Ind 8', 'Ind 32', 'Ind 7', 'Ind 24', 'Ind 6']
df_freedom_train = df_train[col]
df_freedom_test = df_test[col]

# Persist the reduced train/test matrices.
df_freedom_train.to_pickle('data/freedom_train.pkl')
df_freedom_test.to_pickle('data/freedom_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Trust' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, trust_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Trust model
# (presumably picked from the ranking printed above -- TODO confirm).
col = ['Ind 20', 'Ind 13', 'Ind 17', 'Ind 14', 'Ind 25', 'Ind 22']
df_trust_train = df_train[col]
df_trust_test = df_test[col]

# Persist the reduced train/test matrices.
df_trust_train.to_pickle('data/trust_train.pkl')
df_trust_test.to_pickle('data/trust_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Generosity' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, generosity_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Generosity model
# (presumably picked from the ranking printed above -- TODO confirm).
# NOTE(review): the list used to contain 'Ind 32' twice (first and last
# entry), which produced two identically named columns in the saved frames.
# The repeat is dropped here; if a different indicator was intended in that
# slot, add it back explicitly.
col = ['Ind 32', 'Ind 3', 'Ind 28', 'Ind 13', 'Ind 6', 'Ind 5', 'Ind 26']
df_generosity_train = df_train[col]
df_generosity_test = df_test[col]

# Persist the reduced train/test matrices.
df_generosity_train.to_pickle('data/generosity_train.pkl')
df_generosity_test.to_pickle('data/generosity_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
# Rank the input columns for predicting 'Dystopia' with recursive feature
# elimination (RFE) wrapped around a linear regression. Eliminating down to a
# single feature makes `ranking_` order all columns (1 = most important).
model = linear_model.LinearRegression()
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it
# as a positional argument (it was deprecated as positional in 0.23).
rfe = RFE(model, n_features_to_select=1)
fit = rfe.fit(df_train, dystopia_train.values.ravel())
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

# Hard-coded indicator subset for the Dystopia model
# (presumably picked from the ranking printed above -- TODO confirm).
col = ['Ind 8','Ind 23','Ind 6','Ind 29','Ind 27','Ind 31']
df_dystopia_train = df_train[col]
df_dystopia_test = df_test[col]

# Persist the reduced train/test matrices.
df_dystopia_train.to_pickle('data/dystopia_train.pkl')
df_dystopia_test.to_pickle('data/dystopia_test.pkl')
# To inspect the ranking by column name:
#   pd.Series(fit.ranking_, index=df_train.columns).sort_values()

In [None]:
from sklearn.decomposition import PCA

# Exploratory PCA: project the input columns onto their first three principal
# components and report how much variance each one explains.
pca = PCA(n_components=3)
# Fit on the training split, like every other estimator in this notebook;
# the held-out test split should not drive any fitted transformation.
fit = pca.fit(df_train)
print("Explained Variance: ", fit.explained_variance_ratio_)
print("Fit Components: ", fit.components_)