# Feature Extraction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import r2_score

In [262]:
df = pd.read_pickle('data/happiness_data.pkl')

In [3]:
years_train = [2015, 2016]
years_test = [2017]
train_features = df.loc[df['Year'].isin(years_train)]
test_features = df.loc[df['Year'].isin(years_test)]
train_features.reset_index(inplace=True, drop=True)
test_features.reset_index(inplace=True, drop=True)
df_train = train_features.drop(columns=['Country','Year','Score','Low','High','Economy','Family','Health','Freedom','Trust','Generosity','Dystopia'],axis=1)
df_test = test_features.drop(columns=['Country','Year','Score','Low','High','Economy','Family','Health','Freedom','Trust','Generosity','Dystopia'],axis=1)

In [4]:
#Fetching training and test data set
score_train = pd.DataFrame(train_features['Score'])
economy_train = pd.DataFrame(train_features['Economy'])
family_train = pd.DataFrame(train_features['Family'])
health_train = pd.DataFrame(train_features['Health'])
freedom_train = pd.DataFrame(train_features['Freedom'])
trust_train = pd.DataFrame(train_features['Trust'])
generosity_train = pd.DataFrame(train_features['Generosity'])
dystopia_train = pd.DataFrame(train_features['Dystopia'])

score_test = pd.DataFrame(test_features['Score'])
economy_test = pd.DataFrame(test_features['Economy'])
family_test = pd.DataFrame(test_features['Family'])
health_test = pd.DataFrame(test_features['Health'])
freedom_test = pd.DataFrame(test_features['Freedom'])
trust_test = pd.DataFrame(test_features['Trust'])
generosity_test = pd.DataFrame(test_features['Generosity'])
dystopia_test = pd.DataFrame(test_features['Dystopia'])

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))

rescaled_df = scaler.fit_transform(df_train)
df_train = pd.DataFrame(rescaled_df, columns=df_train.columns)

rescaled_df = scaler.fit_transform(df_test)
df_test = pd.DataFrame(rescaled_df, columns=df_test.columns)

In [6]:
from sklearn.feature_selection import RFE
from sklearn import linear_model

In [68]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, economy_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 8','Ind 20','Ind 19','Ind 23','Ind 16','Ind 6','Ind 30','Ind 14','Ind 29','Ind 27','Ind 21','Ind 31']
df_economy_train = df_train[col]
df_economy_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_economy_train, economy_train.values.ravel())
economy_predictions = lr.predict(df_economy_test)
errors = abs(economy_predictions - economy_test.values.ravel())
accuracy = r2_score(economy_test.values.ravel(), economy_predictions)
print('R2 score: ', accuracy)

df_economy_train.to_pickle('data/economy_train.pkl')
df_economy_test.to_pickle('data/economy_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [ 7 18  1  2 33 19  6  3 11 10  9  8 16 23 28 17 20 13  5  4 27 25 14 15
 12 29 26 21 24 22 30 32 31]
R2 score:  0.7757196637300185


In [19]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, family_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 8','Ind 20','Ind 19','Ind 23','Ind 6','Ind 16','Ind 30']
df_family_train = df_train[col]
df_family_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_family_train, family_train.values.ravel())
family_predictions = lr.predict(df_family_test)
errors = abs(family_predictions - family_test.values.ravel())
accuracy = r2_score(family_test.values.ravel(), family_predictions)
print('R2 score: ', accuracy)

df_family_train.to_pickle('data/family_train.pkl')
df_family_test.to_pickle('data/family_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [30  8 10 11 15 18 13 12 14  3  2  1 32 22 23 31 16 26  9  5 24 17  4 25
  6 20 28 27  7 29 19 21 33]
R2 score:  -0.09689154472440009


In [74]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, health_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 23', 'Ind 7', 'Ind 24', 'Ind 18', 'Ind 5', 'Ind 15', 'Ind 20', 'Ind 25', 'Ind 33', 'Ind 6']
df_health_train = df_train[col]
df_health_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_health_train, health_train.values.ravel())
health_predictions = lr.predict(df_health_test)
errors = abs(health_predictions - health_test.values.ravel())
accuracy = r2_score(health_test.values.ravel(), health_predictions)
print('R2 score: ', accuracy)

df_health_train.to_pickle('data/health_train.pkl')
df_health_test.to_pickle('data/health_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False False False False False False False False False False False
 False False False False False False False False False False  True False
 False False False False False False False False False]
Feature Ranking:  [ 5 16 14 15 20 33  3  2 10  9  8  7  4 32 24 26 21 18 11 28 27 31  1 17
 29 12 22 13 25 23 19  6 30]
R2 score:  0.8543852946016751


In [125]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, freedom_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 3', 'Ind 8', 'Ind 32', 'Ind 7', 'Ind 32', 'Ind 24', 'Ind 6']
df_freedom_train = df_train[col]
df_freedom_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_freedom_train, freedom_train.values.ravel())
freedom_predictions = lr.predict(df_freedom_test)
errors = abs(freedom_predictions - freedom_test.values.ravel())
accuracy = r2_score(freedom_test.values.ravel(), freedom_predictions)
print('R2 score: ', accuracy)

df_freedom_train.to_pickle('data/freedom_train.pkl')
df_freedom_test.to_pickle('data/freedom_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False False False False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [24 15  4  5 14 16  8  6 12  3  2  1  9 33 23 32 17 19 22 21 20 13 11 10
 28 30 27 29 18 25 26  7 31]
R2 score:  0.25376855845231483


In [182]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, trust_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 20', 'Ind 13', 'Ind 17', 'Ind 14', 'Ind 25', 'Ind 22']
df_trust_train = df_train[col]
df_trust_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_trust_train, trust_train.values.ravel())
trust_predictions = lr.predict(df_trust_test)
errors = abs(trust_predictions - trust_test.values.ravel())
accuracy = r2_score(trust_test.values.ravel(), trust_predictions)
print('R2 score: ', accuracy)

df_trust_train.to_pickle('data/trust_train.pkl')
df_trust_test.to_pickle('data/trust_test.pkl')
#pd.DataFrame(fit.ranking_, index =  range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False False  True False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [23 28  2  1 19 33  8  7  6 11  4  5  9 13 27 15 10 30 16  3 25 17 29 18
 14 32 24 31 20 12 21 26 22]
R2 score:  0.2562066409953969


In [260]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, generosity_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 32', 'Ind 3', 'Ind 28', 'Ind 13', 'Ind 6', 'Ind 5', 'Ind 26', 'Ind 32']
df_generosity_train = df_train[col]
df_generosity_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_generosity_train, generosity_train.values.ravel())
generosity_predictions = lr.predict(df_generosity_test)
errors = abs(generosity_predictions - generosity_test.values.ravel())
accuracy = r2_score(generosity_test.values.ravel(), generosity_predictions)
print('R2 score: ', accuracy)

df_generosity_train.to_pickle('data/generosity_train.pkl')
df_generosity_test.to_pickle('data/generosity_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(1,34)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False  True False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [17 27  1  2 30 33 15  4 11 12 13 14 16 32 18 21 28 23  3 22 25 26  5 24
  8 10 31  7  9 20 29  6 19]
R2 score:  0.11032293421292771


In [294]:
model = linear_model.LinearRegression()
rfe = RFE(model, 1)
fit = rfe.fit(df_train, dystopia_train.values.ravel())
print("Num Features: ", fit.n_features_) 
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)

col = ['Ind 8','Ind 23','Ind 6','Ind 29','Ind 27','Ind 31']
df_dystopia_train = df_train[col]
df_dystopia_test = df_test[col]

lr = linear_model.LinearRegression()
lr.fit(df_dystopia_train, dystopia_train.values.ravel())
dystopia_predictions = lr.predict(df_dystopia_test)
errors = abs(dystopia_predictions - dystopia_test.values.ravel())
accuracy = r2_score(dystopia_test.values.ravel(), dystopia_predictions)
#print('R2 score: ', accuracy)

df_dystopia_train.to_pickle('data/dystopia_train.pkl')
df_dystopia_test.to_pickle('data/dystopia_test.pkl')
#pd.DataFrame(fit.ranking_, index = range(33)).sort_values(by=[0])

Num Features:  1
Selected Features:  [False False False False False False False False False False  True False
 False False False False False False False False False False False False
 False False False False False False False False False]
Feature Ranking:  [ 7 21  9  5 14 31  8 11  4  3  1  2 10 26 27 30 33 18 13  6 19 15 20 12
 24 22 23 32 17 29 28 16 25]
R2 score:  -0.440846356133868


In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
fit = pca.fit(df_test)
print("Explained Variance: ", fit.explained_variance_ratio_)
print("Fit Components: ", fit.components_)

Explained Variance:  [0.23465868 0.1913472  0.08820399]
Fit Components:  [[-3.85829613e-01 -1.28936544e-01  4.74083100e-03 -2.45124851e-02
  -9.97186595e-02 -1.13495466e-01  2.39352448e-01  3.39015306e-01
   2.08200754e-02 -4.00808505e-03 -9.41743146e-02 -1.12615196e-01
   2.05267129e-02  1.61084587e-01 -1.04065220e-01 -1.73000148e-01
   2.87735015e-01 -3.15181836e-02  2.74709274e-02 -8.63047894e-02
  -1.26235529e-01 -2.27997402e-03  3.24316618e-01  3.38676906e-01
  -9.23596201e-03  9.37811882e-02  3.01039340e-01  3.21061560e-01
  -4.67560476e-03 -7.15351491e-02 -5.22170234e-02 -5.63697036e-02
   9.09513665e-02]
 [ 7.49027072e-03  5.37070395e-02 -2.83348299e-02 -2.09588146e-02
   7.14216146e-03 -3.66236549e-02 -1.97435571e-04 -1.36403493e-02
   6.82763754e-01  5.91211533e-01  3.37189007e-01  7.78924680e-02
   3.67548299e-02  7.92529903e-02  1.23306000e-02 -2.03089799e-02
  -3.34636880e-02  1.18085874e-02 -3.15159992e-02  1.20584093e-02
   6.95003187e-02 -1.24989576e-02  2.95023151e-02 