In [36]:
from copy import deepcopy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import cross_val_score, ShuffleSplit

xData, yData = load_boston(return_X_y=True)

In [30]:
xData = pd.DataFrame(xData, columns=['feature[' + str(index) + ']' for index in range(1, xData.shape[1] + 1)])

In [31]:
xData['target'] = yData

In [32]:
data = deepcopy(xData)

In [33]:
data

Unnamed: 0,feature[1],feature[2],feature[3],feature[4],feature[5],feature[6],feature[7],feature[8],feature[9],feature[10],feature[11],feature[12],feature[13],target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.60,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.10,18.9


In [41]:
def getFeaturesCorrelationRegression(_dataSet, _targetName):
	importanceList = []
	shuffleSplit = ShuffleSplit(n_splits = 10, test_size = 0.25)
	
	trainXData, testXData = pd.DataFrame(), pd.DataFrame()
	for trainIndices, testIndices in shuffleSplit.split(data):
		trainXData = pd.concat((trainXData, data.iloc[trainIndices]))
		testXData = pd.concat((testXData, data.iloc[testIndices]))
	
	modelsList = []
	for numberOfDistributions in range(1, 11):
		modelsList.append(GaussianMixture(numberOfDistributions).fit(trainXData['target'].values.reshape(-1, 1)))
	
	aicScores = [m.aic(trainXData['target'].values.reshape(-1, 1)) for m in modelsList]
	trainXData['target[AIC]'] = np.argmax(modelsList[aicScores.index(np.min(aicScores))].predict_proba(trainXData['target'].values.reshape(-1, 1)), axis = 1)
	testXData['target[AIC]'] = np.argmax(modelsList[aicScores.index(np.min(aicScores))].predict_proba(testXData['target'].values.reshape(-1, 1)), axis = 1)
	
	bicScores = [m.bic(trainXData['target'].values.reshape(-1, 1)) for m in modelsList]
	trainXData['target[BIC]'] = np.argmax(modelsList[bicScores.index(np.min(bicScores))].predict_proba(trainXData['target'].values.reshape(-1, 1)), axis = 1)
	testXData['target[BIC]'] = np.argmax(modelsList[bicScores.index(np.min(bicScores))].predict_proba(testXData['target'].values.reshape(-1, 1)), axis = 1)
	
	# print(aicScores, bicScores)
	
	for feature in sorted([value for value in list(data) if value not in ['target']]):
		trainXDataAIC = deepcopy(trainXData[[feature, 'target[AIC]']])
		testXDataAIC = deepcopy(testXData[[feature, 'target[AIC]']])
		trainXDataAICGroup = trainXDataAIC.groupby('target[AIC]').agg(np.mean)
		testXDataAICGroup = testXDataAIC.groupby('target[AIC]').agg(np.mean)
		
		trainXDataBIC = deepcopy(trainXData[[feature, 'target[BIC]']])
		testXDataBIC = deepcopy(testXData[[feature, 'target[BIC]']])
		trainXDataBICGroup = trainXDataBIC.groupby('target[BIC]').agg(np.mean)
		testXDataBICGroup = testXDataBIC.groupby('target[BIC]').agg(np.mean)
		
		print(trainXDataAICGroup, trainXDataBICGroup)
		
		try:
			importanceList.append((feature,
			                       np.round(scipy.stats.pearsonr(trainXDataAICGroup[feature], testXDataAICGroup[feature])[0], 7),
			                       np.round(scipy.stats.pearsonr(trainXDataBICGroup[feature], testXDataBICGroup[feature])[0], 7)))
		
		except Exception as _:
			importanceList.append((feature, np.nan))
	
	return importanceList

In [42]:
featureImportances = getFeaturesCorrelationRegression(data, 'target')

print(featureImportances)

aicValues = sorted(list(set([value[1] for value in featureImportances])))
bicValues = sorted(list(set([value[2] for value in featureImportances])))

featureImportances = [(value[0], aicValues.index(value[1]) + bicValues.index(value[2])) for value in featureImportances]

featureImportancesDictionary = {}
for value in featureImportances:
	
	if value[1] in featureImportancesDictionary.keys():
		featureImportancesDictionary[value[1]].append(value[0])
	
	else:
		featureImportancesDictionary[value[1]] = [value[0]]

featureImportances = [(key, value) for key, value in featureImportancesDictionary.items()]
featureImportances.sort(key = lambda tup: tup[0], reverse = True)

scoresList = []
columnsToRemove = []

xData = data[[value for value in list(data) if value not in ['target'] + columnsToRemove]]
yData = data['target']

model = LinearRegression()
cvScores = cross_val_score(model, xData, yData, cv = ShuffleSplit(n_splits = 10, test_size = 0.25), scoring = 'neg_mean_squared_error')

# print(columnsToRemove, np.mean(cvScores), np.std(cvScores))

scoresList.append(np.mean(cvScores))

for _, column in featureImportances:
	columnsToRemove.extend(column)
	
	xData = data[[value for value in list(data) if value not in ['target'] + columnsToRemove]]
	yData = data['target']
	
	if len(list(xData)) == 0:
		continue
	
	model = LinearRegression()
	cvScores = cross_val_score(model, xData, yData, cv = ShuffleSplit(n_splits = 10, test_size = 0.25), scoring = 'neg_mean_squared_error')
	
	# print(columnsToRemove, np.mean(cvScores), np.std(cvScores))
	
	scoresList.append(np.mean(cvScores))

plt.plot(scoresList)
plt.show()

             feature[10]
target[AIC]             
0             420.539474
1             327.695652
2             424.565574
3             668.081851
4             337.374833
5             535.306228
6             267.443750
7             284.312195
8             281.453782
9             389.856622              feature[10]
target[BIC]             
0             420.539474
1             327.695652
2             424.565574
3             668.081851
4             337.374833
5             535.306228
6             267.443750
7             284.312195
8             281.453782
9             389.856622
             feature[11]
target[AIC]             
0              19.248246
1              17.748495
2              16.605738
3              20.195374
4              17.956627
5              19.610035
6              16.707500
7              16.460976
8              15.807563
9              18.892710              feature[11]
target[BIC]             
0              19.248246
1              17.748495


IndexError: tuple index out of range