In [24]:
# import requirred packages
import pandas as pd
import numpy as np
import scipy as sp
import os
from datetime import datetime
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [25]:
Data=pd.read_csv('physical_activity_v6.csv')

In [26]:
Data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,LSOA,%Crime,air_quality,Sum_Sportsarea,distance_to_nearest_garden,walkable_bus_stop_count,walkability_score,Workweight_average,Income_Score,% Disability,%Health_Cov
0,0,0,E01011264,1.56,0.02468,0,690.015429,17,0.351029,31.16,0.088,0.213,0.0
1,1,1,E01011265,13.4,-0.224271,11,300.107012,26,0.454218,38.34,0.08,0.177,0.0
2,2,2,E01011266,2.34,-0.480771,9,1192.914356,5,0.387104,45.2,0.021,0.098,0.0
3,3,3,E01011267,7.3,-0.182079,4,425.536101,24,0.469677,41.62,0.153,0.2,0.0
4,4,4,E01011268,7.87,-0.182079,10,389.151811,26,0.485006,34.14,0.161,0.224,0.0


In [27]:
Data.to_csv('Data.csv')
Data=Data.set_index('LSOA')

In [28]:
indicator_cols=[
    #Environment
    '%Crime',
    'air_quality',
    #Social-economic
    'Income_Score',
    'Workweight_average',
    #Physical Infrastructure 
    'distance_to_nearest_garden',
    'Sum_Sportsarea',
    'walkable_bus_stop_count',
    'walkability_score',
    #Health
    '% Disability',
    '%Health_Cov'
    ]

# create a new dataframe
priority_places =Data[indicator_cols].copy()

# Remove any duplicate rows (if any)
priority_places = priority_places.drop_duplicates()

#### The raw indicator values are oriented so that the highest values in each indicator represent those higher priority places. #### 

# The first task is to orient each indicator in the correct direction
# i.e. so that high values correspond to higher priority places
priority_places = pd.concat([1 * priority_places[['Income_Score','distance_to_nearest_garden', 
   '%Crime', 'air_quality','% Disability', 'Workweight_average']], 
                  -1 * priority_places[[ 'walkability_score','walkable_bus_stop_count',
                                        'Sum_Sportsarea', '%Health_Cov'
                                       ]]
                            ], axis=1)




In [29]:
priority_places_ranked = priority_places.rank(method='min', ascending=False).astype(int)
priority_places_ranked.head()

Unnamed: 0_level_0,Income_Score,distance_to_nearest_garden,%Crime,air_quality,% Disability,Workweight_average,walkability_score,walkable_bus_stop_count,Sum_Sportsarea,%Health_Cov
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
E01011264,264,286,450,416,70,419,83,82,1,1
E01011265,282,436,141,453,195,243,349,179,447,1
E01011266,472,144,422,473,479,107,162,19,428,1
E01011267,186,388,266,448,108,164,382,155,339,1
E01011268,176,404,248,448,37,357,419,179,438,1


In [30]:
for c in priority_places_ranked[indicator_cols].columns:
    mean=priority_places_ranked[c].mean()
    std=priority_places_ranked[c].std()
    priority_places_ranked[c] = (priority_places_ranked[c] - mean) /std

priority_places_ranked.head()

Unnamed: 0_level_0,Income_Score,distance_to_nearest_garden,%Crime,air_quality,% Disability,Workweight_average,walkability_score,walkable_bus_stop_count,Sum_Sportsarea,%Health_Cov
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
E01011264,0.145593,0.294289,1.459142,1.215157,-1.227502,1.238772,-1.145246,-1.122667,-1.2044,-1.171491
E01011265,0.273648,1.357985,-0.733322,1.476829,-0.339685,-0.009666,0.741042,-0.435849,1.451883,-1.171491
E01011266,1.625345,-0.712676,1.260472,1.618273,1.677436,-0.974368,-0.585033,-1.568745,1.338722,-1.171491
E01011267,-0.409314,1.017603,0.153597,1.441468,-0.957605,-0.570045,0.975055,-0.605783,0.808657,-1.171491
E01011268,-0.480456,1.131063,0.025881,1.441468,-1.461885,0.798981,1.237433,-0.435849,1.39828,-1.171491


In [31]:
### Domains are constructed by averaging over each indicator within each domain ####
#Combine transformed indicators into domains (take the average of the transformed indicators)
priority_places_ranked['domain_Environment'] = 0.5 * priority_places_ranked[[ '%Crime','air_quality']].sum(axis=1)
priority_places_ranked['domain_Social_economic'] = 0.5 * priority_places_ranked[['Income_Score', 'Workweight_average']].sum(axis=1)
priority_places_ranked['domain_Physical Infrastructure'] = (1./4.) * priority_places_ranked[['walkable_bus_stop_count', 'Sum_Sportsarea','distance_to_nearest_garden','walkability_score']].sum(axis=1)
priority_places_ranked['domain_Health'] = 0.5 * priority_places_ranked[[ '% Disability','%Health_Cov']].sum(axis=1)

# define the domain columns 
domain_columns = ['domain_Environment', 
                  'domain_Social_economic',
                  'domain_Physical Infrastructure',
                  'domain_Health'
                 ]

priority_places_ranked.head()

Unnamed: 0_level_0,Income_Score,distance_to_nearest_garden,%Crime,air_quality,% Disability,Workweight_average,walkability_score,walkable_bus_stop_count,Sum_Sportsarea,%Health_Cov,domain_Environment,domain_Social_economic,domain_Physical Infrastructure,domain_Health
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
E01011264,0.145593,0.294289,1.459142,1.215157,-1.227502,1.238772,-1.145246,-1.122667,-1.2044,-1.171491,1.33715,0.692182,-0.794506,-1.199496
E01011265,0.273648,1.357985,-0.733322,1.476829,-0.339685,-0.009666,0.741042,-0.435849,1.451883,-1.171491,0.371753,0.131991,0.778765,-0.755588
E01011266,1.625345,-0.712676,1.260472,1.618273,1.677436,-0.974368,-0.585033,-1.568745,1.338722,-1.171491,1.439372,0.325488,-0.381933,0.252972
E01011267,-0.409314,1.017603,0.153597,1.441468,-0.957605,-0.570045,0.975055,-0.605783,0.808657,-1.171491,0.797532,-0.489679,0.548883,-1.064548
E01011268,-0.480456,1.131063,0.025881,1.441468,-1.461885,0.798981,1.237433,-0.435849,1.39828,-1.171491,0.733674,0.159263,0.832732,-1.316688


In [32]:
#PCA
Data_sub=priority_places_ranked[[ 'domain_Social_economic','domain_Environment', 
                 
                  'domain_Physical Infrastructure',
                  'domain_Health']]
pca = PCA(n_components=4)  
X_pca = pca.fit_transform(Data_sub)
#print(pca.explained_variance_ratio_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# 打印主成分的载荷（即每个主成分的系数）
print("Component loadings:\n", pca.components_)


Explained variance ratio: [0.48302873 0.25868403 0.15298978 0.10529745]
Component loadings:
 [[ 2.76109738e-01  8.47716218e-01 -4.30836117e-01 -1.39717095e-01]
 [-1.39418624e-01 -1.16994510e-01 -6.71928117e-04 -9.83297656e-01]
 [-3.90848950e-01 -2.97819019e-01 -8.66128687e-01  9.14441203e-02]
 [ 8.66927308e-01 -4.23076034e-01 -2.53378937e-01 -7.24073591e-02]]


In [33]:
pca = PCA(n_components=4)
X_pca = pca.fit_transform(Data_sub)
 
# Explained Variance Ratio
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
 
# Loop through each principal component and print out the weights for each domain
for i, component in enumerate(pca.components_):
    print(f"Principal Component {i+1}:")
    for weight, name in zip(component, Data_sub.columns):
        print(f"{name}: {weight:.4f}")
    print("\n")

Explained Variance Ratio: [0.48302873 0.25868403 0.15298978 0.10529745]
Principal Component 1:
domain_Social_economic: 0.2761
domain_Environment: 0.8477
domain_Physical Infrastructure: -0.4308
domain_Health: -0.1397


Principal Component 2:
domain_Social_economic: -0.1394
domain_Environment: -0.1170
domain_Physical Infrastructure: -0.0007
domain_Health: -0.9833


Principal Component 3:
domain_Social_economic: -0.3908
domain_Environment: -0.2978
domain_Physical Infrastructure: -0.8661
domain_Health: 0.0914


Principal Component 4:
domain_Social_economic: 0.8669
domain_Environment: -0.4231
domain_Physical Infrastructure: -0.2534
domain_Health: -0.0724




In [34]:
#priority_places_domains = priority_places_ranked[domain_columns + ['LSOA']]
#### The domains are then combined using the domain weights shown in Table 2. ####
priority_places_ranked['combined'] = (0.53) * Data_sub['domain_Environment'] + (0.32) * Data_sub['domain_Social_economic'] 
+ (0.37) * Data_sub['domain_Physical Infrastructure']+ (0.34) * Data_sub['domain_Health'] 

priority_places_ranked.head()


Unnamed: 0_level_0,Income_Score,distance_to_nearest_garden,%Crime,air_quality,% Disability,Workweight_average,walkability_score,walkable_bus_stop_count,Sum_Sportsarea,%Health_Cov,domain_Environment,domain_Social_economic,domain_Physical Infrastructure,domain_Health,combined
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
E01011264,0.145593,0.294289,1.459142,1.215157,-1.227502,1.238772,-1.145246,-1.122667,-1.2044,-1.171491,1.33715,0.692182,-0.794506,-1.199496,0.930188
E01011265,0.273648,1.357985,-0.733322,1.476829,-0.339685,-0.009666,0.741042,-0.435849,1.451883,-1.171491,0.371753,0.131991,0.778765,-0.755588,0.239266
E01011266,1.625345,-0.712676,1.260472,1.618273,1.677436,-0.974368,-0.585033,-1.568745,1.338722,-1.171491,1.439372,0.325488,-0.381933,0.252972,0.867024
E01011267,-0.409314,1.017603,0.153597,1.441468,-0.957605,-0.570045,0.975055,-0.605783,0.808657,-1.171491,0.797532,-0.489679,0.548883,-1.064548,0.265995
E01011268,-0.480456,1.131063,0.025881,1.441468,-1.461885,0.798981,1.237433,-0.435849,1.39828,-1.171491,0.733674,0.159263,0.832732,-1.316688,0.439811


In [35]:
priority_places_deciles=priority_places_ranked[['domain_Physical Infrastructure','domain_Health','domain_Environment','domain_Social_economic','combined']]
#priority_places_full = priority_places_ranked.merge(priority_places_deciles, left_index=True, right_index=True, suffixes=('', '_decile'))

priority_places_deciles['domain_Physical Infrastructure']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Physical Infrastructure'], 10, duplicates='drop', labels=range(1,11)))
priority_places_deciles['domain_Health']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Health'], 10, duplicates='drop', labels=range(1,11)))
priority_places_deciles['domain_Social_economic']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Social_economic'], 10, duplicates='drop', labels=range(1,11)))
priority_places_deciles['domain_Environment']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Environment'], 10, duplicates='drop', labels=range(1,11)))
priority_places_deciles['combined']= pd.to_numeric(pd.qcut(priority_places_deciles['combined'], 10, duplicates='drop', labels=range(1,11)))
priority_places_deciles.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_places_deciles['domain_Physical Infrastructure']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Physical Infrastructure'], 10, duplicates='drop', labels=range(1,11)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  priority_places_deciles['domain_Health']= pd.to_numeric(pd.qcut(priority_places_deciles['domain_Health'], 10, duplicates='drop', labels=range(1,11)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cave

Unnamed: 0_level_0,domain_Physical Infrastructure,domain_Health,domain_Environment,domain_Social_economic,combined
LSOA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
E01011264,2,1,10,9,10
E01011265,9,2,7,7,7
E01011266,4,7,10,8,10
E01011267,8,2,8,2,7
E01011268,9,1,8,7,8


In [36]:
priority_places_deciles.to_csv('priority_places_deciles.csv')