In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Load the raw input table.
# NOTE(review): '.csv' looks like a redacted/placeholder file name — confirm the real path.
raw_path = '.csv'
data = pd.read_csv(raw_path)
# Preview the first rows to sanity-check the load.
data.head()

In [2]:
# Display defaults: show up to 100 columns / 300 rows, and render floats with
# four decimals in both pandas and numpy output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 300)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
np.set_printoptions(precision=4, suppress=True)

# Seed numpy's global random number generator.
np.random.seed(12345)

# Matplotlib defaults: SimHei as the sans-serif font so Chinese text renders,
# ASCII hyphen for negative numbers (axes.unicode_minus=False), and a
# 10x6 inch figure at 150 dpi.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
    'figure.dpi': 150,
    'figure.figsize': (10, 6),
})

In [None]:
# List the column names of the loaded table.
data.columns

In [4]:
# Date boundaries as ISO 'YYYY-MM-DD' strings.
aa_start, ab_start, ab_end = '2022-01-01', '2022-03-01', '2022-05-01'

# Cities of interest (eval_list) and the one evaluated in this run (eval_city).
eval_list = [
    '深圳市',
    '武汉市',
    '青岛市',
    '福州市',
]
eval_city = '深圳市'

In [None]:
# Keep rows whose dt lies in [aa_start, ab_end] (both ends inclusive),
# ordered by city and then by date.
in_window = data['dt'].between(aa_start, ab_end)
data1 = data.loc[in_window].sort_values(by=['city_id', 'dt'])
# Number of distinct city names in the window (a missing name counts as one value).
data1['城市名称'].nunique(dropna=False)

In [None]:
# Renumber rows from 0, parse dt into datetimes and derive the calendar month.
data1 = data1.reset_index(drop=True)
data1 = data1.assign(dt=pd.to_datetime(data1['dt']))
data1 = data1.assign(month=data1['dt'].dt.month)
data1.head()

In [None]:
# Count missing values per column.
data1.isna().sum()

In [5]:

# Columns that are cast to float and averaged per city/month as clustering features.
metrics = [
    'city_level',
    'a',
    'b',
    'c',
    'd',
]

In [None]:
# Treat the '-' placeholder as missing, then drop every row that has any missing value.
data1 = data1.replace('-', np.nan).dropna(axis=0, how='any')

In [None]:
# Cast the metric columns to floating point.
data1[metrics] = data1[metrics].astype(float)

In [None]:
# Per-city monthly mean of each metric, using only rows dated before ab_start.
pre_period = data1.loc[data1['dt'] < ab_start]
cluster_data = (
    pre_period
    .groupby(['month', 'city_id'])[metrics]
    .mean()
    .reset_index()
)
cluster_data.head()

In [None]:
# Build the pivot table: one row per city_id, one column per (metric, month).
pivot_data = cluster_data.pivot_table(
    index=['city_id'],
    columns=['month'],
    values=metrics,
    aggfunc='sum',
)
pivot_data.head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Fill missing city/month cells with 0, then standardise the feature matrix.
features = pivot_data.fillna(0).to_numpy()
scaler = StandardScaler().fit(features)
scaler_features = scaler.transform(features)

In [None]:
# Show the standardised feature matrix.
scaler_features

In [None]:
# Dimensions of the standardised feature matrix (rows, columns).
scaler_features.shape

In [None]:
# Fit a PCA with the default number of components on the standardised features.
pca = PCA().fit(scaler_features)
pca

In [None]:
# Share of total variance explained by each fitted principal component.
pca.explained_variance_ratio_

In [None]:
# Plot cumulative explained variance against the number of principal components.
# Axes are labelled and the figure is shown explicitly so the plot can be read
# on its own, in the same way as the k-means SSE plot.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'ro-')
plt.xlabel('number of principal components')
plt.ylabel('cumulative explained variance ratio')
plt.show()


In [None]:
# Refit the PCA with 6 components and project the standardised features onto them.
n_pca_components = 6
pca = PCA(n_components=n_pca_components).fit(scaler_features)
pca
pca_features = pca.transform(scaler_features)

In [None]:

from sklearn.cluster import KMeans
# Elbow method: fit k-means for k = 1..10 and record each model's inertia (SSE).
kmeans_kwargs = dict(init='random', n_init=10, max_iter=300, random_state=42)
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(pca_features)
    sse[k] = kmeans.inertia_

# SSE as a function of the number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(list(sse), list(sse.values()))
plt.xlabel('number of clusters')
plt.ylabel('SSE')
plt.show()


In [None]:
# Final k-means model with 4 clusters, fitted on the PCA features.
final_kmeans_params = dict(
    init='random',
    n_clusters=4,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans = KMeans(**final_kmeans_params).fit(pca_features)
kmeans

In [None]:
# Pair each city_id (the pivot table's index) with its k-means label.
# The table is built from the index instead of adding a 'cluster' column to
# pivot_data, so pivot_data keeps holding features only and re-running the
# scaling step cannot pick up the labels as an extra feature.
cluster_res = pd.DataFrame({
    'city_id': pivot_data.index.to_numpy(),
    'cluster': kmeans.labels_,
})

# cluster_res has no city-name column (filtering it on '城市名称' raises
# KeyError), so resolve eval_city to its city_id(s) through data1, which
# carries both columns.
eval_city_ids = data1.loc[data1['城市名称'] == eval_city, 'city_id'].unique()
cluster_res.loc[cluster_res['city_id'].isin(eval_city_ids)]  # cluster of the evaluated city
cluster_res.groupby('cluster').size()  # number of cities per cluster
cluster_res

In [None]:
# Rows of cluster_res assigned to cluster 2, as a raw array.
# NOTE(review): 2 is hard-coded — presumably the cluster eval_city fell into; confirm.
in_cluster_2 = cluster_res['cluster'] == 2
cluster_res[in_cluster_2].to_numpy()

In [None]:
# Synthetic control for the 'pinle_callnum' outcome.
from DiDiSCM import SyntheticControlMethod

# NOTE(review): `df` is not defined in any cell shown in this notebook — confirm
# which frame is meant (data1 is the only prepared table visible here).
# NOTE(review): treated_units=160 is hard-coded — presumably eval_city's city_id; confirm.
synth1 = SyntheticControlMethod(
    data=df,
    outcome='pinle_callnum',
    unit='city_id',
    time='dt',
    treated_units=160,
    # Parse the ab_start variable. The quoted literal 'ab_start' does not match
    # '%Y-%m-%d', so strptime raised ValueError before the model was built.
    treated_time=datetime.strptime(ab_start, '%Y-%m-%d'),
)
# `precictors` is spelled exactly as in the original call.
# NOTE(review): confirm this is the keyword DiDiSCM's data_prep actually expects.
inverted = synth1.data_prep(
    precictors=['pinle_callnum', 'pinle_subsidyb', 'pinle_subsidyc', 'driver_num'])
inverted.head()


In [None]:
# Estimate the treatment effect for synth1; lasso=True and save_result=True are
# passed through to DiDiSCM (their exact semantics are defined by that library).
synth1.get_treatment_effect(lasso=True,save_result=True)


In [None]:
# Significance check for the synth1 model.
synth1.check_significance(poor_fit_included=True)

# Rows flagged as post-treatment. The explicit `== 1` comparison yields a boolean
# row mask whether after_treatment is stored as bool or as 0/1; indexing the
# frame with the raw column would be read as column labels if it held integers.
post_treatment_1 = synth1.effect[synth1.effect.after_treatment == 1]

a1 = post_treatment_1.mean()  # column means over the post-treatment rows
a1
# Mean `effect` as a percentage of mean `synthectic` (column name as used by the original code).
a1.effect / a1.synthectic * 100
post_treatment_1.sum()  # column totals over the post-treatment rows

In [None]:

# Synthetic control for the 'pinle_finishnum' outcome.
# NOTE(review): `df` is not defined in any cell shown in this notebook — confirm
# which frame is meant (data1 is the only prepared table visible here).
# NOTE(review): treated_units=160 is hard-coded — presumably eval_city's city_id; confirm.
synth2 = SyntheticControlMethod(
    data=df,
    outcome='pinle_finishnum',
    unit='city_id',
    time='dt',
    treated_units=160,
    # Parse the ab_start variable. The quoted literal 'ab_start' does not match
    # '%Y-%m-%d', so strptime raised ValueError before the model was built.
    treated_time=datetime.strptime(ab_start, '%Y-%m-%d'),
)
# `precictors` is spelled exactly as in the original call.
# NOTE(review): confirm this is the keyword DiDiSCM's data_prep actually expects.
inverted = synth2.data_prep(
    precictors=['pinle_finishnum', 'pinle_subsidyb', 'pinle_subsidyc', 'driver_num'])
inverted.head()


In [None]:
# Estimate the treatment effect for synth2; lasso=True and save_result=True are
# passed through to DiDiSCM (their exact semantics are defined by that library).
synth2.get_treatment_effect(lasso=True,save_result=True)

In [None]:
# Significance check for the synth2 model, called with poor_fit_included=True.
synth2.check_significance(poor_fit_included=True)

In [None]:
# Preview the first rows of synth2.rank.
# NOTE(review): presumably populated by check_significance — confirm against DiDiSCM.
synth2.rank.head()

In [None]:
# Rows flagged as post-treatment. The explicit `== 1` comparison yields a boolean
# row mask whether after_treatment is stored as bool or as 0/1; indexing the
# frame with the raw column would be read as column labels if it held integers.
post_treatment_2 = synth2.effect[synth2.effect.after_treatment == 1]

a2 = post_treatment_2.mean()  # column means over the post-treatment rows
a2
# Mean `effect` as a percentage of mean `synthectic` (column name as used by the original code).
a2.effect / a2.synthectic * 100
post_treatment_2.sum()  # column totals over the post-treatment rows