中国证监会《上市公司行业分类指引》（2012年修订）将上市公司划分为19个门类和90个大类。中基协基金估值行业分类指数依据《上市公司行业分类指引》中的门类划分，编制16个门类指数（不包括制造业）；依据制造业门类下的大类划分，编制27个大类指数，共有43条行业分类指数。

In [1]:
## load required libraries
import pandas as pd
import warnings
warnings.simplefilter("ignore")

In [2]:
## Load amac_info.csv and build a lookup table:
##   index code -> [display name, row label of that code in the CSV frame]
data = pd.read_csv('non-manufacturing/amac_info.csv')
industry_name_map = {
    code: [label, position]
    for position, code, label in zip(data.index, data['industry'], data['description'])
}
# Number of distinct index codes; used later to size the per-day flag vectors.
industry_n = len(industry_name_map)
print(industry_name_map)

{'H11030': ['AMAC农林牧渔指数', 0], 'H11031': ['AMAC采矿指数', 1], 'H11041': ['AMAC水电煤气指数', 2], 'H11042': ['AMAC建筑指数', 3], 'H11043': ['AMAC交运仓储指数', 4], 'H11044': ['AMAC信息技术指数', 5], 'H11045': ['AMAC批发零售贸易指数', 6], 'H11046': ['AMAC金融保险指数', 7], 'H11047': ['AMAC房地产指数', 8], 'H11049': ['AMAC文化体育指数', 9], 'H11050': ['AMAC综合企业指数', 10]}


In [3]:
## Load amac.csv (one row per index code and day, with 'start' and 'end'
## values), keep the 2018-2019 window and add the relative change per row.
data = pd.read_csv('non-manufacturing/amac.csv')
# 'day' is compared as text here, so the filter is only correct for
# zero-padded ISO dates (YYYY-MM-DD), which is how the column is displayed.
in_window = (data['day'] >= '2018-01-01') & (data['day'] <= '2019-12-31')
# .copy() makes the filtered frame independent of the raw frame, so the
# column assignment below is a plain write rather than a chained assignment.
data = data[in_window].copy()
# Relative change from 'start' to 'end' within the same row.
data['ratio'] = (data['end'] - data['start']) / data['start']
data.head()

Unnamed: 0,industry,day,start,end,ratio
49,H11030,2019-12-31,2834.42,2877.95,0.015358
50,H11030,2019-12-30,2775.79,2815.0,0.014126
51,H11030,2019-12-27,2724.33,2779.0,0.020067
52,H11030,2019-12-26,2733.94,2715.82,-0.006628
53,H11030,2019-12-25,2746.24,2727.54,-0.006809


### Preprocess Data

In [4]:
# Encode every day as a boolean vector with two slots per index code:
# slot 2*k is set when ratio >= 0 ("涨"), slot 2*k+1 otherwise ("跌"),
# where k is the position stored in industry_name_map[code][1].
# A code with no row on a given day leaves both of its slots False.
data_asso = {}
for day, industry, ratio in zip(data['day'], data['industry'], data['ratio']):
    flags = data_asso.get(day)
    if flags is None:
        flags = data_asso[day] = [False] * (industry_n * 2)
    slot = 2 * industry_name_map[industry][1]
    if ratio >= 0:
        flags[slot] = True
    else:
        flags[slot + 1] = True

# Column labels: all day-1 items first, then all day-2 items, in the same
# (code order, up/down) layout as the flag vectors above.
# NOTE(review): this relies on the dict's iteration order matching the stored
# positions, as the original code did.
cols = []
for day_no in (1, 2):
    for name, _ in industry_name_map.values():
        cols.append(name + '-第' + str(day_no) + '天涨')
        cols.append(name + '-第' + str(day_no) + '天跌')

# One transaction per pair of consecutive days (in sorted day order):
# the flags of the earlier day followed by the flags of the next day.
keys = sorted(data_asso)
rows = [data_asso[first] + data_asso[second] for first, second in zip(keys, keys[1:])]
# Build the frame in one shot (instead of growing it row by row) and force a
# boolean dtype, which is what the frequent-pattern miners expect.
data_asso_df = pd.DataFrame(rows, columns=cols, dtype=bool)

data_asso_df.head()

Unnamed: 0,AMAC农林牧渔指数-第1天涨,AMAC农林牧渔指数-第1天跌,AMAC采矿指数-第1天涨,AMAC采矿指数-第1天跌,AMAC水电煤气指数-第1天涨,AMAC水电煤气指数-第1天跌,AMAC建筑指数-第1天涨,AMAC建筑指数-第1天跌,AMAC交运仓储指数-第1天涨,AMAC交运仓储指数-第1天跌,...,AMAC批发零售贸易指数-第2天涨,AMAC批发零售贸易指数-第2天跌,AMAC金融保险指数-第2天涨,AMAC金融保险指数-第2天跌,AMAC房地产指数-第2天涨,AMAC房地产指数-第2天跌,AMAC文化体育指数-第2天涨,AMAC文化体育指数-第2天跌,AMAC综合企业指数-第2天涨,AMAC综合企业指数-第2天跌
0,True,False,True,False,True,False,True,False,False,True,...,True,False,False,True,True,False,True,False,True,False
1,True,False,True,False,True,False,True,False,True,False,...,True,False,False,True,True,False,False,True,True,False
2,True,False,True,False,False,True,False,True,True,False,...,True,False,False,True,True,False,True,False,True,False
3,True,False,True,False,True,False,True,False,False,True,...,True,False,True,False,True,False,False,True,True,False
4,True,False,True,False,True,False,True,False,False,True,...,True,False,True,False,True,False,False,True,False,True


### Generate Frequent Itemsets
- By FP-Growth

In [5]:
from mlxtend.frequent_patterns import fpgrowth

# Run FP-Growth on the day-pair table, keeping itemsets whose support is at
# least 0.2 and reporting items by column name.
frequent_itemsets = fpgrowth(
    data_asso_df,
    min_support=0.2,
    use_colnames=True,
)
frequent_itemsets.head(5)

Unnamed: 0,support,itemsets
0,0.545267,(AMAC农林牧渔指数-第1天涨)
1,0.545267,(AMAC农林牧渔指数-第2天涨)
2,0.541152,(AMAC采矿指数-第1天涨)
3,0.541152,(AMAC采矿指数-第2天涨)
4,0.534979,(AMAC综合企业指数-第1天涨)


### Generate Association Rules

In [6]:
from mlxtend.frequent_patterns import association_rules

# Derive association rules from the frequent itemsets, keeping those whose
# confidence is at least 0.6.
rules = association_rules(
    frequent_itemsets,
    metric='confidence',
    min_threshold=0.6,
)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(AMAC农林牧渔指数-第1天涨),(AMAC采矿指数-第1天涨),0.545267,0.541152,0.386831,0.709434,1.310969,0.091759,1.579151
1,(AMAC采矿指数-第1天涨),(AMAC农林牧渔指数-第1天涨),0.541152,0.545267,0.386831,0.714829,1.310969,0.091759,1.594595
2,"(AMAC农林牧渔指数-第2天涨, AMAC农林牧渔指数-第1天涨)",(AMAC采矿指数-第1天涨),0.312757,0.541152,0.228395,0.730263,1.34946,0.059146,1.701094
3,"(AMAC农林牧渔指数-第2天涨, AMAC采矿指数-第1天涨)",(AMAC农林牧渔指数-第1天涨),0.306584,0.545267,0.228395,0.744966,1.36624,0.061225,1.78303
4,(AMAC农林牧渔指数-第2天涨),(AMAC采矿指数-第2天涨),0.545267,0.541152,0.386831,0.709434,1.310969,0.091759,1.579151


In [7]:
# Keep only rules of the form "day-1 events => day-2 events": every item of
# the antecedent must carry the day-1 marker and every item of the consequent
# the day-2 marker. Matching the full marker (not just the digit) keeps the
# filter correct even if an index name itself contains a '1' or '2'.
day1_tag, day2_tag = '-第1天', '-第2天'
check = [
    all(day1_tag in item for item in antecedents)
    and all(day2_tag in item for item in consequents)
    for antecedents, consequents in zip(rules['antecedents'], rules['consequents'])
]

# Align the mask with rules' index explicitly before combining it with the
# lift threshold.
cross_day = pd.Series(check, index=rules.index, dtype=bool)
rules_s = rules[cross_day & (rules['lift'] > 1.2)]

# Show the ten highest-lift rules; columns are selected by name so the view
# does not depend on the column order of the rules table.
rules_s.sort_values('lift', ascending=False).head(10)[
    ['antecedents', 'consequents', 'support', 'confidence', 'lift']
]

Unnamed: 0,antecedents,consequents,support,confidence,lift
132281,"(AMAC房地产指数-第1天跌, AMAC文化体育指数-第1天跌)",(AMAC金融保险指数-第2天涨),0.211934,0.609467,1.293455
132210,"(AMAC文化体育指数-第1天跌, AMAC建筑指数-第1天跌, AMAC交运仓储指数-第1天跌)",(AMAC金融保险指数-第2天涨),0.209877,0.607143,1.288522
132235,"(AMAC文化体育指数-第1天跌, AMAC信息技术指数-第1天跌, AMAC交运仓储指数-...",(AMAC金融保险指数-第2天涨),0.205761,0.60241,1.278476
171231,"(AMAC房地产指数-第1天跌, AMAC综合企业指数-第1天跌)",(AMAC金融保险指数-第2天涨),0.207819,0.60119,1.275889
116001,"(AMAC房地产指数-第1天涨, AMAC综合企业指数-第1天涨, AMAC水电煤气指数-第...",(AMAC农林牧渔指数-第2天涨),0.201646,0.695035,1.274669
202611,"(AMAC房地产指数-第1天跌, AMAC批发零售贸易指数-第1天跌)",(AMAC金融保险指数-第2天涨),0.216049,0.6,1.273362
116036,"(AMAC房地产指数-第1天涨, AMAC综合企业指数-第1天涨, AMAC批发零售贸易指数...",(AMAC农林牧渔指数-第2天涨),0.201646,0.685315,1.256841
115961,"(AMAC房地产指数-第1天涨, AMAC综合企业指数-第1天涨, AMAC建筑指数-第1天涨)",(AMAC农林牧渔指数-第2天涨),0.216049,0.681818,1.250429
19977,"(AMAC房地产指数-第1天涨, AMAC交运仓储指数-第1天涨, AMAC综合企业指数-第...",(AMAC农林牧渔指数-第2天涨),0.201646,0.680556,1.248113
116019,"(AMAC房地产指数-第1天涨, AMAC水电煤气指数-第1天涨, AMAC批发零售贸易指数...",(AMAC农林牧渔指数-第2天涨),0.201646,0.680556,1.248113
