# 城市项目数据统计

## 预设

In [4]:
import sys
import os
import openpyxl
import csv
import re
import pandas as pd
import jieba
import jieba.posseg as psg
import matplotlib.pyplot as plt
import numpy as np
import cn2an
import functools
from loguru import logger
# Drop the existing loguru handlers and log to stderr at DEBUG level.
logger.remove()
handler_id = logger.add(sys.stderr, level="DEBUG")
# Chinese text support in matplotlib
plt.rcParams['font.sans-serif'] = ['SimSong-Regular'] # font used so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly
# Sharper inline figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [5]:
# Paths and tunables for this notebook.
# NOTE(review): the first four settings are not referenced by any cell shown
# here -- confirm whether they are still needed.
ExtraDictionaryPath = 'set/dict.txt'
StopWordPath = 'set/stopwords.txt'
KeywordTopNumber = 10
minLengthOfWord = 3
# Pickle files loaded below into dictionary_data and data_after_classification.
DictionaryFilePath = 'data/西安项目编码表.pkl'
DataFilePath = 'data/西安项目分类后.pkl'

## 函数工具

In [6]:
# Count the rows flagged with 1 for one classification column
def get_the_number_of_cases_of_classification(column_name, df):
    """Return how many rows of ``df`` hold the value 1 in ``column_name``.

    Args:
        column_name: Name of the column to inspect.
        df: DataFrame containing that column.

    Returns:
        The count of rows whose ``column_name`` value is 1, or 0 when the
        value 1 does not occur in the column.
    """
    # Number of rows per distinct value of the column.
    per_value_counts = df.groupby(column_name)[column_name].count()
    flag_present = 1 in per_value_counts.index.values
    return per_value_counts.loc[1] if flag_present else 0

In [39]:
# Count rows per publication period
def get_the_number_of_cases_of_period(name, df):
    """Return the number of rows of ``df`` for each distinct '发布时间' value.

    Args:
        name: Name given to the returned Series.
        df: DataFrame with a '发布时间' column.

    Returns:
        A Series indexed by '发布时间' holding the row count of each group,
        named ``name``.
    """
    # size() counts rows per group. The former count().iloc[:, 0] counted the
    # non-null cells of the first remaining column instead, which undercounts
    # when that column has missing values and raises IndexError when
    # '发布时间' is the only column.
    time_count_series = df.groupby('发布时间').size()
    time_count_series.name = name
    return time_count_series

## 导入数据

In [34]:
# Load the two pickled tables used by the rest of the notebook.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
dictionary_data = pd.read_pickle(DictionaryFilePath)
data_after_classification = pd.read_pickle(DataFilePath)

## 数据处理

### 分类总数统计

In [21]:
# For every second-level frame type, count the rows flagged 1 across all platforms.
dictionary_data['总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification,),
)

In [23]:
# Same count, restricted to rows whose '平台' value is '豆瓣'.
dictionary_data['豆瓣总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'] == '豆瓣'],),
)

In [34]:
# Same count, restricted to rows whose '平台' value is '知乎'.
dictionary_data['知乎总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'] == '知乎'],),
)

In [36]:
# Same count, restricted to rows whose '平台' value is '微博'.
dictionary_data['微博总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'] == '微博'],),
)

### 时间分布统计

In [35]:
# Work on a copy so the month-string conversion below does not overwrite the
# datetime column of data_after_classification. With a plain alias, re-running
# this cell failed because `.dt` was applied to already-formatted strings.
df = data_after_classification.copy()
# Collapse each timestamp to a 'YYYY-MM' string so rows can be grouped by month.
df['发布时间'] = df['发布时间'].dt.strftime('%Y-%m')
# Empty frame that the following cells fill with one count column per category.
period_data = pd.DataFrame()

In [43]:
# Print the second-level frame types. Iterating the Series directly yields its
# values; Series.iteritems() was removed in pandas 2.0 and the index was
# unused here.
for v in dictionary_data['二级框架类型']:
    print(v)

政治环境
经济发展
生态建设
文化风俗
重大事件
司法治安
医疗卫生
基础设施
科教文艺
休闲娱乐
社会民生
市民样貌
积极判断框架
消极判断框架
事件框架
细节框架


In [47]:
# Monthly row counts: the overall total first, then one column per
# second-level frame type.
period_data['总发布数'] = get_the_number_of_cases_of_period('总发布数', df)
# Iterate the Series values directly (Series.iteritems() was removed in pandas 2.0).
for v in dictionary_data['二级框架类型']:
    # Only rows whose frame-type column equals 1 are counted for that type.
    period_data[f'{v}发布数'] = get_the_number_of_cases_of_period(f'{v}总发布数', df[df[v] == 1])
# Months without any matching row are NaN after index alignment; fill them with
# the integer 0 (not the string '0') before casting every column to int.
period_data = period_data.fillna(0).astype('int')

In [57]:
# Add one monthly count column per platform.
for v in ['豆瓣','知乎','微博']:
    period_data[f'{v}发布数'] = get_the_number_of_cases_of_period(f'{v}发布数', df[df['平台'] == v])
# Months without any row for a platform are NaN after index alignment; fill them
# with the integer 0 (not the string '0') before casting every column to int.
period_data = period_data.fillna(0).astype('int')

## 展示&保存数据

In [58]:
# Alias the monthly statistics table for display and saving (no copy is made).
df_to_show = period_data

In [59]:
# Render the table in the notebook output.
display(df_to_show)

Unnamed: 0_level_0,总发布数,文化风俗发布数,政治环境发布数,经济发展发布数,生态建设发布数,重大事件发布数,司法治安发布数,医疗卫生发布数,基础设施发布数,科教文艺发布数,休闲娱乐发布数,社会民生发布数,市民样貌发布数,积极判断框架发布数,消极判断框架发布数,事件框架发布数,细节框架发布数,豆瓣发布数,知乎发布数,微博发布数
发布时间,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2009-04,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-08,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-11,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2009-12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06,1903,43,91,93,1,11,56,4,50,95,120,313,30,41,6,220,347,17,827,1059
2022-07,2254,121,95,58,1,15,44,38,46,116,153,341,22,31,16,321,358,24,894,1336
2022-08,752,37,46,30,0,11,4,3,19,41,50,151,7,22,2,101,153,72,571,109
2022-09,96,2,17,16,0,14,0,0,2,18,3,26,1,4,0,19,24,96,0,0


### 保存数据

In [60]:
# Write the monthly statistics table to a pickle file.
df_to_show.to_pickle('data/西安项目时间统计结果test.pkl')