# 城市项目数据统计

## 预设

In [1]:
import sys
import os
import openpyxl
import csv
import re
import pandas as pd
import jieba
import jieba.posseg as psg
import matplotlib.pyplot as plt
import numpy as np
import cn2an
import functools
from loguru import logger
logger.remove()
handler_id = logger.add(sys.stderr, level="DEBUG")
# 支持中文
plt.rcParams['font.sans-serif'] = ['SimSong-Regular'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
# 显示清晰
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [8]:
# Notebook configuration.
# NOTE(review): the next four constants are not referenced in the visible cells —
# presumably left over from a keyword-extraction step (custom dictionary, stop words,
# top-N keywords, minimum word length); confirm before relying on or removing them.
ExtraDictionaryPath = 'set/dict.txt'
StopWordPath = 'set/stopwords.txt'
KeywordTopNumber = 10
minLengthOfWord = 3
# Pickled inputs read by the data-loading cell.
DictionaryFilePath = '../data/西安城市形象编码词表_数量分布.pkl'
DataFilePath = '../data/西安城市形象数据_关键词_议题分类.pkl'

## 函数工具

In [3]:
# Count the posts that belong to one classification.
def get_the_number_of_cases_of_classification(column_name, df):
    """Return how many rows of ``df`` have the value 1 in ``column_name``.

    Args:
        column_name: Label of the classification column to inspect.
        df: DataFrame containing that column.

    Returns:
        The number of rows whose value equals 1, or 0 when the value 1 never occurs.
    """
    per_value_counts = df.groupby(column_name)[column_name].count()
    # No row carries the value 1: report zero instead of failing on the lookup.
    if 1 not in per_value_counts.index.values:
        return 0
    return per_value_counts.loc[1]

In [17]:
# Count the posts in each group (e.g. per publication month).
def get_the_number_of_cases_of_period(column_name, df, group_name):
    """Return the number of rows of ``df`` in each ``group_name`` group.

    Args:
        column_name: Name assigned to the returned Series.
        df: DataFrame to count rows of.
        group_name: Column label (or any key accepted by ``DataFrame.groupby``)
            that defines the groups.

    Returns:
        pandas.Series indexed by group key, holding the row count of each group
        and named ``column_name``.
    """
    # size() counts rows. count() would tally the non-null cells of each column,
    # so taking its first column under-counts whenever that column has missing
    # values and fails outright when the frame has no column besides the key.
    period_counts = df.groupby(group_name).size()
    period_counts.name = column_name
    return period_counts

In [38]:
# Per-group post counts, optionally restricted to a single platform.
def get_app_group_counts(df_data, group_name, app_name=None, column_name=None, is_fill_zero=False):
    """Count the rows of ``df_data`` in each ``group_name`` group.

    Args:
        df_data: DataFrame to count rows of; must contain a '平台' column when
            ``app_name`` is given.
        group_name: Column label (or any key accepted by ``DataFrame.groupby``)
            that defines the groups.
        app_name: When truthy, only rows whose '平台' value equals it are counted.
        column_name: When truthy, name assigned to the returned Series;
            otherwise the Series is left unnamed.
        is_fill_zero: When true, missing counts are replaced by integer 0.

    Returns:
        pandas.Series of dtype ``Int64`` indexed by group key.
    """
    # Filter first so both cases share one grouping path.
    rows = df_data[df_data['平台'] == app_name] if app_name else df_data
    # size() counts rows; counting the non-null cells of the first column would
    # under-count groups where that column has missing values.
    res_series = rows.groupby(group_name).size().astype('Int64')
    if column_name:
        res_series.name = column_name
    if is_fill_zero:
        # Fill with an integer: a string fill value does not fit an Int64 series.
        res_series = res_series.fillna(0)
    return res_series

## 导入数据

In [9]:
# Load the two pickled inputs configured above.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
dictionary_data = pd.read_pickle(DictionaryFilePath)
data_after_classification = pd.read_pickle(DataFilePath)

## 数据处理

### 分类总数统计

In [21]:
# For every second-level frame type, count the rows of the full data set whose
# column of that name equals 1.
dictionary_data['总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification,),
)

In [23]:
# Same per-frame count, restricted to rows whose platform is 豆瓣.
dictionary_data['豆瓣总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'].eq('豆瓣')],),
)

In [34]:
# Same per-frame count, restricted to rows whose platform is 知乎.
dictionary_data['知乎总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'].eq('知乎')],),
)

In [36]:
# Same per-frame count, restricted to rows whose platform is 微博.
dictionary_data['微博总数'] = dictionary_data['二级框架类型'].apply(
    get_the_number_of_cases_of_classification,
    args=(data_after_classification.loc[data_after_classification['平台'].eq('微博')],),
)

### 时间分布统计

In [16]:
# `df` is a second name for the classified data (same object, not a copy).
df = data_after_classification
# Empty frame that the following cells fill with one column per statistic.
period_data = pd.DataFrame()

In [20]:
# Monthly totals over all rows.
period_data['总发布数'] = get_the_number_of_cases_of_period('发布数', df, '发布时间月份')
# One column per entry of dictionary_data.index: monthly count of the rows whose
# column of that name equals 1.
for frame_name in dictionary_data.index:
    flagged_rows = df[df[frame_name] == 1]
    period_data[f'{frame_name}发布数'] = get_the_number_of_cases_of_period(
        f'{frame_name}发布数', flagged_rows, '发布时间月份'
    )
# Months without matching rows are left as NaN here; zero-filling and the integer
# cast happen in a later cell, after the per-platform columns are added.

In [37]:
# Bare expression: the notebook renders the table built so far as this cell's output.
period_data

Unnamed: 0_level_0,总发布数,政治环境发布数,经济发展发布数,生态建设发布数,文化风俗发布数,重大事件发布数,司法治安发布数,医疗卫生发布数,基础设施发布数,科教文艺发布数,休闲娱乐发布数,社会民生发布数,市民样貌发布数,积极判断框架发布数,消极判断框架发布数,事件框架发布数,细节框架发布数
发布时间月份,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-08,738,179,137,33,32,14,51,43,79,140,213,138,15,217,61,175,112
2017-09,790,120,115,40,72,20,51,15,85,155,256,139,12,210,41,127,109
2017-10,755,158,103,24,53,22,32,31,62,185,247,138,7,220,37,130,105
2017-11,958,179,162,45,76,30,87,50,117,261,253,167,16,227,73,189,162
2017-12,762,142,115,38,51,7,41,33,93,184,218,147,15,187,51,129,117
2018-01,1138,204,175,75,82,7,78,37,181,264,342,251,18,282,115,232,157
2018-02,817,171,114,33,106,14,55,19,81,126,306,175,15,278,79,167,99
2018-03,6105,1170,718,222,336,43,206,190,523,1162,1496,1061,92,1268,357,1190,697
2018-04,4602,616,482,158,336,46,145,142,408,916,1262,833,47,970,282,589,475
2018-05,4472,715,572,153,342,97,125,158,419,879,1211,749,73,946,292,698,480


In [57]:
# One column per platform: monthly count of that platform's rows.
for platform in ['豆瓣', '知乎', '微博']:
    # The grouping column must be passed explicitly; it is a required argument
    # and has to match the month key used for the other columns.
    period_data[f'{platform}发布数'] = get_the_number_of_cases_of_period(
        f'{platform}发布数', df[df['平台'] == platform], '发布时间月份'
    )
# Months with no matching rows are NaN after index alignment; store them as
# integer zero (numeric fill, so no string values pass through the frame).
period_data = period_data.fillna(0).astype('int')

## 展示&保存数据

In [58]:
# Select the table to display and save (same object as period_data, not a copy).
df_to_show = period_data

In [59]:
# Render the selected table as rich cell output.
display(df_to_show)

Unnamed: 0_level_0,总发布数,文化风俗发布数,政治环境发布数,经济发展发布数,生态建设发布数,重大事件发布数,司法治安发布数,医疗卫生发布数,基础设施发布数,科教文艺发布数,休闲娱乐发布数,社会民生发布数,市民样貌发布数,积极判断框架发布数,消极判断框架发布数,事件框架发布数,细节框架发布数,豆瓣发布数,知乎发布数,微博发布数
发布时间,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2009-04,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-06,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-08,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2009-11,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2009-12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06,1903,43,91,93,1,11,56,4,50,95,120,313,30,41,6,220,347,17,827,1059
2022-07,2254,121,95,58,1,15,44,38,46,116,153,341,22,31,16,321,358,24,894,1336
2022-08,752,37,46,30,0,11,4,3,19,41,50,151,7,22,2,101,153,72,571,109
2022-09,96,2,17,16,0,14,0,0,2,18,3,26,1,4,0,19,24,96,0,0


### 保存数据

In [60]:
# Persist the displayed table as a pickle file.
# NOTE(review): this writes under 'data/' relative to the working directory, while
# the inputs are read from '../data/' — confirm the target directory is intended
# and exists before running.
df_to_show.to_pickle('data/西安项目时间统计结果test.pkl')