# 情感曲线可视化

## 预设

### 导入库

In [None]:
import pandas as pd
from utils import *
import matplotlib.pyplot as plt
import matplotlib

### 全局参数

In [None]:
# Locations of the pickled input tables (bullet chats and subtitles).
# Plain string literals: there is nothing to interpolate, so no f-prefix.
BulDataFilePath = 'data/bullet_chats_sentiment.pkl'
SubDataFilePath = 'data/subtitle_sentiment.pkl'

### 全局设置

In [None]:
# 支持中文
print(matplotlib.matplotlib_fname())
a=sorted([f.name for f in matplotlib.font_manager.fontManager.ttflist])
for i in a:
    if 'Song' in i:
        print(i)
plt.rcParams['font.sans-serif'] = ['Songti SC'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
# 显示清晰
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 函数工具

In [None]:
def time_cut(df, time_column_name, time_step=1000):
    """Bucket a time column into fixed-width, left-closed intervals.

    Adds (or replaces) a ``<time_column_name>_group`` column on ``df`` that
    holds, for every row, the ``[k * time_step, (k + 1) * time_step)``
    interval containing the row's time value. Bins start at 0, so negative
    or missing time values end up as NaN in the group column.

    Args:
        df: DataFrame to annotate; it is modified in place.
        time_column_name: Name of the numeric time column.
        time_step: Width of each interval, in the same unit as the column.

    Returns:
        The same ``df`` object, now carrying the group column.
    """
    # Size the bins from the column maximum rather than from the last row:
    # rows are not guaranteed to be sorted by time (e.g. several episodes
    # stacked in one frame), and any value past the last bin edge would
    # silently become NaN.
    latest = df[time_column_name].max()
    # With no valid values the edges fall back to the single bin [0, time_step).
    upper = 0 if pd.isna(latest) else int(latest)
    # The "+ 1" keeps a maximum that is an exact multiple of time_step inside
    # the final left-closed interval instead of on its excluded right edge.
    bin_edges = list(range(0, upper + time_step + 1, time_step))
    df[f'{time_column_name}_group'] = pd.cut(df[time_column_name], bin_edges, right=False)
    return df

## 数据读取

In [None]:
# Load the bullet-chat and subtitle tables from their pickle files.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
bullet_data, subtitle_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in (BulDataFilePath, SubDataFilePath)
)

In [None]:
# Keep only the rows that have a confidence value.
# The explicit .copy() makes each result an independent frame, so assigning
# columns to it afterwards does not raise SettingWithCopyWarning.
bullet_data = bullet_data[bullet_data['confidence'].notna()].copy()
subtitle_data = subtitle_data[subtitle_data['confidence'].notna()].copy()

In [None]:
# Shift every bullet-chat progress value forward by 30 * 1000.
# NOTE(review): presumably a 30-second offset expressed in milliseconds — confirm.
# Re-running this cell applies the shift again.
bullet_data['progress'] = bullet_data['progress'].add(30 * 1000)

## 数据分析&可视化&保存

### 区间划分

In [None]:
# Bucket the 'progress' column of both tables into windows of width 30 * 1000
# via time_cut.
bullet_data, subtitle_data = (
    time_cut(frame, 'progress', 30 * 1000)
    for frame in (bullet_data, subtitle_data)
)

In [None]:
# Columns averaged inside every progress window.
metric_columns = ['confidence', 'negative_prob', 'positive_prob', 'sentiment']

# For episodes 1..6: per-window means of bullet chats and subtitles, joined on
# the window index with '_bullet' / '_subtitle' column suffixes.
statistics_data_list = []
for episode_no in range(1, 7):
    bullet_means, subtitle_means = (
        frame[frame['episode'] == episode_no]
        .groupby('progress_group')[metric_columns]
        .agg('mean')
        for frame in (bullet_data, subtitle_data)
    )
    statistics_data_list.append(
        bullet_means.join(subtitle_means, lsuffix='_bullet', rsuffix='_subtitle')
    )

In [None]:
# Draw one bullet-vs-subtitle sentiment chart per entry of statistics_data_list
# and save it under pic/, named after the entry's 0-based position in the list.
for position, episode_stats in enumerate(statistics_data_list):
    sentiment_curves = episode_stats.loc[:, ['sentiment_bullet', 'sentiment_subtitle']]
    # NOTE(review): 'gmv' is not one of the plotted columns, so secondary_y
    # looks like a leftover from other code — confirm and remove.
    ax = sentiment_curves.plot(
        figsize=(16, 10),
        secondary_y=['gmv'],
        x_compat=True,
        grid=True,
    )
    # Soften the grid: dashed and mostly transparent.
    ax.grid(visible=True, linestyle="--", alpha=0.3)
    plt.savefig(f'pic/{position}对比图.png')

## 测试&playground

In [None]:
def get_data_form_time_cut(episode, progress_start, progress_end, bullet_data, subtitle_data):
    """Collect the bullet-chat and subtitle rows of one episode in a time window.

    Args:
        episode: Value matched against the ``episode`` column of both frames.
        progress_start: Inclusive lower bound on ``progress``.
        progress_end: Exclusive upper bound on ``progress``.
        bullet_data: Bullet-chat DataFrame with ``episode``, ``progress``,
            ``positive_prob`` and ``content`` columns.
        subtitle_data: Subtitle DataFrame with the same columns.

    Returns:
        A DataFrame holding ``positive_prob``, ``content``, ``progress`` and
        ``type`` for the bullet rows, followed by the same four columns for
        the subtitle rows, placed side by side and aligned on the original
        row labels. The input frames are not modified.
    """
    def select_window(frame, source_label):
        # Honour the requested episode (it used to be hard-coded to 6) and
        # keep rows with progress in [progress_start, progress_end).
        in_window = (
            (frame['episode'] == episode)
            & (frame['progress'] >= progress_start)
            & (frame['progress'] < progress_end)
        )
        selected = frame.loc[in_window, ['positive_prob', 'content', 'progress']]
        # assign() returns a new frame, so tagging the source neither touches
        # the caller's data nor raises SettingWithCopyWarning.
        return selected.assign(type=source_label)

    bullet_res = select_window(bullet_data, 'bullet')
    subtitle_res = select_window(subtitle_data, 'subtitle')
    # NOTE(review): axis=1 puts the two sources side by side, matched on their
    # original row labels, which yields duplicate column names. The 'type'
    # column suggests a stacked table (axis=0) may have been intended —
    # confirm before changing the export layout.
    return pd.concat([bullet_res, subtitle_res], axis=1)

In [None]:
# Episode 6, progress window [630000, 660000): export the rows for inspection.
point1 = get_data_form_time_cut(
    episode=6,
    progress_start=630_000,
    progress_end=660_000,
    bullet_data=bullet_data,
    subtitle_data=subtitle_data,
)
point1.to_excel('res/point1.xlsx')

In [None]:
# Episode 6, progress window [1930000, 1960000): export the rows for inspection.
point2 = get_data_form_time_cut(
    episode=6,
    progress_start=1_930_000,
    progress_end=1_960_000,
    bullet_data=bullet_data,
    subtitle_data=subtitle_data,
)
point2.to_excel('res/point2.xlsx')

In [None]:
# Episode 6, progress window [1860000, 1890000): export the rows for inspection.
point3 = get_data_form_time_cut(
    episode=6,
    progress_start=1_860_000,
    progress_end=1_890_000,
    bullet_data=bullet_data,
    subtitle_data=subtitle_data,
)
point3.to_excel('res/point3.xlsx')

In [None]:
# Episode 6, progress window [1380000, 1410000): export the rows for inspection.
point4 = get_data_form_time_cut(
    episode=6,
    progress_start=1_380_000,
    progress_end=1_410_000,
    bullet_data=bullet_data,
    subtitle_data=subtitle_data,
)
point4.to_excel('res/point4.xlsx')

In [None]:
# Episode 6, progress window [1170000, 1200000): export the rows for inspection.
point5 = get_data_form_time_cut(
    episode=6,
    progress_start=1_170_000,
    progress_end=1_200_000,
    bullet_data=bullet_data,
    subtitle_data=subtitle_data,
)
point5.to_excel('res/point5.xlsx')