## Part2 数据分析

<font color='red'>*若已有数据集可以直接从此开始*</font>

此 `notebook` 基于 `pyecharts`

### 导入相关库和设置

In [None]:
import csv

from settings import dataset_file
from settings import csv_title


### 绘制友邻分布地图

In [None]:
from pyecharts import options as opts
from pyecharts.charts import Geo

from settings import csv_title
from settings import loc_range
from settings import loc_dic

# Head count per location: loc_num[i] belongs to loc_range[i].
loc_num = [0] * len(loc_range)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for row in csv.reader(file):
        # Skip empty rows (deactivated accounts), rows without a location,
        # and the header row.
        if not row or row[0] == '' or row == csv_title:
            continue
        province = loc_dic.get(row[0])
        if province is None:
            # The raw location has no entry in loc_dic; leave it out.
            continue
        loc_num[loc_range.index(province)] += 1

# print(loc_num)

# Chart data: (place name, head count) pairs, omitting places with nobody.
data_pair = [pair for pair in zip(loc_range, loc_num) if pair[1] != 0]

geo = Geo()
geo.add_schema(maptype='china')
geo.add('', data_pair)
geo.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
geo.set_global_opts(
    title_opts=opts.TitleOpts(title='友邻地区分布'),
    # max_ is the upper bound of the visual map; set it to fit the data set.
    visualmap_opts=opts.VisualMapOpts(max_=60),
)

# geo.render()
geo.render_notebook()


### 绘制友邻男女广播图

In [None]:
from bisect import bisect_left
from pyecharts import options as opts
from pyecharts.charts import Bar

from settings import status_range

# Imported here because the bucket lookup below needs bisect_right rather
# than the bisect_left imported at the top of this cell.
from bisect import bisect_right

# Per-gender histograms: entry i counts the friends whose broadcast count
# falls in the bucket that starts at status_range[i]. The last bucket is
# open-ended ("N +").
male_status_num = [0] * len(status_range)
female_status_num = [0] * len(status_range)
status_num_by_gender = {'M': male_status_num, 'F': female_status_num}

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip empty rows (deactivated accounts), rows without a broadcast
        # count, and the header row.
        if len(line) == 0 or line[1] == '' or line == csv_title:
            continue
        # Column 3 is the gender flag; rows that are neither 'M' nor 'F'
        # are not counted.
        counts = status_num_by_gender.get(line[3])
        if counts is None:
            continue
        # bisect_right(...) - 1 is the index of the last boundary that is
        # <= the value, so a value equal to a boundary opens that boundary's
        # bucket. This keeps 0 broadcasts in the first bucket instead of
        # wrapping around to index -1 (the last bucket).
        # status_range is assumed to be sorted ascending, as bisect requires.
        # Values below status_range[0] are clamped into the first bucket.
        index = max(bisect_right(status_range, int(line[1])) - 1, 0)
        counts[index] += 1

# print(male_status_num)
# print(female_status_num)

# X-axis labels: one "low - high" label per pair of adjacent boundaries,
# plus an open-ended label for everything from the last boundary upwards.
label = [
    '{} - {}'.format(low, high)
    for low, high in zip(status_range, status_range[1:])
]
label.append('{} +'.format(status_range[-1]))
# print(label)

bar_stack = (
    Bar()
    .add_xaxis(label)
    # Both series share one stack id so they are drawn as a stacked bar.
    .add_yaxis('男', male_status_num, stack='stack1')
    .add_yaxis('女', female_status_num, stack='stack1')
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(title_opts=opts.TitleOpts(title='友邻广播'))
)

# bar_stack.render()
bar_stack.render_notebook()


### 绘制注册时间图

In [None]:
from pyecharts import options as opts
from pyecharts.charts import Pie

from settings import reg_year_range

# reg_year_num[i] is the number of friends registered in reg_year_range[i].
reg_year_num = [0] * len(reg_year_range)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for row in csv.reader(file):
        # Skip empty rows (deactivated accounts), rows without a
        # registration year, and the header row.
        if not row or row[2] == '' or row == csv_title:
            continue
        year = int(row[2])
        reg_year_num[reg_year_range.index(year)] += 1

# print(reg_year_num)

# One [year, count] slice per year, in reg_year_range order.
pie_slices = [
    [year, count] for year, count in zip(reg_year_range, reg_year_num)
]
pie_title = opts.TitleOpts(title='友邻注册时间分布图', pos_top='10%')
pie_legend = opts.LegendOpts(
    orient='vertical',
    pos_top='15%',
    pos_right='2%',
)

pie = (
    Pie()
    .add('', pie_slices)
    .set_global_opts(title_opts=pie_title, legend_opts=pie_legend)
    # Slice labels read "<name>: <value>".
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}: {c}'))
)

# pie.render()
pie.render_notebook()


### 绘制观影数据图

In [None]:
from pyecharts import options as opts
from pyecharts.charts import Scatter3D

# One (x, y, z) integer triple per friend, taken from columns 4, 5 and 6.
data = []

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip empty rows (deactivated accounts) and the header row.
        if len(line) == 0 or line == csv_title:
            continue
        # Only the three plotted columns decide whether a row is usable;
        # a missing registration year (column 2) is irrelevant to this
        # chart and must not drop an otherwise complete row.
        fields = (line[4], line[5], line[6])
        if '' in fields:
            continue
        data.append(tuple(int(field) for field in fields))

# Axis meaning, per the notebook's notes:
# x axis: number of movies watched
# y axis: time spent watching
# z axis: money spent
scatter3d = (
    Scatter3D()
    .add('',
         data,
         xaxis3d_opts=opts.Axis3DOpts(type_='value'),
         yaxis3d_opts=opts.Axis3DOpts(type_='value'),
         zaxis3d_opts=opts.Axis3DOpts(type_='value'))
    .set_global_opts(
        # Pass the title by keyword, like every other chart in this
        # notebook, so it does not depend on TitleOpts' positional order.
        title_opts=opts.TitleOpts(title='观影数据三维散点图'),
        # max_ is the upper bound of the visual map; set it to fit the
        # data set.
        visualmap_opts=opts.VisualMapOpts(max_=10000),
    )
)

# scatter3d.render()
scatter3d.render_notebook()


### 绘制友邻常看电影类型分布图

In [None]:
from pyecharts import options as opts
from pyecharts.charts import Bar

from settings import genre_range

# genre_num[i] counts how often genre_range[i] appears among the friends'
# most-watched genres.
genre_num = [0] * len(genre_range)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip empty rows (deactivated accounts) and the header row.
        if len(line) == 0 or line == csv_title:
            continue
        # Columns 10-12 hold each friend's three most-watched genres; any
        # of them may be empty.
        for column in (10, 11, 12):
            genre = line[column]
            if genre != '':
                genre_num[genre_range.index(genre)] += 1

# print(genre_num)

# Pick the six most common genres. The sort is stable, so ties keep their
# genre_range order, and every genre is selected at most once even when
# fewer than six genres have a non-zero count. genre_num is left untouched.
top_genres = sorted(
    zip(genre_range, genre_num),
    key=lambda pair: pair[1],
    reverse=True,
)[:6]
# Flip to ascending order before charting, as the original did, because the
# axes are swapped below.
top_genres.reverse()

label = [genre for genre, _ in top_genres]
num = [count for _, count in top_genres]

# print(num)
# print(label)

bar = (
    Bar()
    .add_xaxis(label)
    .add_yaxis('', num)
    # Swap the axes to get a horizontal bar chart.
    .reversal_axis()
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(title_opts=opts.TitleOpts(title='友邻常看电影类型分布图'))
)

# bar.render()
bar.render_notebook()


### 绘制友邻常看电影地区分布图

In [None]:
from pyecharts import options as opts
from pyecharts.charts import Pie

from settings import country_range

# total is the number of non-empty country entries seen; country_num[i]
# counts how many of them equal country_range[i].
total = 0
country_num = [0] * len(country_range)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip empty rows (deactivated accounts) and the header row.
        if len(line) == 0 or line == csv_title:
            continue
        # Columns 8-9 hold each friend's two most-watched regions; either
        # may be empty.
        for column in (8, 9):
            country = line[column]
            if country != '':
                country_num[country_range.index(country)] += 1
                total += 1

# print(country_num)

# Pick the six most common regions, largest first. The sort is stable, so
# ties keep their country_range order, and every region is selected at most
# once even when fewer than six regions have a non-zero count. country_num
# is left untouched.
top_countries = sorted(
    zip(country_range, country_num),
    key=lambda pair: pair[1],
    reverse=True,
)[:6]
label = [country for country, _ in top_countries]
value = [count for _, count in top_countries]

# Centre [x, y] of each ring: a 3 x 2 grid filled row by row.
ring_centers = [
    ['20%', '36%'], ['50%', '36%'], ['80%', '36%'],
    ['20%', '80%'], ['50%', '80%'], ['80%', '80%'],
]

pie_radius = Pie()
# One ring per region, showing that region's share against everything else.
# zip() stops early if fewer than six regions are available.
for (country, count), center in zip(top_countries, ring_centers):
    pie_radius.add(
        '',
        [(country, count), ('其他', total - count)],
        # Inner radius, outer radius.
        radius=['18%', '26%'],
        center=center,
        is_clockwise=False,
    )

# Colors follow the order of the legend entries, i.e. the order of
# decreasing value.
pie_radius.set_colors(['lightskyblue', 'whitesmoke', 'lightcoral', 'lightgreen', 'lightskyblue', 'lightcoral', 'lightgreen'])
pie_radius.set_global_opts(
    title_opts=opts.TitleOpts(title='友邻常看电影地区分布图',
                              pos_top='5%'),
    legend_opts=opts.LegendOpts(pos_top='5%'),
)
# Slice labels read "<name>: <value>".
pie_radius.set_series_opts(label_opts=opts.LabelOpts(formatter='{b}: {c}'))

# pie_radius.render()
pie_radius.render_notebook()
