## Part2 数据分析

<font color='red'>*若已有数据集可以直接从此开始*</font>

此 `notebook` 基于 `plotly`

### 导入相关库和设置

In [None]:
import csv
import numpy as np
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

from settings import username
from settings import api_key
from settings import dataset_file
from settings import csv_title

# Configure plotly: register the account credentials (username / api_key,
# both imported from settings above) through plotly's credentials-file helper.
plotly.tools.set_credentials_file(username=username, api_key=api_key)


### 绘制友邻分布地图

In [None]:
from settings import loc_lat
from settings import loc_lon
from settings import mapbox_access_token

# Count how many friends live in each region (first CSV column).
# A dict keyed by region name replaces the former list.index() lookup, so each
# row is handled in constant time instead of scanning every known region.
region_count = {}

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip blank rows (deactivated users), rows without a region and the
        # header row.
        if not line or line[0] == '' or line == csv_title:
            continue
        region = line[0]
        # Skip regions that lack a latitude or a longitude: they cannot be
        # placed on the map (and a missing longitude used to raise KeyError).
        if loc_lat.get(region) is None or loc_lon.get(region) is None:
            continue
        region_count[region] = region_count.get(region, 0) + 1

# Parallel lists consumed by the map trace. All of them are derived from the
# same `loc` sequence, so entry i always describes the same region.
# Region names
loc = list(region_count)
# Number of friends in each region
num = [region_count[name] for name in loc]
# Latitude of each region
lat = [loc_lat[name] for name in loc]
# Longitude of each region
lon = [loc_lon[name] for name in loc]

# Text shown when hovering over a marker: "<region>   <count>"
text = [str(name) + '   ' + str(count) for name, count in zip(loc, num)]

# One scatter trace on the map: a fixed-size marker per region, with the
# prepared hover text attached.
marker = go.scattermapbox.Marker(size=9)  # marker size
trace = go.Scattermapbox(
    lat=lat,
    lon=lon,
    mode='markers',
    marker=marker,
    text=text,
)
data = [trace]

# Initial map view. A valid Mapbox access token is required for the map to
# render.
center = go.layout.mapbox.Center(lat=34, lon=108)
mapbox = go.layout.Mapbox(
    accesstoken=mapbox_access_token,
    bearing=0,
    center=center,
    pitch=0,
    zoom=3.5,
)
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    height=800,
    title='友邻地区分布',
    mapbox=mapbox,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='neighbor_distribution_map')


### 绘制友邻男女广播图

In [None]:
from math import ceil
from bisect import bisect_left

from settings import status_range

# Per-bucket head counts for the population-pyramid chart. Bucket i covers the
# status counts between status_range[i] and status_range[i + 1]; the last
# bucket is open-ended.
# Male counts are stored as positive numbers.
male_status_num = np.zeros(len(status_range), dtype=int)
# Female counts are stored as NEGATIVE numbers so their bars grow to the left.
female_status_num = np.zeros(len(status_range), dtype=int)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip blank rows (deactivated users), rows without a status count and
        # the header row.
        if not line or line[1] == '' or line == csv_title:
            continue
        gender = line[3]
        if gender not in ('M', 'F'):
            continue
        # bisect_left(...) - 1 maps a value v with
        # status_range[i] < v <= status_range[i + 1] to bucket i.
        # Clamp at 0: for v <= status_range[0] the raw result is -1, which
        # would silently wrap around and be counted in the LAST bucket.
        bucket = max(bisect_left(status_range, int(line[1])) - 1, 0)
        if gender == 'M':
            male_status_num[bucket] += 1
        else:
            female_status_num[bucket] -= 1

# Largest head count in any bucket. Female counts are negative, so the largest
# female bucket is the array minimum (negated), not its maximum.
length = int(max(male_status_num.max(), -female_status_num.min()))

# Round the x-axis boundary up to a multiple of 30.
boundary = 30 * ceil(length / 30)

# Bucket labels shown on the y axis: "low - high" for every closed interval
# and "last +" for the open-ended final bucket.
label = ['{} - {}'.format(low, high)
         for low, high in zip(status_range, status_range[1:])]
label.append(str(status_range[-1]) + ' +')

# Tick positions used for plotting; female bars live on the negative side.
tick_positions = list(range(20 - boundary, boundary, 20))
# Tick captions: absolute values, so both sides read as positive head counts.
tick_captions = [abs(position) for position in tick_positions]

x_axis = go.layout.XAxis(
    range=[-boundary, boundary],
    tickvals=tick_positions,
    ticktext=tick_captions,
    title='人数',
)
y_axis = go.layout.YAxis(title='广播数量')

layout = go.Layout(
    title='友邻广播',
    yaxis=y_axis,
    xaxis=x_axis,
    barmode='overlay',
    bargap=0.1,
)

# Male bars: hover shows the x value directly.
male_bar = go.Bar(
    y=label,
    x=male_status_num,
    orientation='h',
    name='男',
    hoverinfo='x',
    marker=dict(color='lightskyblue'),
    opacity=0.8,
)
# Female bars: x values are negative, so hover shows the sign-flipped counts
# supplied through `text` instead.
female_bar = go.Bar(
    y=label,
    x=female_status_num,
    orientation='h',
    name='女',
    text=-1 * female_status_num.astype('int'),
    hoverinfo='text',
    marker=dict(color='gold'),
    opacity=0.8,
)
data = [male_bar, female_bar]

py.iplot(dict(data=data, layout=layout), filename='status_pyramid_chart')


### 绘制注册时间图

In [None]:
from settings import reg_year_range

# One counter per registration year listed in reg_year_range.
reg_year_num = np.array([0] * len(reg_year_range))

with open(dataset_file, 'r', encoding='utf-8') as file:
    for row in csv.reader(file):
        # Blank rows (deactivated users), rows without a registration year and
        # the header row carry no usable data.
        skip_row = len(row) == 0 or row[2] == '' or row == csv_title
        if not skip_row:
            year = int(row[2])
            reg_year_num[reg_year_range.index(year)] += 1

# Pie chart with one slice per year; each slice is outlined in black and
# labelled with its year.
slice_outline = {'line': {'color': 'black', 'width': 1}}
pie_trace = go.Pie(
    labels=reg_year_range,
    values=reg_year_num,
    textinfo='label',
    marker=slice_outline,
)

py.iplot([pie_trace], filename='reg_year_pie_chart')


### 绘制观影数据图

In [None]:
import cufflinks as cf
import pandas as pd

# Use plotly's online mode and make the resulting chart world-readable.
cf.set_config_file(world_readable=True, offline=False)

# Load the dataset and drop every row that has a missing value.
raw_frame = pd.read_csv(dataset_file)
df = raw_frame.dropna()

# Bubble chart, columns picked by their position in the CSV header:
#   x axis      -> csv_title[5] (watch time)
#   y axis      -> csv_title[6] (spending)
#   bubble size -> csv_title[4] (number of movies watched), also the hover text
bubble_options = dict(
    kind='bubble',
    x=csv_title[5],
    y=csv_title[6],
    size=csv_title[4],
    text=csv_title[4],
    xTitle='观看时间',
    yTitle='消费',
    colorscale='blues',
    filename='movie_bubble_chart',
)
df.iplot(**bubble_options)


### 绘制友邻常看电影类型分布图

In [None]:
from settings import genre_range

# How many friends list each genre among their favourites; genre_num[i]
# belongs to genre_range[i].
genre_num = np.zeros(len(genre_range), dtype=int)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip blank rows (deactivated users) and the header row.
        if not line or line == csv_title:
            continue
        # Columns 10-12 hold each friend's three most-watched genres; an
        # empty cell means "no data".
        for genre in line[10:13]:
            if genre != '':
                genre_num[genre_range.index(genre)] += 1

# Pick the six most common genres. A stable sort on the negated counts orders
# by descending count and breaks ties by position in genre_range. Unlike
# repeatedly taking argmax and zeroing the winner, this never selects the same
# genre twice when fewer than six genres occur, and it leaves genre_num intact.
# Genres nobody watches are left out.
top_indices = [i for i in np.argsort(-genre_num, kind='stable')[:6]
               if genre_num[i] > 0]
# Ascending order, most common genre last (plotly draws the first category at
# the bottom of a horizontal bar chart, so presumably this puts the largest
# bar on top).
top_indices.reverse()

label = [genre_range[i] for i in top_indices]
num = [int(genre_num[i]) for i in top_indices]

# Horizontal bar chart with the count printed on each bar.
data = [go.Bar(
    x=num,
    y=label,
    text=num,
    textposition='auto',
    orientation='h',
    marker=dict(color='gold'),
    opacity=0.8
)]

py.iplot(data, filename='genre_horizontal_bar_chart')


### 绘制友邻常看电影地区分布图

In [None]:
from settings import country_range

# How many friends list each production region among their favourites;
# country_num[i] belongs to country_range[i].
country_num = np.zeros(len(country_range), dtype=int)

with open(dataset_file, 'r', encoding='utf-8') as file:
    for line in csv.reader(file):
        # Skip blank rows (deactivated users) and the header row.
        if not line or line == csv_title:
            continue
        # Columns 8-9 hold each friend's two most-watched regions; an empty
        # cell means "no data".
        for country in line[8:10]:
            if country != '':
                country_num[country_range.index(country)] += 1

# Total number of region votes; the denominator for every donut chart.
total = int(country_num.sum())

# Layout of the 2 x 3 grid of donut charts.
# Horizontal extent of each pie
domain_x = ([0, 0.24], [0.38, 0.62], [0.76, 1], [0, 0.24], [0.38, 0.62], [0.76, 1])
# Vertical extent of each pie
domain_y = ([0.6, 1], [0.6, 1], [0.6, 1], [0, 0.4], [0, 0.4], [0, 0.4])
colors = ('lightskyblue', 'lightcoral', 'lightgreen', 'lightskyblue', 'lightcoral', 'lightgreen')
# x position of the caption inside each pie
x = (0.09, 0.5, 0.91, 0.09, 0.5, 0.91)
# y position of the caption inside each pie
y = (0.84, 0.84, 0.84, 0.16, 0.16, 0.16)

# Pick the most common regions, one per available grid slot. A stable sort on
# the negated counts orders by descending count and breaks ties by position in
# country_range. Unlike repeatedly taking argmax and zeroing the winner, this
# never selects the same region twice when fewer regions occur than there are
# slots, and it leaves country_num intact. Regions nobody watches are left out.
top_indices = [i for i in np.argsort(-country_num, kind='stable')[:len(domain_x)]
               if country_num[i] > 0]

# Pie traces
data = []
# Captions shown in the centre of each donut
annotations = []
for slot, index in enumerate(top_indices):
    num = int(country_num[index])

    # Two-slice donut: this region versus everything else.
    data.append({
        'labels': [country_range[index], '其他'],
        'values': [num, total - num],
        'type': 'pie',
        'marker': {'colors': [colors[slot], 'whitesmoke']},
        'domain': {'x': domain_x[slot], 'y': domain_y[slot]},
        'hoverinfo': 'label+percent',
        'hole': .75,
    })

    annotations.append({
        'font': {'size': 16},
        'showarrow': False,
        'text': country_range[index],
        'x': x[slot],
        'y': y[slot]
    })

fig = {
    'data': data,
    'layout': {
        'title': '友邻常看电影地区分布图',
        'grid': {'rows': 2, 'columns': 3},
        'annotations': annotations
    }
}

py.iplot(fig, filename='country_pie_chart')
