In [37]:
#导入必要的工具包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from plotly import tools
import plotly.express as px
from plotly.offline import init_notebook_mode,iplot,plot
import plotly.figure_factory as ff
import plotly.graph_objs as go

In [60]:
# Load the training and test CSV files from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Display the training set.
df_train

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0
...,...,...,...,...,...,...
35990,35991,,Zimbabwe,2020-05-11,36.0,4.0
35991,35992,,Zimbabwe,2020-05-12,36.0,4.0
35992,35993,,Zimbabwe,2020-05-13,37.0,4.0
35993,35994,,Zimbabwe,2020-05-14,37.0,4.0


In [61]:
# Display the test set.
df_test

Unnamed: 0,ForecastId,Province_State,Country_Region,Date
0,1,,Afghanistan,2020-04-02
1,2,,Afghanistan,2020-04-03
2,3,,Afghanistan,2020-04-04
3,4,,Afghanistan,2020-04-05
4,5,,Afghanistan,2020-04-06
...,...,...,...,...
13454,13455,,Zimbabwe,2020-05-10
13455,13456,,Zimbabwe,2020-05-11
13456,13457,,Zimbabwe,2020-05-12
13457,13458,,Zimbabwe,2020-05-13


# 任务目标
使用 train 数据训练模型，然后对 test 数据集中每个国家（地区）在每个日期的确诊病例数和死亡病例数进行预测。

In [40]:
# Data screening and cleaning

# Count the missing values in each column.
df_train.isna().sum()

Id                    0
Province_State    20700
Country_Region        0
Date                  0
ConfirmedCases        0
Fatalities            0
dtype: int64

In [66]:
# "Province_State" is the only column with missing values (see the null counts
# above), so fill just that column with an empty string. A blanket fillna("")
# would also replace any NaN in a numeric column with "" and silently turn that
# column into object dtype.
df_train = df_train.fillna({"Province_State": ""})
df_train

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0
...,...,...,...,...,...,...
35990,35991,,Zimbabwe,2020-05-11,36.0,4.0
35991,35992,,Zimbabwe,2020-05-12,36.0,4.0
35992,35993,,Zimbabwe,2020-05-13,37.0,4.0
35993,35994,,Zimbabwe,2020-05-14,37.0,4.0


In [42]:
# Get a statistical overview of the data with describe().
df_train.describe()

Unnamed: 0,Id,ConfirmedCases,Fatalities
count,35995.0,35995.0,35995.0
mean,17998.0,3683.508737,243.560217
std,10391.005806,18986.978708,1832.966999
min,1.0,0.0,0.0
25%,8999.5,0.0,0.0
50%,17998.0,19.0,0.0
75%,26996.5,543.0,7.0
max,35995.0,345813.0,33998.0


# 数据分析、可视化

In [67]:

# Highest confirmed-case count reached by each country

# 1. Sum ConfirmedCases per (country, date) to get each country's total on every date.
# 2. Take the maximum of those per-date totals for each country (its historical peak).
# 3. Sort the peaks in descending order and keep the top 20 for plotting.
df_countries = (
    df_train.groupby(["Country_Region", "Date"])["ConfirmedCases"]
    .sum()
    .groupby(level="Country_Region")
    .max()
    .sort_values(ascending=False)
    .head(20)
)
df_countries

Country_Region
US                1442653.0
Russia             262843.0
United Kingdom     238004.0
Spain              230183.0
Italy              223885.0
Brazil             220291.0
France             179630.0
Germany            175233.0
Turkey             146457.0
Iran               116635.0
India               85784.0
Peru                84495.0
China               84038.0
Canada              75945.0
Belgium             54644.0
Saudi Arabia        49176.0
Mexico              45032.0
Netherlands         43880.0
Chile               39542.0
Pakistan            38799.0
Name: ConfirmedCases, dtype: float64

In [44]:
# Draw the top-20 ranking as a plotly bar chart, coloured by confirmed-case count.
fig = px.bar(
    df_countries,
    x=df_countries.index,
    y="ConfirmedCases",
    labels={"x": "Country"},
    color="ConfirmedCases",
    color_continuous_scale=px.colors.sequential.Bluered,
)
fig.update_layout(title_text="国家历史最高确诊数")
fig.show()

# 结论
由上图可得出结论：各国的历史最高确诊数差异悬殊（美国约 144 万，远高于排名第二的俄罗斯约 26 万），说明确诊病例数与“国家”这一特征密切相关。

In [45]:
# Using the US as an example, analyse how confirmed cases change over time.

# Aggregate by (country, date). Only the two case-count columns are summed:
# a bare .sum() over the whole frame would also aggregate columns such as Id and
# Province_State, which is wasted work and produces meaningless values.
df_country_records = df_train.groupby(["Country_Region", "Date"])[
    ["ConfirmedCases", "Fatalities"]
].sum()
# Select the US rows together with the confirmed-case and fatality columns.
df_usa_records = df_country_records.loc["US", ["ConfirmedCases", "Fatalities"]]
# Turn the Date index back into an ordinary column so it can serve as the x axis.
df_usa_records = df_usa_records.reset_index()
df_usa_records

Unnamed: 0,Date,ConfirmedCases,Fatalities
0,2020-01-22,0.0,0.0
1,2020-01-23,0.0,0.0
2,2020-01-24,0.0,0.0
3,2020-01-25,0.0,0.0
4,2020-01-26,0.0,0.0
...,...,...,...
110,2020-05-11,1347710.0,80677.0
111,2020-05-12,1369403.0,82371.0
112,2020-05-13,1390235.0,84114.0
113,2020-05-14,1417603.0,85893.0


In [46]:
# Bar chart of US confirmed cases by date, drawn with plotly.
fig = px.bar(
    df_usa_records,
    x="Date",
    y="ConfirmedCases",
    color="ConfirmedCases",
    color_continuous_scale=px.colors.sequential.Magma,
)
fig.update_layout(title_text="美国随时间确诊病例数")
fig.show()

In [47]:
# Same view for US fatalities by date.
fig = px.bar(
    df_usa_records,
    x="Date",
    y="Fatalities",
    color="Fatalities",
    color_continuous_scale=px.colors.sequential.Magma,
)
fig.update_layout(title_text="美国随时间死亡病例数")
fig.show()

In [49]:
# Spot-check a second country; Brazil is used as the example.

# Brazil: confirmed cases over time.
# Take Brazil's rows from the per-country aggregate and restore Date as a column.
df_brz_records = (
    df_country_records
    .loc["Brazil", ["ConfirmedCases", "Fatalities"]]
    .reset_index()
)
fig = px.bar(
    df_brz_records,
    x="Date",
    y="ConfirmedCases",
    color="ConfirmedCases",
    color_continuous_scale=px.colors.sequential.Magma,
)
fig.update_layout(title_text="巴西随时间确诊病例数")
fig.show()


In [50]:
# Brazil: fatalities over time.
fig = px.bar(
    df_brz_records,
    x="Date",
    y="Fatalities",
    color="Fatalities",
    color_continuous_scale=px.colors.sequential.Magma,
)
fig.update_layout(title_text="巴西随时间死亡病例数")
fig.show()

# 结论
从上图中可以看出，巴西的确诊数和死亡数与美国类似，都是随着时间不断攀升；同时，美国和巴西的确诊病例数增速也有所不同，说明国家和日期这两个维度都是重要参考指标。


# 特征工程
将国家和日期这两个字段转换成数字，方便建立回归模型



In [68]:
# Date features
# Split the date into three numeric columns: Year, Month and Day.

# Parse the Date column into datetime values first.
df_train["Date"] = pd.to_datetime(df_train["Date"])
# Use the vectorised .dt accessors rather than a per-row apply(lambda ...).
# The integer values are the same; NOTE(review): recent pandas versions may
# return int32 instead of int64 here — confirm if an exact dtype matters downstream.
df_train["Year"] = df_train["Date"].dt.year
df_train["Month"] = df_train["Date"].dt.month
df_train["Day"] = df_train["Date"].dt.day
df_train

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,Year,Month,Day
0,1,,Afghanistan,2020-01-22,0.0,0.0,2020,1,22
1,2,,Afghanistan,2020-01-23,0.0,0.0,2020,1,23
2,3,,Afghanistan,2020-01-24,0.0,0.0,2020,1,24
3,4,,Afghanistan,2020-01-25,0.0,0.0,2020,1,25
4,5,,Afghanistan,2020-01-26,0.0,0.0,2020,1,26
...,...,...,...,...,...,...,...,...,...
35990,35991,,Zimbabwe,2020-05-11,36.0,4.0,2020,5,11
35991,35992,,Zimbabwe,2020-05-12,36.0,4.0,2020,5,12
35992,35993,,Zimbabwe,2020-05-13,37.0,4.0,2020,5,13
35993,35994,,Zimbabwe,2020-05-14,37.0,4.0,2020,5,14


In [69]:
# Country / region feature
#
# The data-cleaning step showed that Province_State is missing for many rows
# (20,700 of 35,995), but the remaining rows do carry a value, so the column
# should not simply be discarded. Using it as a feature on its own might hurt the
# model, so the province is appended to the country instead and the combined
# "country + province" string is treated as a single feature. This keeps the
# province information while avoiding a feature that is mostly empty.
#
# NOTE: this cell overwrites its own input column, so running it a second time
# appends the province again.

# Append the province to the country name (no separator).
df_train["Country_Region"] = (
    df_train["Country_Region"] + df_train["Province_State"]
)
# Inspect how many rows each combined label has.
df_train["Country_Region"].value_counts()

Belarus                     115
ChinaShanghai               115
AustraliaSouth Australia    115
Bangladesh                  115
Croatia                     115
                           ... 
USGuam                      115
Spain                       115
Qatar                       115
Finland                     115
Guinea                      115
Name: Country_Region, Length: 313, dtype: int64

In [70]:
# Convert the "Country_Region" column to integer labels with scikit-learn's
# LabelEncoder.

from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels, then replace each label by its code.
# `encoder` is kept as a notebook-level name; the test-set cell uses it as well.
encoder = LabelEncoder()
encoder.fit(df_train["Country_Region"])
df_train["Country_Region"] = encoder.transform(df_train["Country_Region"])
df_train

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,Year,Month,Day
0,1,,0,2020-01-22,0.0,0.0,2020,1,22
1,2,,0,2020-01-23,0.0,0.0,2020,1,23
2,3,,0,2020-01-24,0.0,0.0,2020,1,24
3,4,,0,2020-01-25,0.0,0.0,2020,1,25
4,5,,0,2020-01-26,0.0,0.0,2020,1,26
...,...,...,...,...,...,...,...,...,...
35990,35991,,312,2020-05-11,36.0,4.0,2020,5,11
35991,35992,,312,2020-05-12,36.0,4.0,2020,5,12
35992,35993,,312,2020-05-13,37.0,4.0,2020,5,13
35993,35994,,312,2020-05-14,37.0,4.0,2020,5,14


In [54]:
# Separate the model inputs from the prediction target.

# Inputs: the encoded country/province label plus the three date parts.
df_train_final = df_train.loc[:, ["Country_Region", "Year", "Month", "Day"]]
# Target: the confirmed-case count.
labels = df_train["ConfirmedCases"]
df_train_final

Unnamed: 0,Country_Region,Year,Month,Day
0,0,2020,1,22
1,0,2020,1,23
2,0,2020,1,24
3,0,2020,1,25
4,0,2020,1,26
...,...,...,...,...
35990,312,2020,5,11
35991,312,2020,5,12
35992,312,2020,5,13
35993,312,2020,5,14


In [11]:
# Display the target series (the ConfirmedCases column of df_train).
labels

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
35990    36.0
35991    36.0
35992    37.0
35993    37.0
35994    42.0
Name: ConfirmedCases, Length: 35995, dtype: float64

In [55]:
# Model training
# The relationship between country and confirmed cases is non-linear, so a
# non-linear model is built with XGBoost.
from xgboost import XGBRegressor

# Configure the regressor; n_estimators is the number of boosting iterations.
xgb = XGBRegressor(
    max_depth=27,
    n_estimators=2000,
    random_state=0,
)
# Fit on the selected features against the confirmed-case target.
xgb.fit(df_train_final, labels)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=27,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=2000, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [62]:
# Prediction
# Build the same features for the test set as were built for the training set.

# NOTE(review): the blanket fillna("") is kept because the test set's per-column
# null counts are not shown here — confirm only Province_State has gaps.
df_test = df_test.fillna("")
df_test["Date"] = pd.to_datetime(df_test["Date"])
# Vectorised .dt accessors instead of a per-row apply(lambda ...).
df_test["Year"] = df_test["Date"].dt.year
df_test["Month"] = df_test["Date"].dt.month
df_test["Day"] = df_test["Date"].dt.day
# Same country + province concatenation as the training set (no separator).
df_test["Country_Region"] = df_test["Country_Region"] + df_test["Province_State"]
# Reuse the encoder fitted on the training set. Calling fit_transform here would
# refit it on the test labels, so the same integer code could refer to a different
# country than the one the model was trained on. With transform(), a label that
# never occurred in training raises an error instead of being silently re-mapped.
df_test["Country_Region"] = encoder.transform(df_test["Country_Region"])
df_test

Unnamed: 0,ForecastId,Province_State,Country_Region,Date,Year,Month,Day
0,1,,0,2020-04-02,2020,4,2
1,2,,0,2020-04-03,2020,4,3
2,3,,0,2020-04-04,2020,4,4
3,4,,0,2020-04-05,2020,4,5
4,5,,0,2020-04-06,2020,4,6
...,...,...,...,...,...,...,...
13454,13455,,312,2020-05-10,2020,5,10
13455,13456,,312,2020-05-11,2020,5,11
13456,13457,,312,2020-05-12,2020,5,12
13457,13458,,312,2020-05-13,2020,5,13


In [64]:
# Predict the confirmed-case count for every test row and attach it to the table.

# The feature columns are listed in the same order as in the training frame.
df_test_final = df_test.loc[:, ["Country_Region", "Year", "Month", "Day"]]
df_test["Predict_confirm"] = xgb.predict(df_test_final)
df_test

Unnamed: 0,ForecastId,Province_State,Country_Region,Date,Year,Month,Day,Predict_confirm
0,1,,0,2020-04-02,2020,4,2,272.999054
1,2,,0,2020-04-03,2020,4,3,281.000397
2,3,,0,2020-04-04,2020,4,4,299.000580
3,4,,0,2020-04-05,2020,4,5,349.000519
4,5,,0,2020-04-06,2020,4,6,367.000305
...,...,...,...,...,...,...,...,...
13454,13455,,312,2020-05-10,2020,5,10,36.000092
13455,13456,,312,2020-05-11,2020,5,11,35.999012
13456,13457,,312,2020-05-12,2020,5,12,36.000759
13457,13458,,312,2020-05-13,2020,5,13,36.999180


In [71]:
# Look at the training rows from 2020-04-02 (the first test date) onwards to
# check the predictions against them.
# NOTE(review): these rows are part of the data the model was fitted on, so a
# close match shows that the model reproduces its training data, not how well
# it forecasts dates it has never seen.
df_train.loc[df_train["Date"] >= "2020-04-02"]

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,Year,Month,Day
71,72,,0,2020-04-02,273.0,6.0,2020,4,2
72,73,,0,2020-04-03,281.0,6.0,2020,4,3
73,74,,0,2020-04-04,299.0,7.0,2020,4,4
74,75,,0,2020-04-05,349.0,7.0,2020,4,5
75,76,,0,2020-04-06,367.0,11.0,2020,4,6
...,...,...,...,...,...,...,...,...,...
35990,35991,,312,2020-05-11,36.0,4.0,2020,5,11
35991,35992,,312,2020-05-12,36.0,4.0,2020,5,12
35992,35993,,312,2020-05-13,37.0,4.0,2020,5,13
35993,35994,,312,2020-05-14,37.0,4.0,2020,5,14
