In [250]:
import numpy as np
import pandas as pd
from sklearn import svm, model_selection, preprocessing

In [251]:
filename = './data/weatherAUS.csv'
weather = pd.read_csv(filename)
weather.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [252]:
# X (142193, 23)
X, y = weather.iloc[:, :-1], weather.iloc[:, -1]

In [253]:
# 查看标签的值 一看结果就是二分类
np.unique(y)

array(['No', 'Yes'], dtype=object)

In [254]:
y.value_counts()

No     110316
Yes     31877
Name: RainTomorrow, dtype: int64

In [255]:
# 统计y中空值所占的比例
y.isnull().mean()

0.0

In [256]:
# 查看数据类型
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 23 columns):
Date             142193 non-null object
Location         142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null obje

In [257]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=66)
# 这样随机分完训练集和测试集后 数据的索引都不是按照一定顺序排列 是乱序的 需要重新设置索引
for i in [X_train, X_test, y_train, y_test]:
    i.index = range(i.shape[0])

In [258]:
X_train.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,2011-06-25,Albany,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,Yes,0.0
1,2015-12-05,Darwin,26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2016-02-23,Woomera,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,2008-07-16,Sydney,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,2016-01-14,SydneyAirport,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [260]:
X_test.shape


(42658, 23)

In [198]:
# 查看训练数据和测试数据是否存在样本不均衡问题  
#由下面的结果数据可以看出数据存在轻微的不均衡问题 正样本：负样本大概为1：3.5
y_train.value_counts()

No     77282
Yes    22253
Name: RainTomorrow, dtype: int64

In [261]:
y_test.value_counts()

No     33034
Yes     9624
Name: RainTomorrow, dtype: int64

In [262]:
encoder = preprocessing.LabelEncoder().fit(y_train)
# 将标签LabelEncoder进行编码 变成机器能识别的0和1
y_train = pd.DataFrame(encoder.transform(y_train))
y_test = pd.DataFrame(encoder.transform(y_test))

In [263]:
# 熟悉特征的 查看是否存在异常值  由下表可知Cloud9am有个异常值 0代表晴天 8代表完全阴天
X_train.describe([0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
MinTemp,99084.0,12.186187,6.396869,-8.2,-1.8,1.8,4.0,7.6,12.0,16.9,20.8,25.8,33.9
MaxTemp,99304.0,23.231387,7.102537,-4.8,9.2,12.9,14.5,18.0,22.6,28.2,32.9,40.1,48.1
Rainfall,98545.0,2.318329,8.377313,0.0,0.0,0.0,0.0,0.0,0.0,0.6,6.0,36.8,371.0
Evaporation,56713.0,5.468526,4.14818,0.0,0.4,1.0,1.4,2.6,4.8,7.4,10.2,18.4,86.2
Sunshine,51801.0,7.60804,3.779142,0.0,0.0,0.3,1.5,4.9,8.4,10.6,12.0,13.4,14.3
WindGustSpeed,93004.0,39.98628,13.570777,7.0,15.0,20.0,24.0,31.0,39.0,48.0,57.0,80.0,135.0
WindSpeed9am,98591.0,14.00782,8.876563,0.0,0.0,0.0,4.0,7.0,13.0,19.0,26.0,39.0,130.0
WindSpeed3pm,97699.0,18.612565,8.803212,0.0,2.0,6.0,9.0,13.0,19.0,24.0,30.0,43.0,87.0
Humidity9am,98271.0,68.78194,19.047474,1.0,17.0,34.0,44.0,57.0,70.0,83.0,94.0,100.0,100.0
Humidity3pm,96985.0,51.408857,20.792846,0.0,9.0,17.0,23.0,36.0,52.0,65.0,79.0,98.0,100.0


In [265]:
"""
对于异常值的处理方法：
1.如果你发现异常值，首先要观察这个异常值的评率。如果只出现了一次，多半是输入错误造成的，直接把异常值删掉即可
如果异常值出现多次，去跟业务人员沟通，可能是某种特殊表示，如果是认为造成的错误，异常值留着也没用，只要数据量不是很大
直接删掉即可
如果异常值站到你数据量的10%以上，就不能轻易删除。可以考虑把异常值替换成非异常值但是非干扰项，
比如说用0，均值，众数来替换异常值
"""
# 查看异常值
X_train.loc[X_train.loc[:, 'Cloud9am'] == 9, 'Cloud9am'].count()

1

In [266]:
X_train.loc[X_train.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].count()

1

In [267]:
X_train.loc[X_train.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].index

Int64Index([38327], dtype='int64')

In [269]:
# 由上结果可知，属于少数异常值 直接删掉就可以了
# 先删除训练集上的异常值
cloud9am_index = pd.Series(X_train.loc[X_train.loc[:, 'Cloud9am'] == 9, 'Cloud9am'].index)
cloud3pm_index = pd.Series(X_train.loc[X_train.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].index)

drop_index = pd.concat([cloud9am_index, cloud3pm_index], ignore_index=True)

X_train = X_train.drop(index=drop_index)
y_train = y_train.drop(index=drop_index)

# 在删除测试集上的异常值
cloud9am_index = pd.Series(X_test.loc[X_test.loc[:, 'Cloud9am'] == 9, 'Cloud9am'].index)
cloud3pm_index = pd.Series(X_test.loc[X_test.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].index)

drop_index = pd.concat([cloud9am_index, cloud3pm_index], ignore_index=True)

X_test = X_test.drop(index=drop_index)
y_test = y_test.drop(index=drop_index)

In [270]:
# 再次验证是否删除干净
# X_train.loc[X_train.loc[:, 'Cloud9am'] == 9, 'Cloud9am'].count()
# X_train.loc[X_train.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].count()
# X_test.loc[X_test.loc[:, 'Cloud9am'] == 9, 'Cloud9am'].count()
X_test.loc[X_test.loc[:, 'Cloud3pm'] == 9, 'Cloud3pm'].count()

0

In [272]:
# 接下来开始处理日期
"""
一种思路是认为日期度天气没什么影响，可以直接删掉
另外一种思路可以从两个角度分析：1. 昨天的天气可能影响今天的天气，今天的天气可能影响明天的天气，也就是说随着日期的变化
样本是会受到上一个样本的影响的，但是对于算法来说是无法捕捉样本与样本之间的影响的，算法只能捕捉到每个特征与标签的关系
2.既然算法是处理列与列之间的关系，那我们是否把‘今天的天气会影响明天的天气’这个指标转化为一个特征
"""
# 我们采取的第二种思路， 首先要查看一下降雨量的分布
X_train['Rainfall'].describe()

count    98543.000000
mean         2.318329
std          8.377393
min          0.000000
25%          0.000000
50%          0.000000
75%          0.600000
max        371.000000
Name: Rainfall, dtype: float64

In [273]:
# 因为75%的数据的降水量都为0.75 那我们可以把大于1的Rainfall标记为今天下雨，小于的标记为无雨
X_train.loc[X_train['Rainfall'] >= 1, 'RainToday'] = 'Yes'
X_train.loc[X_train['Rainfall'] >= 1, 'RainToday'] = 'No'
X_train.loc[X_train['Rainfall'] == np.nan, 'RainToday'] = np.nan

# 同样的方法处理测试数据集
X_test.loc[X_test['Rainfall'] >= 1, 'RainToday'] = 'Yes'
X_test.loc[X_test['Rainfall'] >= 1, 'RainToday'] = 'No'
X_test.loc[X_test['Rainfall'] == np.nan, 'RainToday'] = np.nan


In [274]:
X_train.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,2011-06-25,Albany,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,No,0.0
1,2015-12-05,Darwin,26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2016-02-23,Woomera,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,2008-07-16,Sydney,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,2016-01-14,SydneyAirport,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [276]:
# 接着处理日期 
# 现在日期还不能删除，因为夏季的月份里比冬天的月份里更容易下雨，我们可以把日期中月份提取出来
X_train['Date'] = X_train['Date'].apply(lambda x: int(x.split('-')[1]))


In [277]:
# 接着处理测试数据集
X_test['Date'] = X_test['Date'].apply(lambda x: int(x.split('-')[1]))

# 修改特征名称 将Date修改为Month
X_train = X_train.rename(columns={'Date' : 'Month'})
X_test = X_test.rename(columns={'Date' : 'Month'})

In [278]:
X_test.shape

(42657, 23)

In [279]:
# 日期处理完后 接着要处理Location
# 我们的处理思路 把我们的地点转化为气候，每个气候下的降雨情况是相似的
# 我们需要获取澳大利亚所有的城市以及经纬度，以及城市的气候
city_all = pd.read_csv('./data/cityll.csv', index_col=0)
city_climate = pd.read_csv('./data/Cityclimate.csv')

In [280]:
city_all

Unnamed: 0,City,Latitude,Longitude,Latitudedir,Longitudedir
0,Adelaide,34.9285°,138.6007°,"S,",E
1,Albany,35.0275°,117.8840°,"S,",E
2,Albury,36.0737°,146.9135°,"S,",E
3,Wodonga,36.1241°,146.8818°,"S,",E
4,AliceSprings,23.6980°,133.8807°,"S,",E
5,Amata,26.1509°,131.1467°,"S,",E
6,Ballarat,37.5622°,143.8503°,"S,",E
7,Bathurst,33.4193°,149.5775°,"S,",E
8,Birdsville,25.8989°,139.3517°,"S,",E
9,Borroloola,16.0703°,136.3072°,"S,",E


In [281]:
city_climate

Unnamed: 0,City,Climate
0,Adelaide,Warm temperate
1,Albany,Mild temperate
2,Albury,"Hot dry summer, cool winter"
3,Wodonga,"Hot dry summer, cool winter"
4,AliceSprings,"Hot dry summer, warm winter"
5,Amata,"Hot dry summer, cool winter"
6,Ballarat,Cool temperate
7,Bathurst,Cool temperate
8,Birdsville,"Hot dry summer, warm winter"
9,Borroloola,"High humidity summer, warm winter"


In [282]:
# 接下来我们需要做的是处理经纬度 然后把这两张表合并起来
city_all['Latitudenum'] = city_all['Latitude'].apply(lambda x: float(x[:-1]))
city_all['Longitudenum'] = city_all['Longitude'].apply(lambda x: float(x[:-1]))


In [283]:
# 因为澳大利亚属于南半球，Latitudedir和Longitudedir都一致，就没有实际意义了 可以直接舍弃
# city_all['Longitudedir'].unique()
city_all = city_all.iloc[:, [0, 5, 6]]
city_all

Unnamed: 0,City,Latitudenum,Longitudenum
0,Adelaide,34.9285,138.6007
1,Albany,35.0275,117.8840
2,Albury,36.0737,146.9135
3,Wodonga,36.1241,146.8818
4,AliceSprings,23.6980,133.8807
5,Amata,26.1509,131.1467
6,Ballarat,37.5622,143.8503
7,Bathurst,33.4193,149.5775
8,Birdsville,25.8989,139.3517
9,Borroloola,16.0703,136.3072


In [284]:
# True    100 这两个表的city完全一样
#(city_all['City'] == city_climate['City']).value_counts()
city_all.loc[:, 'Climate'] = city_climate.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [285]:
# 读取爬虫抓取的 训练集上城市的经纬度
sample_city = pd.read_csv('./data/samplecity.csv', index_col=0)
sample_city['Latitudenum'] = sample_city['Latitude'].apply(lambda x: float(x[:-1]))
sample_city['Longitudenum'] = sample_city['Longitude'].apply(lambda x: float(x[:-1]))
sample_city = sample_city.iloc[:, [0, 5, 6]]
sample_city.head()

Unnamed: 0,City,Latitudenum,Longitudenum
0,Canberra,35.2809,149.13
1,Sydney,33.8688,151.2093
2,Perth,31.9505,115.8605
3,Darwin,12.4634,130.8456
4,Hobart,42.8821,147.3272


In [286]:
# 地理上，两个地点之间的距离 R为地球半径 起始点A(sLatA, sLonA) 结束点B(eLatB, eLonB)
#Distance = R*arccos(sin(sLatA)*sin(eLatB) + cos(sLatA)*cos(eLatB)*cos(sLonA-eLonB))
# 我们现在有了澳大利亚主要城市的经纬度和对应的气候，我们也有样本的地点对应的经纬度，接下来我们开始计算我们的样本点
# 到澳大利亚主要城市的距离，距离我们样本点最近的那个澳大利亚城市的气候，就是我们样本点的气候，这个有点类似KNN
# 首先我们要把角度转变为弧度
import math 
city_all.loc[:, 'sLat'] = city_all.iloc[:, 1].apply(lambda x: math.radians(x))
city_all.loc[:, 'sLon'] = city_all.iloc[:, 2].apply(lambda x: math.radians(x))

sample_city.loc[:, 'eLat'] = sample_city.loc[:, 'Latitudenum'].apply(lambda x: math.radians(x))
sample_city.loc[:, 'eLon'] = sample_city.loc[:, 'Longitudenum'].apply(lambda x: math.radians(x))

for i in range(sample_city.shape[0]):
    # 计算sample_city样本中 每一个样本到所有主要城市的距离，由此来推测训练集中城市的气候
    sLat = city_all.loc[:, 'sLat']
    sLon = city_all.loc[:, 'sLon']
    eLat = sample_city.loc[i, 'eLat']
    eLon = sample_city.loc[i, 'eLon']
    dist = 6371.01 * np.arccos(np.sin(sLat) * np.sin(eLat) + 
                            np.cos(sLat) * np.cos(eLat) * np.cos(sLon.values - eLon))
    # 按照从小到大排序 取出第一个的下标
    city_index = np.argsort(dist)[0]
    sample_city.loc[i, 'closest_city'] = city_all.loc[city_index, 'City']
    sample_city.loc[i, 'Climate'] = city_all.loc[city_index, 'Climate']
    
sample_city.head()
    



Unnamed: 0,City,Latitudenum,Longitudenum,eLat,eLon,closest_city,Climate
0,Canberra,35.2809,149.13,0.615768,2.60281,Canberra,Cool temperate
1,Sydney,33.8688,151.2093,0.591122,2.6391,Sydney,Warm temperate
2,Perth,31.9505,115.8605,0.557641,2.022147,Perth,Warm temperate
3,Darwin,12.4634,130.8456,0.217527,2.283687,Darwin,"High humidity summer, warm winter"
4,Hobart,42.8821,147.3272,0.748434,2.571345,Hobart,Cool temperate


In [287]:
# 取出城市和对应的气候
locafinal = sample_city.iloc[:, [0, -1]]
locafinal.columns = ['Location', 'Climate']
# 在这里设定Location为索引，是为了之后进行map的匹配
locafinal = locafinal.set_index(keys='Location')
locafinal.to_csv('./data/samplelocation.csv')
locafinal.head()

Unnamed: 0_level_0,Climate
Location,Unnamed: 1_level_1
Canberra,Cool temperate
Sydney,Warm temperate
Perth,Warm temperate
Darwin,"High humidity summer, warm winter"
Hobart,Cool temperate


In [222]:
# 再次查看训练集
X_train.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,6,Albany,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,No,0.0
1,12,Darwin,26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2,Woomera,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,7,Sydney,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,1,SydneyAirport,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [289]:
# 接下来就是将Location的内容替换掉，并且保证匹配的气候字符串不含有逗号，气候两遍不含空格
# 我们使用re这个模块来消除逗号
# re.sub(希望要替换的值， 希望被替换的值， 要操作的字符)
# x。strip（）去掉空格的函数
import re
# 将Location 映射成气候
X_train['Location'] = X_train['Location'].map(locafinal.iloc[:, 0])
X_train.head()


Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,6,Mild temperate,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,No,0.0
1,12,"High humidity summer, warm winter",26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2,Warm temperate,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,7,Warm temperate,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,1,Warm temperate,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [290]:
# 剔除气候中间的逗号和两边的空格
X_train['Location'] = X_train['Location'].apply(lambda x: re.sub(',', '', x.strip()))
X_train.head()

Unnamed: 0,Month,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,6,Mild temperate,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,No,0.0
1,12,High humidity summer warm winter,26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2,Warm temperate,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,7,Warm temperate,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,1,Warm temperate,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [291]:
X_train = X_train.rename(columns={'Location': 'Climate'})
X_train.head()

Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,6,Mild temperate,10.5,17.6,18.6,4.2,7.8,,,NW,...,82.0,68.0,1012.3,1015.0,2.0,2.0,12.0,16.2,No,0.0
1,12,High humidity summer warm winter,26.5,33.9,0.0,5.2,4.3,SSE,35.0,NNW,...,65.0,59.0,1012.2,1009.1,7.0,7.0,31.0,32.6,No,0.0
2,2,Warm temperate,26.6,43.4,0.0,15.6,10.0,SSE,54.0,NW,...,18.0,11.0,1012.9,1009.8,0.0,1.0,33.3,42.0,No,0.0
3,7,Warm temperate,8.9,18.0,0.0,2.0,9.6,,,WNW,...,81.0,50.0,1022.2,1019.1,1.0,1.0,10.8,17.4,No,0.0
4,1,Warm temperate,22.9,40.6,0.0,8.4,,WSW,120.0,WNW,...,56.0,34.0,1011.5,1007.4,1.0,7.0,30.5,36.1,No,27.4


In [292]:
# 同样的方法处理测试集上的数据
Xtest = X_test.copy()
X_test['Location'] = X_test['Location'].map(locafinal.iloc[:, 0])
X_test['Location'] = X_test['Location'].apply(lambda x: re.sub(',', '', x.strip()))
X_test = X_test.rename(columns={'Location': 'Climate'})
X_test.head()



Unnamed: 0,Month,Climate,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
0,10,Hot dry summer cool winter,14.5,26.8,15.2,0.6,,NE,35.0,E,...,89.0,52.0,1013.0,1008.6,7.0,7.0,19.1,24.8,No,34.0
1,10,Hot dry summer cool winter,6.3,30.4,0.0,,,SSE,33.0,ESE,...,51.0,14.0,,,,,19.2,29.7,No,0.0
2,4,Warm temperate,9.7,24.7,0.0,4.0,10.6,SE,28.0,WNW,...,74.0,52.0,1023.0,1019.5,1.0,1.0,17.1,22.8,No,0.0
3,1,Warm temperate,11.9,26.5,0.0,,,W,37.0,WNW,...,59.0,47.0,1010.0,1006.2,,8.0,19.2,23.1,No,0.0
4,8,Warm temperate,13.2,19.0,40.6,4.2,0.8,ENE,46.0,NE,...,89.0,87.0,1008.2,1004.6,7.0,7.0,17.7,16.5,No,39.0


In [294]:
X_test.shape

(42657, 23)

In [295]:
# 接下来处理缺失值
# 收先查看缺失值的情况
X_train.isnull().mean()

Month            0.000000
Climate          0.000000
MinTemp          0.004531
MaxTemp          0.002321
Rainfall         0.009946
Evaporation      0.430229
Sunshine         0.479580
WindGustDir      0.066018
WindGustSpeed    0.065606
WindDir9am       0.069866
WindDir3pm       0.026654
WindSpeed9am     0.009484
WindSpeed3pm     0.018446
Humidity9am      0.012699
Humidity3pm      0.025620
Pressure9am      0.099575
Pressure3pm      0.099404
Cloud9am         0.377985
Cloud3pm         0.403012
Temp9am          0.006460
Temp3pm          0.019300
RainToday        0.009946
RISK_MM          0.000000
dtype: float64

In [296]:
# 首先我们先找出分类类型有哪些
cate = X_train.columns[X_train.dtypes == 'object'].tolist()
# 还有我们的cloud
cloud = ['Cloud9am', 'Cloud3pm']
cate = cate + cloud
cate

['Climate',
 'WindGustDir',
 'WindDir9am',
 'WindDir3pm',
 'RainToday',
 'Cloud9am',
 'Cloud3pm']

In [297]:
# 接下来开始使用众数进行填补
from sklearn.impute import SimpleImputer
si = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
si.fit(X_train.loc[:, cate])
X_train.loc[:, cate] = si.transform(X_train.loc[:, cate])
X_test.loc[:, cate] = si.transform(X_test.loc[:, cate])


In [298]:
# 再次查看确认缺失值是否被填充
# X_train.loc[:, cate].isnull().mean()
# X_test.loc[:, cate].isnull().mean()

In [299]:
# 将所有的分类变量编码为数字，一个类别为一个数字
oe = preprocessing.OrdinalEncoder()
oe.fit(X_train.loc[:, cate])
X_train.loc[:, cate] = oe.transform(X_train.loc[:, cate])
X_test.loc[:, cate] = oe.transform(X_test.loc[:, cate])


In [300]:
# X_train.loc[:, cate].head()
X_test.loc[:, cate].head()

Unnamed: 0,Climate,WindGustDir,WindDir9am,WindDir3pm,RainToday,Cloud9am,Cloud3pm
0,2.0,4.0,0.0,5.0,0.0,7.0,7.0
1,2.0,10.0,2.0,10.0,0.0,7.0,7.0
2,6.0,9.0,14.0,0.0,0.0,1.0,1.0
3,6.0,13.0,14.0,9.0,0.0,7.0,8.0
4,6.0,1.0,4.0,6.0,0.0,7.0,7.0


In [301]:
# 接下来处理连续型变量的缺失值
col = X_train.columns.tolist()
# 将col中删除分类型column
[col.remove(i) for i in cate]
impmean = SimpleImputer(missing_values=np.nan, strategy='mean')
impmean = impmean.fit(X_train.loc[:, col])
X_train.loc[:, col] = impmean.transform(X_train.loc[:, col])
X_test.loc[:, col] = impmean.transform(X_test.loc[:, col])

In [302]:
# X_train.loc[:, col].isnull().mean()
X_test.loc[:, col].isnull().mean()

Month            0.0
MinTemp          0.0
MaxTemp          0.0
Rainfall         0.0
Evaporation      0.0
Sunshine         0.0
WindGustSpeed    0.0
WindSpeed9am     0.0
WindSpeed3pm     0.0
Humidity9am      0.0
Humidity3pm      0.0
Pressure9am      0.0
Pressure3pm      0.0
Temp9am          0.0
Temp3pm          0.0
RISK_MM          0.0
dtype: float64

In [303]:
# 接着处理连续性变量：无量纲话
col.remove('Month')
ss = preprocessing.StandardScaler().fit(X_train.loc[:, col])
X_train.loc[:, col] = ss.transform(X_train.loc[:, col])
X_test.loc[:, col] = ss.transform(X_test.loc[:, col])

In [304]:
# X_train
# X_test
# y_train
y_test

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,1
5,0
6,0
7,0
8,1
9,1


In [306]:
# 模型预处理基本已经完成
# 接下来是建立模型和模型评估
# (99533, 1)
# y_train.shape
# (99533, 23)
# X_train.shape

X_test.shape
# y_test.shape

(42657, 23)

In [310]:
import time
import datetime
from sklearn import metrics

# Ytrain = y_train.copy()
# Ytest = y_test.copy()
y_train = y_train.iloc[:, 0].ravel()
y_test = y_test.iloc[:, 0].ravel()


In [314]:
start = time.time()
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, 
                  gamma=2,
                  degree=1,
                  cache_size=5000
                 ).fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    # 召回率 召回率越大，尽可能的捕捉少数类
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, clf.decision_function(X_test))
    print('%s 的准确率为：%f, 召回率为：%f, roc面积为：%f' % (kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time.time() - start).strftime('%M:%S:%f'))


linear 的准确率为：0.999836, 召回率为：0.999377, roc面积为：0.999999
00:37:992269
rbf 的准确率为：0.774433, 召回率为：0.000208, roc面积为：0.775719
59:04:910170
poly 的准确率为：0.999836, 召回率为：0.999273, roc面积为：1.000000
59:41:307349
sigmoid 的准确率为：0.680756, 召回率为：0.243869, roc面积为：0.526741
05:27:338743


In [315]:
# 下面开始调参，我们有不同的目标
# 第一种：我们希望不计一切代价判断出少数类，得出最高的recall
# 第二种：我们希望最求最高的预测准确率，一切目的都是为了accuracy更高，我们在意roc和auc
# 第三种：我们希望达到recall，roc和accuracy之间的一个平衡，不追求任何一个牺牲任何一个



In [None]:
# 第一种：我们追求更好的recall
start = time.time()
for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    clf = svm.SVC(kernel=kernel, 
                  gamma=2,
                  degree=1,
                  cache_size=5000,
                  class_weight='balanced' # 权重与样本出现的频率成反比
                 ).fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    # 召回率 召回率越大，尽可能的捕捉少数类
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, clf.decision_function(X_test))
    print('%s 的准确率为：%f, 召回率为：%f, roc面积为：%f' % (kernel, score, recall, auc))
    print(datetime.datetime.fromtimestamp(time.time() - start).strftime('%M:%S:%f'))

In [317]:
# 由上述结果 我们锁定使用线性核函数，接着不惜一切代价提升recall
start = time.time()
clf = svm.SVC(kernel='linear',
              gamma=2,
              degree=1,
              cache_size=5000,
              class_weight={1:10} # 注意，这里写的其实是 类别1权重：10， 类别0的权重：1这个比例
             ).fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = clf.score(X_test, y_test)
recall = metrics.recall_score(y_test, y_pred)
# auc 面积
auc = metrics.roc_auc_score(y_test, clf.decision_function(X_test))
print('%s 的准确率为：%f, 召回率为：%f, roc面积为：%f' % ('linear', score, recall, auc))
print(datetime.datetime.fromtimestamp(time.time() - start).strftime('%M:%S:%f'))


sigmoid 的准确率为：0.991983, 召回率为：1.000000, roc面积为：1.000000
03:09:595517


In [318]:
# 第二种：追求最高的准确率，我不惜一切代价 提升准确率
# 通过混淆矩阵查看特异度 特异度越高对多数类的误伤能力越强
clf = svm.SVC(kernel='linear',
              gamma=2,
              degree=1,
              cache_size=5000
             ).fit(X_train, y_train)
y_pred = clf.predict(X_test)
cm = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
cm

array([[ 9618,     6],
       [    1, 33032]])

In [319]:
# 特异度
specificity = cm[1, 1] / cm[1, :].sum()
specificity # 几乎所的0都被判断正确

0.9999697272424545

In [324]:
# 通过调节class_weight 使样本尽可能多捕捉少数类 来提升准确率
i_range = np.linspace(0.01, 0.05, 10)
for i in i_range:
    start = time.time()
    clf = svm.SVC(kernel='linear', 
                  gamma=2,
                  degree=1,
                  cache_size=5000,
                  class_weight={1: i} 
                 ).fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    # 召回率 召回率越大，尽可能的捕捉少数类
    recall = metrics.recall_score(y_test, y_pred)
    auc = metrics.roc_auc_score(y_test, clf.decision_function(X_test))
    print("under ratio 1:%f testing accuracy %f, recall is %f', auc is %f" %(1+i, score,recall, auc))
    print(datetime.datetime.fromtimestamp(time.time()-start).strftime("%M:%S:%f"))

under ratio 1:1.010000 testing accuracy 0.955787, recall is 0.804032', auc is 1.000000
01:01:987432
under ratio 1:1.014444 testing accuracy 0.963101, recall is 0.836451', auc is 1.000000
00:56:961150
under ratio 1:1.018889 testing accuracy 0.968891, recall is 0.862116', auc is 1.000000
00:49:566395
under ratio 1:1.023333 testing accuracy 0.971306, recall is 0.872818', auc is 1.000000
00:48:459459
under ratio 1:1.027778 testing accuracy 0.971329, recall is 0.872922', auc is 1.000000
00:45:980853
under ratio 1:1.032222 testing accuracy 0.971353, recall is 0.873026', auc is 1.000000
00:47:484619
under ratio 1:1.036667 testing accuracy 0.974049, recall is 0.884975', auc is 1.000000
00:48:001760
under ratio 1:1.041111 testing accuracy 0.975690, recall is 0.892249', auc is 1.000000
00:49:857222
under ratio 1:1.045556 testing accuracy 0.977073, recall is 0.898379', auc is 1.000000
00:46:129375
under ratio 1:1.050000 testing accuracy 0.979722, recall is 0.910121', auc is 1.000000
00:47:427276


In [None]:
# 第三种：追求平衡
# 既然调节class_weight并不能够使我们的模型有较大的改善，现在我们调节线性核函数的C看看

