In [1]:
# %load 任务7.2 预处理航空客户数据.py

###############################################################################
#######################           任务实现             ########################
###############################################################################

# 代码 7-1
import numpy as np
import pandas as pd
# Load the raw airline customer table (the CSV is GB18030-encoded).
airline_data = pd.read_csv("air_data.csv", encoding="gb18030")
print('原始数据的形状为：',airline_data.shape)

## Step 1: drop every record whose fare is missing in either fare column.
fare_columns = ["SUM_YR_1", "SUM_YR_2"]
has_both_fares = airline_data[fare_columns].notnull().all(axis=1)
airline_notnull = airline_data.loc[has_both_fares, :]
print('删除缺失记录后数据的形状为：',airline_notnull.shape)


## Step 2: keep a record only when ALL of the following hold:
##   * at least one of the two fare columns is non-zero, and
##   * the total flown distance (SEG_KM_SUM) is positive, and
##   * the average discount (avg_discount) is non-zero.
fare_is_nonzero = (airline_notnull[fare_columns] != 0).any(axis=1)
distance_is_positive = airline_notnull['SEG_KM_SUM'] > 0
discount_is_nonzero = airline_notnull['avg_discount'] != 0
keep_record = fare_is_nonzero & distance_is_positive & discount_is_nonzero
airline = airline_notnull.loc[keep_record, :]
print('删除异常记录后数据的形状为：',airline.shape)



# Code 7-2
## Select the columns needed to build the LRFMC features.
airline_selection = airline[["FFP_DATE","LOAD_TIME",
    "FLIGHT_COUNT","LAST_TO_END",
    "avg_discount","SEG_KM_SUM"]]
## Build the L feature: time elapsed between FFP_DATE and LOAD_TIME.
membership_span = pd.to_datetime(airline_selection["LOAD_TIME"]) - \
    pd.to_datetime(airline_selection["FFP_DATE"])
# Read the whole-day count straight from the timedelta accessor instead of
# parsing its string representation, then express it in 30-day months.
# The series is given the explicit name "L": left unnamed, concat would label
# the column with the integer 0, and a frame mixing integer and string column
# labels is rejected by recent scikit-learn estimators.
L = (membership_span.dt.days / 30).rename("L")
## Merge L with the remaining four feature columns (everything after the
## two date columns of the selection).
airline_features = pd.concat([L,
    airline_selection.iloc[:,2:]],axis = 1)
print('构建的LRFMC特征前5行为：\n',airline_features.head())



# Code 7-3
from sklearn.preprocessing import StandardScaler
# Hand the scaler a plain float64 array rather than the DataFrame itself.
# When the frame's column labels mix integers and strings, scikit-learn >= 1.2
# refuses to fit (TypeError on feature-name validation). Converting up front
# also avoids the implicit int -> float conversion (older scikit-learn releases
# appear to warn about it; the traceback-like lines in the recorded output
# look like such a warning).
features_array = np.asarray(airline_features, dtype=np.float64)
# Standardise each column to zero mean and unit variance.
data = StandardScaler().fit_transform(features_array)
# Passed positionally, so the array is stored under numpy's default key
# 'arr_0' inside the archive.
np.savez('airline_scale.npz',data)
print('标准化后LRFMC五个特征为：\n',data[:5,:])



原始数据的形状为： (62988, 44)
删除缺失记录后数据的形状为： (62299, 44)
删除异常记录后数据的形状为： (62044, 44)
构建的LRFMC特征前5行为：
            0  FLIGHT_COUNT  LAST_TO_END  avg_discount  SEG_KM_SUM
0  90.200000           210            1      0.961639      580717
1  86.566667           140            7      1.252314      293678
2  87.166667           135           11      1.254676      283712
3  68.233333            23           97      1.090870      281336
4  60.533333           152            5      0.970658      309928
标准化后LRFMC五个特征为：
 [[ 1.43571897 14.03412875 -0.94495516  1.29555058 26.76136996]
 [ 1.30716214  9.07328567 -0.9119018   2.86819902 13.1269701 ]
 [ 1.32839171  8.71893974 -0.88986623  2.88097321 12.65358345]
 [ 0.65848092  0.78159082 -0.41610151  1.99472974 12.54072306]
 [ 0.38603481  9.92371591 -0.92291959  1.3443455  13.89884778]]


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [2]:
# %load 任务7.3 使用K-Means算法进行客户分群.py

###############################################################################
#######################           任务实现             #######################
###############################################################################

# 代码 7-4
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans #导入kmeans算法
# Load the standardised feature matrix written by np.savez (default key
# 'arr_0'). np.load returns a lazy NpzFile that keeps the archive open, so it
# is opened once inside a context manager and closed as soon as the array has
# been read into memory.
with np.load('airline_scale.npz') as npz_archive:
    print(type(npz_archive))
    airline_scale = npz_archive['arr_0']
k = 5 ## number of cluster centres
# Build the model.
# * n_jobs is intentionally not passed: the parameter was deprecated in
#   scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
# * n_init is pinned to 10 (the historical default) because newer releases
#   changed the default, which would otherwise alter the result.
# NOTE(review): exact centres/labels can still differ between scikit-learn
# versions; the recorded output below may not reproduce bit-for-bit.
kmeans_model = KMeans(n_clusters = k,n_init=10,random_state=123)
fit_kmeans = kmeans_model.fit(airline_scale)   # train the model (fit returns the estimator)
print(kmeans_model.cluster_centers_ )# show the cluster centres

print(kmeans_model.labels_ )# show the cluster label assigned to each sample

# Count how many samples fall into each cluster.
r1 = pd.Series(kmeans_model.labels_).value_counts()
print('最终每个类别的数目为：\n',r1)



<class 'numpy.lib.npyio.NpzFile'>
[[ 0.05184321 -0.22680493 -0.00266815  2.19136467 -0.23125594]
 [-0.31368082 -0.57402062  1.68627205 -0.1733275  -0.53682451]
 [ 0.48333235  2.48322162 -0.7993897   0.30863251  2.42474345]
 [-0.7002121  -0.16114387 -0.41489162 -0.25513359 -0.16095881]
 [ 1.16067608 -0.08691922 -0.37722423 -0.15590586 -0.09484481]]
[2 2 2 ... 3 1 1]
最终每个类别的数目为：
 3    24659
4    15740
1    12125
2     5336
0     4184
dtype: int64
