In [None]:
"""
需求: 预测Facebook签到位置
流程分析:
    1. 获取数据
    2. 数据处理
        特征值: x
        目标值: y
        a. 缩小范围: 2<x<2.5 1<y<1.5 
        b. time: 转化为年月日时分秒
        c. 过滤掉签到次数少的地方
        数据集划分
    3. 特征工程
        标准化
    4. KNN算法估计器
    5. 模型选择与优化
    6. 模型评估
"""


In [1]:
import pandas as pd

In [2]:
# 1. 获取数据
data = pd.read_csv('./facebook/train.csv')

In [4]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [5]:
# 2. 基本数据处理
# 1) 缩小范围
temp = data.query('x<2.5 & x>2 & y<1.5 & y>1')

In [7]:
temp.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id
112,112,2.236,1.3655,66,623174,7663031065
180,180,2.2003,1.2541,65,610195,2358558474
367,367,2.4108,1.3213,74,579667,6644108708
874,874,2.0822,1.1973,320,143566,3229876087
1022,1022,2.016,1.1659,65,207993,3244363975


In [9]:
# 2) 处理时间特征
time_value = pd.to_datetime(data['time'], unit='s')

In [11]:
date = pd.DatetimeIndex(time_value)

In [19]:
# 添加特征词
data['day'] = date.day

In [20]:
data['weekday'] = date.weekday

In [21]:
data['hour'] = date.hour

In [22]:
data.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
0,0,0.7941,9.0809,54,470702,8523065625,6,1,10
1,1,5.9567,4.7968,13,186555,1757726713,3,5,3
2,2,8.3078,7.0407,74,322648,1137537235,4,6,17
3,3,7.3665,2.5165,65,704587,6567393236,9,4,3
4,4,4.0961,1.1307,31,472130,7440663949,6,1,11


In [28]:
# 3) 过滤签到次数少的地点
place_count = data.groupby('place_id').size()

In [39]:
p_list = place_count[place_count > 3].index.values

In [42]:
data_final = data[data['place_id'].isin(p_list)]

In [43]:
data_final.head()

Unnamed: 0,row_id,x,y,accuracy,time,place_id,day,weekday,hour
0,0,0.7941,9.0809,54,470702,8523065625,6,1,10
1,1,5.9567,4.7968,13,186555,1757726713,3,5,3
2,2,8.3078,7.0407,74,322648,1137537235,4,6,17
3,3,7.3665,2.5165,65,704587,6567393236,9,4,3
4,4,4.0961,1.1307,31,472130,7440663949,6,1,11


In [44]:
# 筛选特征值和目标值
x = data_final[['x','y','accuracy', 'day','weekday','hour']]
y = data_final[['place_id']]

In [45]:
x.head()

Unnamed: 0,x,y,accuracy,day,weekday,hour
0,0.7941,9.0809,54,6,1,10
1,5.9567,4.7968,13,3,5,3
2,8.3078,7.0407,74,4,6,17
3,7.3665,2.5165,65,9,4,3
4,4.0961,1.1307,31,6,1,11


In [46]:
y.head()

Unnamed: 0,place_id
0,8523065625
1,1757726713
2,1137537235
3,6567393236
4,7440663949


In [47]:
from sklearn.model_selection import train_test_split

In [48]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=23)

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# 3) 特征工程: 标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
# 使用训练集计算(fit)出来的均值和标准值来对测试集进行转化, 保持一致
x_test = transfer.transform(x_test)

# 4) KNN算法预估器
estimator = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 7, 9]
}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=3)
estimator.fit(x_train, y_train)

# 5) 模型评估
# 方法一: 直接对比真实值和预测值
y_predict = estimator.predict(x_test)
print('y_predict: \n', y_predict)
print('直接对比真实值和预测值: \n', y_test == y_predict)

# 方法二: 计算准确率
score = estimator.score(x_test, y_test)
print('准确率: \n', score)

print('最佳参数:\n', estimator.best_params_)
print('最佳结果:\n', estimator.best_score_)
print('最佳估计器:\n', estimator.best_estimator_)
print('交叉验证结果:\n', estimator.cv_results_)
