In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsOneClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.cross_validation import train_test_split
from sklearn import neighbors
from collections import defaultdict
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import datetime
import gc



In [2]:
def add_date_features(df):
    df["date"] = pd.to_datetime(df["time_stamp"])
    df["date_year"] = df["date"].dt.year
    df["date_month"] = df["date"].dt.month
    df["date_day"] = df["date"].dt.day
    df["date_hour"] = df["date"].dt.hour
    df["date_min"] = df["date"].dt.minute
    return df

In [3]:
shop_info = pd.read_csv('train_ccf_first_round_shop_info.csv')
user_shop_info = pd.read_csv('train_ccf_first_round_user_shop_behavior.csv')
testA_B = pd.read_csv('AB_test_evaluation_public.csv') 
shop_info['category_id'] = shop_info['category_id'].str.split('c_', expand=True)[1].astype(np.int32)

建立商铺类型字典
每个类型key对应若干个shop_id
暂时没用。。

In [17]:
cat_shop = defaultdict(lambda : [])
for line in shop_info.values:
    cat_shop[line[1]].append(line[0])

处理训练集特征

In [4]:
add_date_features(user_shop_info)
lb = LabelBinarizer()
train_wifi_id = []
train_wifi_signal = []
train_wifi_flag = []
for line in user_shop_info.values:
    wifi = sorted([wifi.split('|') for wifi in line[5].split(';')],key=lambda x:int(x[1]),reverse=True)[0]
    train_wifi_id.append(wifi[0])
    train_wifi_signal.append(wifi[1])
    train_wifi_flag.append(wifi[2])
    
user_shop_info['wifi_id'] = train_wifi_id
user_shop_info['wifi_signal'] = train_wifi_signal
user_shop_info['wifi_flag'] = train_wifi_flag
user_shop_info['wifi_flag'] = lb.fit_transform(user_shop_info['wifi_flag'])
user_shop_info['wifi_signal'] = user_shop_info['wifi_signal'].astype(np.int32)
user_shop_info['wifi_id'] = user_shop_info['wifi_id'].str.split('b_', expand=True)[1].astype(np.int32)


user_shop_info['date_day'] = user_shop_info['date_day'].astype(np.int32)
user_shop_info['date_hour'] = user_shop_info['date_hour'].astype(np.int32)
user_shop_info['date_min'] = user_shop_info['date_min'].astype(np.int32)
user_shop_info['longitude'] = user_shop_info['longitude'].astype(np.float32)
user_shop_info['latitude'] = user_shop_info['latitude'].astype(np.float32)


user_shop_info = user_shop_info.drop(['wifi_infos', 'date', 'time_stamp', 'date_year', 'date_month', 'user_id'], axis=1)
user_shop_info

Unnamed: 0,shop_id,longitude,latitude,date_day,date_hour,date_min,wifi_id,wifi_signal,wifi_flag
0,s_2871718,122.308289,32.088039,6,21,20,6396479,-55,0
1,s_2871718,122.308159,32.087971,6,21,20,6396479,-57,0
2,s_181637,117.365257,40.638214,2,13,10,2485110,-52,0
3,s_609470,121.134453,31.197416,13,12,30,30424471,-60,0
4,s_3816766,122.255867,31.351320,25,19,50,39004148,-58,0
5,s_3686420,122.595276,31.581963,28,13,10,21608835,-44,0
6,s_89874,104.815315,30.962847,11,14,20,975810,-33,1
7,s_939447,117.913712,39.589680,6,14,50,13067253,-60,0
8,s_939447,117.913765,39.589699,2,17,30,42938093,-34,0
9,s_3698335,117.914185,39.589539,2,18,30,25674236,-51,0


按是否是周末、是否是早上、是否是下午、是否是晚上、是否是饭点构建新特征

In [5]:
weekend = [4, 5, 6, 11, 12, 13, 18, 19, 20, 25, 26, 27]
is_weekend = []
is_morning = []
is_afternoon = []
is_evening = []
is_dinner = []
for line in user_shop_info.values:
    if line[3] in weekend:
        is_weekend.append(1)
    else:
        is_weekend.append(0)

for line in user_shop_info.values:
    if (line[4] >= 0) & (line[4] <= 8):
        is_morning.append(1)
    else:
        is_morning.append(0)
        
for line in user_shop_info.values:
    if (line[4] >= 9) & (line[4] <= 16):
        is_afternoon.append(1)
    else:
        is_afternoon.append(0)
        
for line in user_shop_info.values:
    if (line[4] >= 17) & (line[4] <= 23):
        is_evening.append(1)
    else:
        is_evening.append(0)

for line in user_shop_info.values:
    if ((line[4] >= 11) & (line[4] <= 13)) | ((line[4] >= 17) & (line[4] <= 19)):
        is_dinner.append(1)
    else:
        is_dinner.append(0)
        
user_shop_info['is_weekend'] = is_weekend
user_shop_info['is_morning'] = is_morning
user_shop_info['is_afternoon'] = is_afternoon
user_shop_info['is_evening'] = is_evening
user_shop_info['is_dinner'] = is_dinner
user_shop_info

Unnamed: 0,shop_id,longitude,latitude,date_day,date_hour,date_min,wifi_id,wifi_signal,wifi_flag,is_weekend,is_morning,is_afternoon,is_evening,is_dinner
0,s_2871718,122.308289,32.088039,6,21,20,6396479,-55,0,1,0,0,1,0
1,s_2871718,122.308159,32.087971,6,21,20,6396479,-57,0,1,0,0,1,0
2,s_181637,117.365257,40.638214,2,13,10,2485110,-52,0,0,0,1,0,1
3,s_609470,121.134453,31.197416,13,12,30,30424471,-60,0,1,0,1,0,1
4,s_3816766,122.255867,31.351320,25,19,50,39004148,-58,0,1,0,0,1,1
5,s_3686420,122.595276,31.581963,28,13,10,21608835,-44,0,0,0,1,0,1
6,s_89874,104.815315,30.962847,11,14,20,975810,-33,1,1,0,1,0,0
7,s_939447,117.913712,39.589680,6,14,50,13067253,-60,0,1,0,1,0,0
8,s_939447,117.913765,39.589699,2,17,30,42938093,-34,0,0,0,0,1,1
9,s_3698335,117.914185,39.589539,2,18,30,25674236,-51,0,0,0,0,1,1


获得第一周同一商场每个店铺的客流量，暂时没用

In [33]:
for mall_id in mall_ids[:1]:
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    shop_1_week = data_by_mall_id[(data_by_mall_id['date_day'] >=1) & (data_by_mall_id['date_day'] <=6)]


price_shop_1_week = defaultdict(lambda : 0)
for line in shop_1_week.values:
    price_shop_1_week[line[0]] = price_shop_1_week[line[0]] + 1
    
price_shop_1_week.values()

dict_values([43, 21, 24, 57, 3, 25, 11, 39, 100, 22, 89, 203, 6, 118, 364, 48, 11, 8, 36, 7, 68, 10, 155, 48, 7, 46, 21, 6, 4, 4, 41, 73, 36, 16, 14, 179, 38, 30, 23, 47, 24, 18, 46, 185, 21, 6, 67, 4, 7, 11, 14, 22, 21, 14, 37, 27, 18, 9, 6, 60, 29, 94, 25, 66, 61, 1, 32, 38, 37, 5, 3, 1, 44, 32, 30, 42, 159, 16, 8, 13, 14, 108, 10, 27, 8, 15, 4, 46, 66, 26, 20, 30, 8, 18, 6, 33, 25, 9, 2, 185, 62, 87, 22, 39, 50, 2, 86, 4, 10, 29, 38, 29, 57, 4, 13, 2, 28, 30, 12, 11])

处理测试集特征

In [6]:
add_date_features(testA_B)
lb = LabelBinarizer()
test_wifi_id = []
test_wifi_signal = []
test_wifi_flag = []
for line in testA_B.values:
    wifi = sorted([wifi.split('|') for wifi in line[6].split(';')],key=lambda x:int(x[1]),reverse=True)[0]
    test_wifi_id.append(wifi[0])
    test_wifi_signal.append(wifi[1])
    test_wifi_flag.append(wifi[2])


testA_B['wifi_id'] = test_wifi_id
testA_B['wifi_signal'] = test_wifi_signal
testA_B['wifi_flag'] = test_wifi_flag
testA_B['wifi_flag'] = lb.fit_transform(testA_B['wifi_flag'])
testA_B['wifi_id'] = testA_B['wifi_id'].str.split('b_', expand=True)[1].astype(np.int32)
testA_B['wifi_signal'] = testA_B['wifi_signal'].astype(np.int32)

testA_B['date_day'] = testA_B['date_day'].astype(np.int32)
testA_B['date_hour'] = testA_B['date_hour'].astype(np.int32)
testA_B['date_min'] = testA_B['date_min'].astype(np.int32)
testA_B['longitude'] = testA_B['longitude'].astype(np.float32)
testA_B['latitude'] = testA_B['latitude'].astype(np.float32)


testA_B = testA_B.drop(['wifi_infos', 'date', 'time_stamp', 'date_year', 'date_month', 'user_id'], axis=1)

testA_B

Unnamed: 0,row_id,mall_id,longitude,latitude,date_day,date_hour,date_min,wifi_id,wifi_signal,wifi_flag
0,118742,m_3916,122.141014,39.818848,5,13,0,37756289,-53,0
1,118743,m_5085,118.191910,32.855858,6,13,10,49853639,-68,0
2,118744,m_4033,119.192108,32.424667,6,17,40,40924464,-54,0
3,118745,m_4515,120.612198,34.055248,3,12,10,52869345,-42,0
4,118746,m_7168,116.861992,40.326859,2,20,40,29284311,-42,0
5,118747,m_4079,117.365334,40.638302,1,13,20,33919931,-49,0
6,118748,m_2333,120.494446,36.455074,6,12,20,16608380,-50,0
7,118749,m_1175,120.744362,30.814718,7,21,10,8324226,-48,0
8,118750,m_1409,122.309341,32.086750,3,10,30,11907579,-56,0
9,118751,m_3517,104.417603,31.286158,2,19,0,32048186,-44,0


按是否是周末、是否是早上、是否是下午、是否是晚上、是否是饭点构建新特征

In [24]:
weekend = [4, 5, 6, 11, 12, 13, 18, 19, 20, 25, 26, 27]
is_weekend = []
is_morning = []
is_afternoon = []
is_evening = []
is_dinner = []
for line in testA_B.values:
    if line[4] in weekend:
        is_weekend.append(1)
    else:
        is_weekend.append(0)

for line in testA_B.values:
    if (line[5] >= 0) & (line[5] <= 8):
        is_morning.append(1)
    else:
        is_morning.append(0)
        
for line in testA_B.values:
    if (line[5] >= 9) & (line[5] <= 16):
        is_afternoon.append(1)
    else:
        is_afternoon.append(0)
        
for line in testA_B.values:
    if (line[5] >= 17) & (line[5] <= 23):
        is_evening.append(1)
    else:
        is_evening.append(0)
        
for line in testA_B.values:
    if ((line[5] >= 11) & (line[5] <= 13)) | ((line[5] >= 17) & (line[5] <= 19)):
        is_dinner.append(1)
    else:
        is_dinner.append(0)
        
testA_B['is_weekend'] = is_weekend
testA_B['is_morning'] = is_morning
testA_B['is_afternoon'] = is_afternoon
testA_B['is_evening'] = is_evening
testA_B['is_dinner'] = is_dinner
testA_B

Unnamed: 0,row_id,mall_id,longitude,latitude,date_day,date_hour,date_min,wifi_id,wifi_signal,wifi_flag,is_weekend,is_morning,is_afternoon,is_evening,is_dinner
0,118742,m_3916,122.141014,39.818848,5,13,0,37756289,-53,0,1,0,1,0,1
1,118743,m_5085,118.191910,32.855858,6,13,10,49853639,-68,0,1,0,1,0,1
2,118744,m_4033,119.192108,32.424667,6,17,40,40924464,-54,0,1,0,0,1,1
3,118745,m_4515,120.612198,34.055248,3,12,10,52869345,-42,0,0,0,1,0,1
4,118746,m_7168,116.861992,40.326859,2,20,40,29284311,-42,0,0,0,0,1,0
5,118747,m_4079,117.365334,40.638302,1,13,20,33919931,-49,0,0,0,1,0,1
6,118748,m_2333,120.494446,36.455074,6,12,20,16608380,-50,0,1,0,1,0,1
7,118749,m_1175,120.744362,30.814718,7,21,10,8324226,-48,0,0,0,0,1,0
8,118750,m_1409,122.309341,32.086750,3,10,30,11907579,-56,0,0,0,1,0,0
9,118751,m_3517,104.417603,31.286158,2,19,0,32048186,-44,0,0,0,0,1,1


获得所有商场id

In [8]:
mall_ids = list( shop_info.ix[:,'mall_id'].unique() )

创建复合字典：每个商场的客流量price均值对应若干个同一商场的shop_id

In [15]:
price_shop = defaultdict(lambda : defaultdict(lambda : []))
for mall_id in mall_ids[:10]:
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    #data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    per_mall_avg_price = shop_ids_by_mall_id.price.mean()
    price_shop[per_mall_avg_price][mall_id].append(shop_ids_by_mall_id.shop_id.values)
    
price_shop

defaultdict(<function __main__.<lambda>>,
            {48.482352941176472: defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
                         {'m_3739': [array(['s_462', 's_4244', 's_4550', 's_35936', 's_43983', 's_44872',
                                  's_68678', 's_71846', 's_133772', 's_137744', 's_161194',
                                  's_212532', 's_240201', 's_242485', 's_246939', 's_262507',
                                  's_350316', 's_359471', 's_386146', 's_386147', 's_389945',
                                  's_399584', 's_410705', 's_412876', 's_413128', 's_414483',
                                  's_417391', 's_426191', 's_435292', 's_441326', 's_442713',
                                  's_591054', 's_598913', 's_599777', 's_658222', 's_659057',
                                  's_725240', 's_731626', 's_747267', 's_748798', 's_774954',
                                  's_816804', 's_824042', 's_831973', 's_869642', 's_878890',
         

knn模型预测

In [167]:
count = 0

result = pd.DataFrame()
row_and_shop_list_resuslt = pd.DataFrame()

#一共有97个mall_id
for mall_id in mall_ids[:98]:
    #每个mall训练前清理结果列表，初始化index，防止不同mall中样本数量不同出错
    row_and_shop_list_resuslt.drop(row_and_shop_list_resuslt.index,inplace=True)
    count1 = 0 
    
    #每10个mall 输出一次
    count = count + 1
    if(count % 10 == 0):
        print(count)
    
    #获得同一个mall_id中的所有训练集样本信息
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    data_by_mall_id = data_by_mall_id.drop(['date_day', 'date_hour', 'date_min'], axis=1)
    
    #获得同一个mall_id中的所有测试集样本信息
    test_data_by_mall_id = testA_B[testA_B['mall_id'] == mall_id]
    #针对每个mall_id创建对应的预测结果存放列表
    row_and_shop_list_resuslt['row_id'] = test_data_by_mall_id.row_id
    test_data_by_mall_id = test_data_by_mall_id.drop(['mall_id', 'row_id', 'date_day', 'date_hour', 'date_min'], axis=1)
    
    #获得特征和目标值
    X = data_by_mall_id.ix[:, 1:]
    y = data_by_mall_id.ix[:, :1]
#    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.19)  
    clf = neighbors.KNeighborsClassifier(algorithm='auto', n_neighbors=4)  
    clf.fit(x_train, y_train)
    
 #注释部分将训练集的百分之80取样训练模型，剩下百分之20作为验证集，计算线下分数   
 #   answer1 = clf.predict(x_valid)
    
 #   for j in range(len(answer1)):
 #       if answer1[j] == np.array(y_valid)[j]:
 #           count1 = count1 + 1

 #   print(1.0 * count1 / len(answer1))   
 #   print(mall_id)
 #   print(answer1)

    
    #将每个mall_id的预测结果聚合
    row_and_shop_list_resuslt['shop_id'] = pd.Series(clf.predict(test_data_by_mall_id)).values
    
    result = result.append(row_and_shop_list_resuslt, ignore_index=True)
        



10
20
30
40
50
60
70
80
90


m_7800号商场误差太大，单独调出来分析。。。。

In [153]:
shop_ids_by_mall_id = shop_info[shop_info.mall_id == 'm_7800']
data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
data_by_mall_id.shop_id.value_counts()

s_685058     1814
s_684547      971
s_683678      762
s_683985      734
s_683671      726
s_684245      630
s_683674      617
s_683832      599
s_683821      598
s_809873      542
s_683053      521
s_682074      472
s_684555      467
s_696563      454
s_698556      438
s_685227      424
s_684235      423
s_3382317     410
s_679110      394
s_539726      380
s_662370      342
s_683217      312
s_461325      304
s_681760      288
s_521874      278
s_2230096     259
s_3421010     242
s_2248290     236
s_712117      228
s_466068      228
             ... 
s_570408       66
s_1114306      62
s_696015       58
s_3101804      57
s_557771       57
s_1488950      57
s_478200       56
s_454451       51
s_735757       48
s_3257365      46
s_1444403      41
s_1093389      41
s_709290       36
s_3284083      34
s_3328792      34
s_3832595      34
s_528544       32
s_622845       31
s_546469       30
s_3744077      28
s_1462898      27
s_473565       27
s_769794       26
s_3791261      25
s_534067  

In [164]:
count1 = 0
result = pd.DataFrame()
row_and_shop_list_resuslt = pd.DataFrame()
row_and_shop_list_resuslt.drop(row_and_shop_list_resuslt.index,inplace=True)

shop_ids_by_mall_id = shop_info[shop_info.mall_id == 'm_7800']
data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
data_by_mall_id = data_by_mall_id.drop(['date_day', 'date_hour', 'date_min'], axis=1)
    
#test_data_by_mall_id = testA_B[testA_B['mall_id'] == mall_id]
#row_and_shop_list_resuslt['row_id'] = test_data_by_mall_id.row_id
    
#test_data_by_mall_id = test_data_by_mall_id.drop(['mall_id', 'row_id', 'date_day', 'date_hour', 'date_min'], axis=1)
X = data_by_mall_id.ix[:, 1:]
y = data_by_mall_id.ix[:, :1]
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.19, random_state=1)  
    
clf = lgb.LGBMClassifier(silent=False, n_estimators=40, learning_rate=0.13, subsample=0.8).fit(x_train, y_train)
#clf = GradientBoostingClassifier(n_estimators=10, subsample=0.7).fit(x_train, y_train)
answer1 = clf.predict(x_valid)
    
for j in range(len(answer1)):
    if answer1[j] == np.array(y_valid)[j]:
        count1 = count1 + 1

print(1.0 * count1 / len(answer1))    

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.5986666666666667


使用随机森林模型

In [11]:
count = 0
result = pd.DataFrame()

for mall_id in mall_ids[:1]:
    count1 = 0 
    
    count = count + 1
    if(count % 10 == 0):
        print(count)
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    
    X = data_by_mall_id.ix[:, 1:]
    y = data_by_mall_id.ix[:, :1]
    y['shop_id'] = y['shop_id'].str.split('s_', expand=True)[1].astype(np.int32)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.19)
    
    forest = RandomForestClassifier(n_estimators=50, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
    multi_target_forest.fit(x_train, y_train)
    answer1 = multi_target_forest.predict(x_valid)
    
    for j in range(len(answer1)):
        if answer1[j] == np.array(y_valid)[j]:
            count1 = count1 + 1

    print(1.0 * count1 / len(answer1)) 




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.777668759811617


字符串拼接，暂时没用。。。

In [53]:
pd.Series(['s_']).str.cat([pd.Series(a.shop_id[0].astype(np.str))], sep='')

0    s_633878
dtype: object

使用XGboost模型

In [18]:
count = 0
result = pd.DataFrame()
for mall_id in mall_ids[:1]:
    count1 = 0 
    
    count = count + 1
    if(count % 10 == 0):
        print(count)
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    
    #test_data_by_mall_id = testA_B[testA_B['mall_id'] == mall_id]
    #test_data_by_mall_id = test_data_by_mall_id.drop(['mall_id', 'row_id'], axis=1)
    X = data_by_mall_id.ix[:, 1:]
    y = data_by_mall_id.ix[:, :1]
    y['shop_id'] = y['shop_id'].str.split('s_', expand=True)[1].astype(np.int32)
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.19)
    
    bst = xgb.XGBClassifier(n_estimators=10).fit(x_train, y_train)
    
    answer1 = bst.predict(x_valid)
    
    for j in range(len(answer1)):
        if answer1[j] == np.array(y_valid)[j]:
            count1 = count1 + 1

    print(1.0 * count1 / len(answer1)) 
        #print(result.append(pd.Series(mid_result), ignore_index=True))    
        
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.7498037676609105


使用lightGBM模型 作为线上预测结果

In [25]:
count = 0

result = pd.DataFrame()
row_and_shop_list_resuslt = pd.DataFrame()
for mall_id in mall_ids[:98]:
    row_and_shop_list_resuslt.drop(row_and_shop_list_resuslt.index,inplace=True)
    count1 = 0 
    
    count = count + 1
    if(count % 10 == 0):
        print(count)
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    data_by_mall_id = data_by_mall_id.drop(['date_day', 'date_hour', 'date_min'], axis=1)
    
    test_data_by_mall_id = testA_B[testA_B['mall_id'] == mall_id]
    row_and_shop_list_resuslt['row_id'] = test_data_by_mall_id.row_id
    
    test_data_by_mall_id = test_data_by_mall_id.drop(['mall_id', 'row_id', 'date_day', 'date_hour', 'date_min'], axis=1)
    X = data_by_mall_id.ix[:, 1:]
    y = data_by_mall_id.ix[:, :1]   
    
    lgb_pred = lgb.LGBMClassifier( silent=False, n_estimators=30, learning_rate=0.13).fit(X, y)   
    
    row_and_shop_list_resuslt['shop_id'] = pd.Series(lgb_pred.predict(test_data_by_mall_id)).values
    result = result.append(row_and_shop_list_resuslt, ignore_index=True)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


10


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


20


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


30


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


40


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


50


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


60


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


70


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


80


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


90


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


写入文件

In [26]:
result

Unnamed: 0,row_id,shop_id
0,118979,s_1756
1,119032,s_9147
2,119068,s_9147
3,119149,s_287617
4,119155,s_455479
5,119182,s_405021
6,119215,s_298312
7,119279,s_9147
8,119295,s_149642
9,119485,s_149642


In [27]:
result.to_csv('result.csv', index=False)

多个模型结果按次数融合，正在写。

In [None]:
count = 0
for mall_id in mall_ids[:10]:
    count1 = 0
    count = count + 1
    if(count % 10 == 0):
        print(count)
    shop_ids_by_mall_id = shop_info[shop_info.mall_id == mall_id]
    data_by_mall_id = user_shop_info[user_shop_info['shop_id'].isin(shop_ids_by_mall_id['shop_id'])]
    data_by_mall_id = data_by_mall_id.drop(['date_day', 'date_hour', 'date_min'], axis=1)

    X = data_by_mall_id.ix[:, 1:]
    
    y = data_by_mall_id.ix[:, :1]
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.19)  
    clf = neighbors.KNeighborsClassifier(algorithm='auto', n_neighbors=4).fit(x_train, y_train)
    
    clf_1 = lgb.LGBMClassifier(silent=False, n_estimators=40, learning_rate=0.13, subsample=0.8).fit(x_train, y_train)

    
    answer1 = clf.predict(x_valid)
    
    for j in range(len(answer1)):
        if answer1[j] == np.array(y_valid)[j]:
            count1 = count1 + 1

    print(1.0 * count1 / len(answer1))