In [42]:
!pwd

/nmnt/media/home/amir/myhack/notebooks


In [43]:
!ls -la ../data/

total 28459556
drwxr-xr-x 3 amir amir       4096 сент. 16 15:53 .
drwxrwxr-x 7 amir amir       4096 сент. 17 11:58 ..
-rw-r--r-- 1 amir amir    1252886 сент. 16 03:09 hackathon_tosubmit.tsv
drwxr-xr-x 2 amir amir       4096 сент. 16 15:53 .ipynb_checkpoints
-rw-r--r-- 1 amir amir        628 сент. 16 02:43 test_col_dtypes.json
-rw-r--r-- 1 amir amir 2088371430 сент. 16 04:21 test_kazan_features.tsv
-rw-r--r-- 1 amir amir    1251787 сент. 16 03:08 test_kazan_netatmo.tsv
-rw-r--r-- 1 amir amir 2226797594 сент. 16 04:24 test_msk_features.tsv
-rw-r--r-- 1 amir amir   73498980 сент. 16 03:08 test_msk_netatmo.tsv
-rw-r--r-- 1 amir amir 2288135471 сент. 16 04:18 test_spb_features.tsv
-rw-r--r-- 1 amir amir   11756236 сент. 16 03:08 test_spb_netatmo.tsv
-rw-r--r-- 1 amir amir        732 сент. 16 02:43 train_col_dtypes.json
-rw-r--r-- 1 amir amir    5955016 сент. 16 03:08 train_kazan_netatmo.tsv
-rw-r--r-- 1 amir amir 7800656883 сент. 16 04:42 train_kazan.tsv
-rw-r--r-- 1 amir ami

In [None]:
!mkdir intermediate_data

In [None]:
!mkdir preprocessed_data

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import json
%matplotlib inline

pd.options.display.max_columns = 50

In [24]:
from sklearn.neighbors import KDTree
def preprocess_netatmo(df):
    """organizes netatmo stations into KDTrees for each distinct time frame"""
    
    df_by_hour = df.groupby('hour_hash')
    anns = {}
    for hour, stations_group in df_by_hour:
        anns[hour] = KDTree(stations_group[["netatmo_latitude","netatmo_longitude"]].values, metric='minkowski', p=2)
    
    # convert groupby to dict to get faster queries
    df_by_hour = {group:stations_group for group, stations_group in df_by_hour}
    
    return df_by_hour, anns

In [25]:
def extract_features_from_user(group, netatmo_groups, netatmo_anns):
    features = {}
    #square features
    square = {col: group[col].iloc[0] for col in group.columns}
    
    features['square_lat'] = square['sq_lat']
    features['square_lon'] = square['sq_lon']
    features['time_of_day'] = square['day_hour']
    
#     features['square_lat'] = square['sq_x']
#     features['square_lon'] = square['sq_y']
    
    # и запросы на ближ соседей по этим 
    
    #signal strength
    features['signal_mean'] = group['SignalStrength'].mean()
    features['signal_median'] = group['SignalStrength'].median()
    features['signal_var'] = group['SignalStrength'].var()

    #features for each user
    group_by_user = group.groupby('u_hashed')
    group_by_user.apply(lambda group: group['ulat'].var() + group['ulon'].var())
    
    features['num_users'] = len(group_by_user)
    features['mean_entries_per_user'] = group_by_user.apply(len).mean()
    features['var_of_entries_per_user'] = group_by_user.apply(len).var()
    features['mean_user_signal_var'] = group_by_user.apply(
        lambda user_entries: user_entries['SignalStrength'].var()).mean()
    
    #netatmo features
    if square['hour_hash'] in netatmo_groups:
        local_stations, neighbors = netatmo_groups[square['hour_hash']], netatmo_anns[square['hour_hash']]
        
        [distances], [neighbor_ids] = neighbors.query([(square['sq_lat'], square['sq_lon'])], k=50)

        neighbor_stations = local_stations.iloc[neighbor_ids]

        features['distance_to_closest_station'] = np.min(distances)
        features['mean_distance_to_station'] = np.mean(distances)
        
        features['distance_to_closest_station_pers_10'] = np.percentile(distances, 10)
        features['distance_to_closest_station_pers_20'] = np.percentile(distances, 20)
        features['distance_to_closest_station_pers_30'] = np.percentile(distances, 30)
        features['distance_to_closest_station_pers_50'] = np.percentile(distances, 50)
        features['distance_to_closest_station_pers_70'] = np.percentile(distances, 70)
        features['distance_to_closest_station_pers_80'] = np.percentile(distances, 80)
        
        features['distance_to_3_closest_station'] = np.mean(sorted(distances)[:5])
        
        cols_netatmo = ['netatmo_wind_direction_deg',
                    'netatmo_wind_gust_direction_deg',
                    'netatmo_wind_gust_speed_kmh',
                    'netatmo_wind_speed_kmh',
                    'netatmo_pressure_mbar',
                    'netatmo_sum_rain_1h',
                    'netatmo_sum_rain_24h',
                    'netatmo_humidity_percent',
                    'netatmo_temperature_c',]

        old_cols_netatmo = ['netatmo_pressure_mbar', 'netatmo_temperature_c', 
                            'netatmo_sum_rain_24h', 'netatmo_humidity_percent', 
                            "netatmo_wind_speed_kmh", "netatmo_wind_gust_speed_kmh"]
        for colname in cols_netatmo:
            col = neighbor_stations[colname].dropna()
            if len(col)!=0:
                features[colname+"_mean"], features[colname+"_median"], features[colname+"_std"], \
                features[colname+"_01"], features[colname+"_02"], features[colname+"_03"], \
                features[colname+"_04"], features[colname+"_065"], features[colname+"_08"] = \
                col.mean(), col.median(), col.var(),\
                col.quantile(q=0.1), col.quantile(q=0.2), \
                col.quantile(q=0.3), col.quantile(q=0.4), col.quantile(q=0.65), col.quantile(q=0.8)
            else:
                features[colname+"_mean"], features[colname+"_median"], features[colname+"_std"], \
                features[colname+"_01"], features[colname+"_02"], features[colname+"_03"], \
                features[colname+"_04"], features[colname+"_065"], features[colname+"_08"]  = \
                np.nan, np.nan, np.nan,  np.nan,  np.nan, np.nan, np.nan, np.nan, np.nan

    return features

In [26]:
for city in [77, 78, 16]:
    if city == 77:
        TRAIN_PATH = "../data/train_msk.tsv"
        NETATMO_PATH = "../data/train_msk_netatmo.tsv"
        TEST_PATH = "../data/test_msk_features.tsv"
        TEST_NETATMO_PATH = "../data/test_msk_netatmo.tsv"

        CITY_PREDICTIONS_PATH = "./intermediate_data/prediction_msk.csv"
    if city == 78:
        TRAIN_PATH = "../data/train_spb.tsv"
        NETATMO_PATH = "../data/train_spb_netatmo.tsv"
        TEST_PATH = "../data/test_spb_features.tsv"
        TEST_NETATMO_PATH = "../data/test_spb_netatmo.tsv"

        CITY_PREDICTIONS_PATH = "./intermediate_data/prediction_spb.csv"
    if city == 16:
        TRAIN_PATH = "../data/train_kazan.tsv"
        NETATMO_PATH = "../data/train_kazan_netatmo.tsv"
        TEST_PATH = "../data/test_kazan_features.tsv"
        TEST_NETATMO_PATH = "../data/test_kazan_netatmo.tsv"

        CITY_PREDICTIONS_PATH = "./intermediate_data/prediction_kazan.csv"

    train = pd.read_csv(TRAIN_PATH, sep='\t',dtype=json.load(open("../data/train_col_dtypes.json")))

    netatmo_groups,netatmo_anns = preprocess_netatmo(pd.read_csv(NETATMO_PATH,na_values="None",
                                                                 sep='\t',dtype={'hour_hash':"uint64"}))

    from tqdm import tqdm
    groupby = train.groupby(["city_code","sq_x","sq_y","hour_hash"])

    X, y, block_ids = [], [], []


    for block_id in tqdm(groupby.groups):
        group = groupby.get_group(block_id)
        X.append(extract_features_from_user(group, netatmo_groups, netatmo_anns))
        y.append(group.iloc[0]['rain'])
        block_ids.append(block_id+(group.iloc[0]["hours_since"],))

    X = pd.DataFrame(X).fillna(-999.)
    y = np.array(y)
    block_ids = pd.DataFrame(block_ids,columns=["city_code", "sq_x", "sq_y", "hour_hash", "hours_since"])

    test = pd.read_csv(TEST_PATH, sep='\t',dtype=json.load(open("../data/test_col_dtypes.json")),)
    test_groupby = test.groupby(["city_code","sq_x","sq_y","hour_hash"])
    test_netatmo_groups,test_netatmo_anns = preprocess_netatmo(pd.read_csv(TEST_NETATMO_PATH,na_values="None",
                                                                           sep='\t',dtype={'hour_hash':"uint64"}))

    X_test,test_block_ids = [],[]
    for block_id in tqdm(test_groupby.groups):
        group = test_groupby.get_group(block_id)
        X_test.append(extract_features_from_user(group,test_netatmo_groups,test_netatmo_anns))
        test_block_ids.append(block_id)

    X_test = pd.DataFrame(X_test)
    test_block_ids = pd.DataFrame(test_block_ids,columns=["city_code","sq_x","sq_y","hour_hash"])

    if city == 77:
        tmp1 = "./preprocessed_data/msk.csv"
        tmp2 = "./preprocessed_data/msk_test.csv"
        tmp3 = "./preprocessed_data/msk_block.csv"
        tmp4 = "./preprocessed_data/msk_block_test.csv"

    if city == 78:
        tmp1 = "./preprocessed_data/spb.csv"
        tmp2 = "./preprocessed_data/spb_test.csv"
        tmp3 = "./preprocessed_data/spb_block.csv"
        tmp4 = "./preprocessed_data/spb_block_test.csv"

    if city == 16:
        tmp1 = "./preprocessed_data/kazan.csv"
        tmp2 = "./preprocessed_data/kazan_test.csv"
        tmp3 = "./preprocessed_data/kazan_block.csv"
        tmp4 = "./preprocessed_data/kazan_block_test.csv"

    data = X.copy()
    data["target"] = y
    data.to_csv(tmp1)
    X_test.to_csv(tmp2)
    block_ids.to_csv(tmp3)
    test_block_ids.to_csv(tmp4)

100%|██████████| 45617/45617 [16:46<00:00, 45.30it/s]
100%|██████████| 14331/14331 [04:59<00:00, 62.20it/s]
100%|██████████| 32803/32803 [12:18<00:00, 44.41it/s]
100%|██████████| 10145/10145 [03:32<00:00, 47.67it/s]
100%|██████████| 46281/46281 [16:37<00:00, 46.41it/s]
100%|██████████| 14836/14836 [04:54<00:00, 50.33it/s]


In [27]:
!ls ./preprocessed_data

kazan_block.csv       kazan_test.csv	  msk.csv	 spb_block_test.csv
kazan_block_test.csv  msk_block.csv	  msk_test.csv	 spb.csv
kazan.csv	      msk_block_test.csv  spb_block.csv  spb_test.csv


### Загрузим препроцешенные данныые

In [28]:
import pandas as pd
import numpy as np

X_all = pd.concat(
    [pd.read_csv(fname) for fname in ("./preprocessed_data/msk.csv",
                                      "./preprocessed_data/spb.csv",
                                      "./preprocessed_data/kazan.csv")],
    ignore_index=True,
)

y = X_all['target']
X_all.drop(['Unnamed: 0', 'target'], axis=1, inplace=True)

X_all.shape

(124701, 100)

In [29]:
block_ids = pd.concat(
    [pd.read_csv(fname) for fname in ("./preprocessed_data/msk_block.csv",
                                      "./preprocessed_data/spb_block.csv",
                                      "./preprocessed_data/kazan_block.csv",
                                      )],
    ignore_index=True,
)
block_ids.drop(['Unnamed: 0'], axis=1, inplace=True)

In [30]:
X_all['city'] = block_ids['city_code']

In [32]:
list(X_all.columns).index('time_of_day'), list(X_all.columns).index('city')

(98, 100)

### обучим CatBoost

In [33]:
from catboost import CatBoostClassifier
# вставим (98, 100) 
model = CatBoostClassifier(learning_rate=0.02, iterations=109,
                           class_weights = [1, 2], random_seed = 52, 
                           eval_metric='AUC',).fit(X_all, y, 
                                                   verbose=True,
                                                   cat_features = [98, 100],)

Borders for float features generated
0:	learn 0.7706159466	total: 138ms	remaining: 14.9s
1:	learn 0.7804175346	total: 276ms	remaining: 14.8s
2:	learn 0.7851410808	total: 425ms	remaining: 15s
3:	learn 0.787155892	total: 580ms	remaining: 15.2s
4:	learn 0.7915334253	total: 754ms	remaining: 15.7s
5:	learn 0.7926808592	total: 905ms	remaining: 15.5s
6:	learn 0.7936131249	total: 1.07s	remaining: 15.6s
7:	learn 0.7949927839	total: 1.26s	remaining: 15.9s
8:	learn 0.7974880296	total: 1.46s	remaining: 16.2s
9:	learn 0.7993195087	total: 1.66s	remaining: 16.4s
10:	learn 0.7996597557	total: 1.81s	remaining: 16.2s
11:	learn 0.8009218291	total: 1.98s	remaining: 16s
12:	learn 0.8022546995	total: 2.16s	remaining: 16s
13:	learn 0.803227579	total: 2.33s	remaining: 15.8s
14:	learn 0.8033876718	total: 2.52s	remaining: 15.8s
15:	learn 0.8042911122	total: 2.71s	remaining: 15.8s
16:	learn 0.8046638723	total: 2.9s	remaining: 15.7s
17:	learn 0.8056323008	total: 3.1s	remaining: 15.7s
18:	learn 0.8059195851	total:

In [34]:
# kazan
X_test = pd.read_csv('./preprocessed_data/kazan_test.csv')
test_block_ids = pd.read_csv('./preprocessed_data/kazan_block_test.csv')
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)
test_block_ids.drop(['Unnamed: 0'], axis=1, inplace=True)
# This code saves the prediction for one city.
prediction_for_one_city = test_block_ids.copy()
X_test['city'] = prediction_for_one_city['city_code']
prediction_for_one_city["prediction"] = model.predict_proba(X_test)[:,1]
prediction_for_one_city.to_csv('./intermediate_data/prediction_kazan.csv')

prediction_for_one_city.head()

Unnamed: 0,city_code,sq_x,sq_y,hour_hash,prediction
0,16,-29,-28,165421888901676174,0.121089
1,16,-29,-28,2204207480854218100,0.184451
2,16,-29,-28,3447428841816240483,0.182736
3,16,-29,-22,2204207480854218100,0.184451
4,16,-29,-22,2369809296117772715,0.142086


In [35]:
# msk 
X_test = pd.read_csv('./preprocessed_data/msk_test.csv')
test_block_ids = pd.read_csv('./preprocessed_data/msk_block_test.csv')
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)
test_block_ids.drop(['Unnamed: 0'], axis=1, inplace=True)
X_test['city'] = prediction_for_one_city['city_code']
#This code saves the prediction for one city.
prediction_for_one_city = test_block_ids.copy()
prediction_for_one_city["prediction"] = model.predict_proba(X_test)[:,1]
prediction_for_one_city.to_csv('./intermediate_data/prediction_msk.csv')

prediction_for_one_city.head()

Unnamed: 0,city_code,sq_x,sq_y,hour_hash,prediction
0,77,-29,-20,10001350720559672051,0.133469
1,77,-29,-20,16943943785816561037,0.251952
2,77,-29,-13,6709826676016537280,0.097269
3,77,-29,-6,4995953818610197922,0.265422
4,77,-29,-6,6709826676016537280,0.178296


In [36]:
# spb
X_test = pd.read_csv('./preprocessed_data/spb_test.csv')
test_block_ids = pd.read_csv('./preprocessed_data/spb_block_test.csv')
X_test.drop(['Unnamed: 0'], axis=1, inplace=True)
test_block_ids.drop(['Unnamed: 0'], axis=1, inplace=True)
X_test['city'] = prediction_for_one_city['city_code']
#This code saves the prediction for one city.
prediction_for_one_city = test_block_ids.copy()
prediction_for_one_city["prediction"] = model.predict_proba(X_test)[:,1]
prediction_for_one_city.to_csv('./intermediate_data/prediction_spb.csv')

prediction_for_one_city.head()

Unnamed: 0,city_code,sq_x,sq_y,hour_hash,prediction
0,78,-29,18,11749216312848287879,0.100555
1,78,-28,-29,3349487805510353524,0.095179
2,78,-28,-11,2557319126692190455,0.128598
3,78,-28,18,11749216312848287879,0.09878
4,78,-27,-24,2557319126692190455,0.128796


### Merge

In [48]:
import pandas as pd

predictions = pd.concat(
    [pd.read_csv(fname,index_col=0) for fname in ("./intermediate_data/prediction_kazan.csv",
                                                  "./intermediate_data/prediction_spb.csv",
                                                  "./intermediate_data/prediction_msk.csv")],
    ignore_index=True
)
blocks = pd.read_csv("../data/hackathon_tosubmit.tsv",sep='\t')
assert len(predictions) == len(blocks),"Predictions don't match blocks. Sumbit at your own risk."

merged = pd.merge(blocks,predictions,how='left',on=["sq_x","sq_y","hour_hash"])
assert not np.isnan(merged.prediction).any(), "some predictions are missing. Sumbit at your own risk."

In [49]:
merged[['id','prediction']].to_csv("./intermediate_data/amir_78.csv",sep=',',index=False,header=False)

In [50]:
merged.head()

Unnamed: 0,hour_hash,id,sq_x,sq_y,city_code,prediction
0,67746694261088370,1,-21,25,77,0.190149
1,67746694261088370,2,-19,25,77,0.190747
2,67746694261088370,3,-16,23,77,0.174109
3,67746694261088370,4,-15,22,77,0.173733
4,67746694261088370,5,-15,24,77,0.171


### Загрузим предсказания моего тиммейта и взвесим их с нашим решением

In [51]:
ii_catboost = pd.read_csv('./intermediate_data/try18_iii_all.csv', header=None)
ii_catboost = ii_catboost[1] 

In [52]:
ii_xgbboost = pd.read_csv('./intermediate_data/try21_iiii_all_xgb.csv', header=None)
ii_xgbboost = ii_xgbboost[1] 

In [53]:
ii_cat2boost = pd.read_csv('./intermediate_data/try25_iii_catboost_new_features.csv', header=None)
ii_cat2boost = ii_cat2boost[1] 

In [54]:

merged.prediction = (merged.prediction * 3 +\
                     (ii_catboost * 1.25 + ii_cat2boost * 1.25)  +\
                     ii_xgbboost * 2) 


In [55]:
merged[['id','prediction']].to_csv("./intermediate_data/submit_a+i+i+i_BEST_EVER.csv",sep=',',index=False,header=False)

In [56]:
merged.head()

Unnamed: 0,hour_hash,id,sq_x,sq_y,city_code,prediction
0,67746694261088370,1,-21,25,77,0.924523
1,67746694261088370,2,-19,25,77,0.925933
2,67746694261088370,3,-16,23,77,0.88249
3,67746694261088370,4,-15,22,77,0.879246
4,67746694261088370,5,-15,24,77,0.864905
