### Filtering features from Low Variance, High Correlation and Empty Values Columns

In [1]:
import sys
sys.path.append('../src/')
import filter_features as ff
import pandas as pd

In [7]:
df = pd.read_parquet('../data/processed/train_features.parquet')
df.shape

(213451, 725)

In [4]:
empty_columns = ff.get_empty_columns(df, threshold=0.999)
len(empty_columns)

100%|██████████| 725/725 [00:01<00:00, 688.42it/s]


155

### 1. Getting and loading Feature Importance file

In [3]:
model_path = '/home/jovyan/projects/kaggle-airbnb/models/model_2021_09_06_14_22_11.cbm'
x_pickle_path = '/home/jovyan/projects/kaggle-airbnb/data/processed/x_test.pickle'
y_pickle_path = '/home/jovyan/projects/kaggle-airbnb/data/processed/y_test.pickle'
cat_features = [
    'gender',
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
    'dow_registered',
    'hr_registered',
    'age_group',
    'dow_registered',
    'day_registered',
    'month_registered',
    'year_registered',
]
save_to = '/home/jovyan/projects/kaggle-airbnb/data/processed/features_importance.csv'

In [4]:
ff.generate_feature_importance_file(model_path, x_pickle_path, y_pickle_path, cat_features, save_to)

In [5]:
fi = ff.get_normalized_feature_weights(save_to)
fi = [{'feature_name': k, 'weight': v} for k, v in fi.items()]
fi = pd.DataFrame(fi)
fi['w_sum'] = fi.weight.cumsum()
fi.shape

(722, 3)

In [8]:
fi.head(10)

Unnamed: 0,feature_name,weight,w_sum
0,age,0.346725,0.346725
1,ai_pending_booking_request_pending,0.104108,0.450833
2,age_group,0.05028,0.501113
3,signup_method,0.033054,0.534167
4,first_affiliate_tracked,0.028491,0.562658
5,month_registered,0.027786,0.590445
6,gender,0.027434,0.617878
7,signup_flow,0.02427,0.642149
8,year_registered,0.023314,0.665463
9,count_tos_confirm_-unknown-_-unknown-,0.022732,0.688195


In [7]:
fi[fi.feature_name == 'gender']

Unnamed: 0,feature_name,weight,w_sum
6,gender,0.027434,0.617878


### 1.1 If we remove empty_cols columns we would lose approximately 1.4% of feature importance

In [9]:
fi[fi.feature_name.isin(empty_columns)].weight.sum()

0.00014591925762891996

### 2. Generating Low variance columns

In [10]:
low_variance_columns = ff.get_low_variance_columns(df, threshold=0.0001)
len(low_variance_columns)

49

In [11]:
fi[fi.feature_name.isin(low_variance_columns)].weight.sum()

0.006943689345756206

In [12]:
remove_columns = set(empty_columns).union(set(low_variance_columns))
len(remove_columns)

186

In [13]:
'country_destination' in remove_columns

False

### 3. Generating highly correlated columns set

In [14]:
fi_weights = ff.get_normalized_feature_weights(save_to)
len(fi_weights), fi_weights['age']

(722, 0.3467250418657691)

In [15]:
df.drop(remove_columns, axis=1, inplace=True)
df.shape

(213451, 539)

In [16]:
cdf = ff.get_correlated_features_pairs(df, fi_weights, threshold=0.95)
cdf.shape

100%|██████████| 528/528 [00:18<00:00, 28.84it/s]


(174, 3)

In [17]:
cdf.head()

Unnamed: 0,col1,col2,coef
135,count_acculynk_load_pin_pad_-unknown-_-unknown-,count_acculynk_pin_pad_inactive_-unknown-_-unk...,1.0
143,count_acculynk_pin_pad_inactive_-unknown-_-unk...,count_acculynk_session_obtained_-unknown-_-unk...,1.0
137,count_acculynk_pin_pad_inactive_-unknown-_-unk...,count_acculynk_bin_check_success_-unknown-_-un...,1.0
16,ai_open_graph_setting_-unknown-_-unknown-,ai_click_click_contact_host,1.0
13,ai_domains_-unknown-_-unknown-,ai_index_view_listing_descriptions,1.0


In [18]:
ccs = ff.get_connected_components(cdf)
len(ccs)

11

In [19]:
ccs[-5]

['age', 'age_group']

In [20]:
corr = abs(df.corr())

In [21]:
ix = -5
corr[ccs[ix]][corr.index.isin(ccs[ix]) | corr.index.isin(ccs[ix])]

Unnamed: 0,age,age_group
age,1.0,0.988654
age_group,0.988654,1.0


In [25]:
fi[fi.feature_name.isin(ccs[ix])].head()

Unnamed: 0,feature_name,weight,w_sum
186,count_coupon_field_focus_click_coupon_field_focus,2.1e-05,0.999797
342,count_apply_coupon_click_click_apply_coupon_click,0.0,1.0
352,count_apply_coupon_error_type_-unknown-_-unknown-,0.0,1.0
391,count_apply_coupon_error_click_apply_coupon_error,0.0,1.0


In [27]:
# len(highly_correlated_features)

In [22]:
highly_correlated_features = ff.get_highly_correlated_removal_candidates(ccs, fi_weights, verbose=1)

started selection of removal candidates:
ccs and fi_weights sizes: (11, 722)
removing all but first in each group:

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
count_acculynk_pin_pad_inactive_-unknown-_-unknown-
count_acculynk_bin_check_success_-unknown-_-unknown-
count_acculynk_load_pin_pad_-unknown-_-unknown-
count_acculynk_session_obtained_-unknown-_-unknown-
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ai_login_modal_view_login_modal
ai_new_view_list_your_space
ai_qt_reply_v2_submit_send_message
ai_click_click_book_it
ai_similar_listings_v2_nan_nan
ai_click_click_contact_host
ai_payment_instruments_data_payment_instruments
size
ai_faq_-unknown-_-unknown-
ai_index_view_listing_descriptions
ai_notifications_data_notifications
ai_impressions_view_p4
seassion_length
ai_update_submit_update_listing_description
ai_domains_-unknown-_-unknown-
ai_complete_redirect_-unknown-_-unknown-
ai_signup_login_view_signup_login_page
ai_referrer_status_-unknown-_-unknown-
ai_agree_terms_check_-unknown-_-unknown-
ai_create_sub

In [23]:
remove_columns = remove_columns.union(set(highly_correlated_features))
len(remove_columns)

248

In [None]:
'country_destination' in remove_columns

NameError: name 'remove_columns' is not defined

### 4. Dropping filtered columns and saving data. Repeating same process for test set

In [27]:
remove_columns = list(set(remove_columns).intersection(set(df)))
df.drop(remove_columns, axis=1, inplace=True)
df.shape

(213451, 477)

In [28]:
test_df = pd.read_parquet('../data/processed/test_features.parquet')
test_df.shape

(62096, 725)

In [29]:
test_df.drop(remove_columns, axis=1, inplace=True)
test_df.shape

(62096, 663)

In [32]:
df.to_parquet('../data/processed/train_features_uncorr.parquet')

In [33]:
test_df.to_parquet('../data/processed/test_features_uncorr.parquet')

In [41]:
fi[fi.feature_name.str.startswith('ai_')].head(20)

Unnamed: 0,feature_name,weight,w_sum
1,ai_pending_booking_request_pending,0.104108,0.450833
18,ai_nan_message_post_message_post,0.005562,0.808378
19,ai_confirm_email_click_confirm_email_link,0.005545,0.813923
22,ai_update_submit_update_listing,0.004765,0.829002
23,ai_dashboard_view_dashboard,0.004585,0.833586
27,ai_show_nan_nan,0.003879,0.850703
29,ai_header_userpic_data_header_userpic,0.003653,0.85811
38,ai_manage_listing_view_manage_listing,0.002817,0.885617
42,ai_ajax_refresh_subtotal_click_change_trip_cha...,0.002489,0.896126
43,ai_requested_view_p5,0.002484,0.89861


In [2]:
users = pd.read_parquet('../data/processed/users.parquet')
users.shape

(275547, 16)

time: 377 ms (started: 2021-09-07 11:42:12 +00:00)


In [3]:
sessions = pd.read_parquet('../data/processed/sessions.parquet')
sessions.shape

(10533241, 7)

time: 2.64 s (started: 2021-09-07 11:42:13 +00:00)


In [5]:
df = pd.read_parquet('../data/processed/train_features.parquet')
df.shape

(213451, 725)

In [7]:
df.country_destination.value_counts(normalize=True)

NDF      0.583473
US       0.292226
other    0.047290
FR       0.023532
IT       0.013282
GB       0.010888
ES       0.010536
CA       0.006690
DE       0.004971
NL       0.003570
AU       0.002525
PT       0.001017
Name: country_destination, dtype: float64

In [None]:
df.country_destination.value_counts(dropna=False)

In [None]:
(NDF, US, other) - (Universum - (NDF, US, other))

In [4]:
df.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,action_info
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0,lookup_nan_nan
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,search_results_click_view_search_results
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0,lookup_nan_nan
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,search_results_click_view_search_results
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0,lookup_nan_nan


In [None]:
'action_info' in list

In [3]:
df.country_destination.value_counts()

AttributeError: 'DataFrame' object has no attribute 'country_destination'

In [13]:
other_countries_ai = df[~df.country_destination.isin({'NDF', 'US', 'other'})].action_info.tolist()

AttributeError: 'DataFrame' object has no attribute 'action_info'