In [5]:
import pandas as pd
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Passing None switches off pandas' chained-assignment check (no warning, no error).
pd.set_option('mode.chained_assignment', None)
# Display up to 100 rows before pandas truncates the output.
pd.options.display.max_rows = 100

In [6]:
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_percentage_error


def calc_all_metrics(data, max_account=25e3):
    """Simulate issuing loans in priority order and report business and model metrics.

    Rows are sorted by ``__priority`` (highest first). Each row requests a loan
    amount ("debt") derived from the predicted and documented prices; loans are
    issued to rows with a positive amount for as long as the running total of
    requested amounts stays within ``max_account``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``__priority``, ``__churn``, ``__churn_prob``,
        ``__price_doc`` and ``__price_predict``. The frame is not modified.
    max_account : float, default 25e3
        Total budget available for issuing loans.

    Returns
    -------
    dict
        Keys ``total_profit``, ``%profit_issued``, ``%issued_loans``,
        ``issued_loans``, ``count_good``, ``count_bad``, ``%bad``,
        ``churn_auc``, ``price_nmsle`` and ``price_mape``. The two price
        metrics are negated errors, so a larger value is better.
    """

    def is_credit_issued(x):
        """Return the loan amount requested by one row (0. means no loan)."""
        ratio = x['__price_predict'] / x['__price_doc']
        if x['__priority'] <= 0:
            return 0.
        if 0.9 < ratio < 1.:
            # Prediction slightly below the documented price: lend the prediction.
            return x['__price_predict']
        if 1. <= ratio < 1.1:
            # Prediction slightly above the documented price: lend the documented price.
            return x['__price_doc']
        # Prediction is off by 10% or more in either direction: no loan.
        return 0.

    def calc_profit(x):
        """Return the profit (negative for a loss) made on one row."""
        if x['is_credit'] == 0.:
            return 0.
        debt = x['debt']
        if x['__churn'] == 1:
            # A churned borrower costs twice the issued amount.
            return - debt * 2.
        # Margin depends on loan size: 30% below 5, 40% below 9, 50% from 9 upwards.
        if debt < 5:
            return debt * 0.3
        if debt < 9:
            return debt * 0.4
        return debt * 0.5

    # Work on a sorted private copy so the caller's frame is left untouched.
    s = (
        data
        [['__priority', '__churn', '__churn_prob', '__price_doc', '__price_predict']]
        .sort_values('__priority', ascending=False)
        .copy(True)
    )

    s['debt'] = s.apply(is_credit_issued, axis=1)
    # Running total of requested amounts in priority order.
    s['debt_cum'] = s['debt'].cumsum()
    s['is_credit'] = 0
    s.loc[(s['debt'] > 0) & (s['debt_cum'] <= max_account), 'is_credit'] = 1
    s['profit'] = s.apply(calc_profit, axis=1)

    # Rows that actually received a loan; reused by the aggregates below.
    issued = s[s['is_credit'] == 1]

    total_profit = round(s['profit'].sum(), 2)
    good_credits_count = s['is_credit'].sum()
    good_credits_debt = round(issued['debt'].sum(), 2)
    bad_credits_count = issued['__churn'].sum()

    return {
        'total_profit': total_profit,
        '%profit_issued': round(total_profit / good_credits_debt * 100, 1),
        '%issued_loans': round(good_credits_debt / max_account * 100, 2),
        'issued_loans': good_credits_debt,
        'count_good': good_credits_count,
        'count_bad': bad_credits_count,
        # NOTE(review): count_good is the number of ALL issued loans (churned ones
        # included), so this denominator counts bad loans twice. Left unchanged in
        # case it mirrors the official scoring formula - confirm before altering.
        '%bad': round(bad_credits_count / (good_credits_count + bad_credits_count) * 100., 1),
        'churn_auc': round(roc_auc_score(y_true=s['__churn'], y_score=s['__churn_prob']), 3),
        'price_nmsle': round(-mean_squared_log_error(y_true=s['__price_doc'], y_pred=s['__price_predict']), 3),
        'price_mape': round(-mean_absolute_percentage_error(y_true=s['__price_doc'], y_pred=s['__price_predict']), 3),
    }


In [7]:
# Read the two input tables from the working directory and report their shapes.
train_raw = pd.read_csv('raw_train.csv')
submission = pd.read_csv('raw_test.csv')

print(train_raw.shape, submission.shape)

(20483, 82) (9988, 86)


In [8]:
# Explore the dataset: list every column of the training table.
print(train_raw.columns.tolist())


['max_floor', 'state', 'marital_status', 'big_market_raion', 'total_revolving_bal', 'market_count_1500', 'leisure_count_3000', 'total_ct_chng_q4_q1', 'water_1line', 'railroad_station_walk_km', 'culture_objects_top_25', 'contacts_count_12_mon', '0_17_all', 'trc_count_2000', 'product_type', 'build_count_wood', 'credit_limit', 'total_trans_ct', 'leisure_count_5000', 'life_sq', 'cafe_count_1000_price_1000', 'mkad_km', 'school_education_centers_top_20_raion', 'big_road1_1line', 'card_category', 'avg_utilization_ratio', 'public_transport_station_min_walk', 'income_category', 'customer_age', 'thermal_power_plant_raion', 'radiation_raion', 'detention_facility_km', 'sport_count_2000', 'cafe_sum_1000_min_price_avg', 'total_amt_chng_q4_q1', 'ecology', 'metro_km_walk', 'office_sqm_5000', 'gender', 'oil_chemistry_raion', 'nuclear_reactor_raion', 'total_trans_amt', 'months_inactive_12_mon', 'cafe_sum_1500_min_price_avg', 'railroad_1line', 'floor', 'num_room', 'timestamp', 'education_level', 'months_

In [9]:
# Explore the dataset: list every column of the submission table.
print(submission.columns.tolist())

['max_floor', 'state', 'marital_status', 'big_market_raion', 'total_revolving_bal', 'market_count_1500', 'leisure_count_3000', 'total_ct_chng_q4_q1', 'water_1line', 'railroad_station_walk_km', 'culture_objects_top_25', 'contacts_count_12_mon', '0_17_all', 'trc_count_2000', 'product_type', 'build_count_wood', 'credit_limit', 'total_trans_ct', 'leisure_count_5000', 'life_sq', 'cafe_count_1000_price_1000', 'mkad_km', 'school_education_centers_top_20_raion', 'big_road1_1line', 'card_category', 'avg_utilization_ratio', 'public_transport_station_min_walk', 'income_category', 'customer_age', 'thermal_power_plant_raion', 'radiation_raion', 'detention_facility_km', 'sport_count_2000', 'cafe_sum_1000_min_price_avg', 'total_amt_chng_q4_q1', 'ecology', 'metro_km_walk', 'office_sqm_5000', 'gender', 'oil_chemistry_raion', 'nuclear_reactor_raion', 'total_trans_amt', 'months_inactive_12_mon', 'cafe_sum_1500_min_price_avg', 'railroad_1line', 'floor', 'num_room', 'timestamp', 'education_level', 'months_

# 1. Fill in your top features

In [82]:
# Your top features for the regression model.
reg_features = [
    'full_sq',
    'sport_count_2000',
    'office_sqm_5000',
    'mkad_km',
    'detention_facility_km',
    'leisure_count_3000',
    'state',
    'leisure_count_5000',
]

# Your top features for the classification model.
clf_features = [
    'contacts_count_12_mon',
    'total_ct_chng_q4_q1',
    'total_trans_ct',
    'total_relationship_count',
    'months_inactive_12_mon',
    'total_revolving_bal',
    'total_amt_chng_q4_q1',
    'avg_utilization_ratio',
    'avg_open_to_buy',
    'credit_limit',
    'f_total_ct_chng_q4_q1',
]

# Show how many features each model uses, followed by the names.
print(len(reg_features), reg_features)
print(len(clf_features), clf_features)

8 ['full_sq', 'sport_count_2000', 'office_sqm_5000', 'mkad_km', 'detention_facility_km', 'leisure_count_3000', 'state', 'leisure_count_5000']
11 ['contacts_count_12_mon', 'total_ct_chng_q4_q1', 'total_trans_ct', 'total_relationship_count', 'months_inactive_12_mon', 'total_revolving_bal', 'total_amt_chng_q4_q1', 'avg_utilization_ratio', 'avg_open_to_buy', 'credit_limit', 'f_total_ct_chng_q4_q1']


In [83]:

# Model inputs: every missing value is replaced by 0, then the result is deep-copied
# so that X / X_sub are independent of the raw frames.
X = train_raw.fillna(0).copy(True)
X_sub = submission.fillna(0).copy(True)

# 2. Fill in your best models and improve alg1

In [103]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier


def alg1(x, threshold=0.10):
    """
    Compute the loan priority for a single row.

    A row whose predicted churn probability is strictly greater than
    `threshold` gets priority 0, which keeps the share of bad loans low.
    Any other row gets its predicted price, weighted by the probability of
    not churning and by a factor that grows with the predicted price.

    x must provide '__churn_prob' and '__price_predict'.
    """
    churn_prob = x['__churn_prob']
    if churn_prob > threshold:
        return 0

    price = x['__price_predict']
    stay_weight = 1 - churn_prob
    size_factor = 1 + price / 100
    return price * stay_weight * size_factor

# Price regressor and churn classifier, both seeded with random_state=47.
# NOTE(review): max_depth=20 is very deep for boosted trees - confirm it was tuned
# deliberately and is not simply overfitting.
reg = GradientBoostingRegressor(max_depth=20, random_state=47)
clf = GradientBoostingClassifier(n_estimators=200, random_state=47)

In [104]:
from sklearn.model_selection import cross_val_predict


# Rebuild the model inputs with missing values replaced by 0. The snapshot is
# taken here, before the prediction columns are added to train_raw below.
X = train_raw.fillna(0).copy(True)
X_sub = submission.fillna(0).copy(True)

# 5-fold cross-validated price predictions: each row is predicted by a model
# fitted on the other folds.
train_raw['__price_predict'] = cross_val_predict(
    estimator=reg,
    X=X[reg_features],
    y=train_raw['__price_doc'],
    cv=5,
    method='predict')
# Clamp negative price predictions to zero.
train_raw['__price_predict'] = np.maximum(0, train_raw['__price_predict'])

# 5-fold cross-validated churn probabilities. [:, 1] keeps the second column of
# predict_proba (presumably the probability of __churn == 1 - confirm via clf.classes_).
train_raw['__churn_prob'] = cross_val_predict(
    estimator=clf,
    X=X[clf_features],
    y=train_raw['__churn'],
    cv=5,
    method='predict_proba')[:, 1]

# Score every training row with alg1 (applied row by row).
train_raw['__priority'] = train_raw.apply(alg1, axis=1)

In [None]:
##submission

# 3. Calculate quality metrics

In [105]:
from sklearn.model_selection import train_test_split

# Stratification key: the price decile (as text) concatenated with the churn label.
complex_stratify = (
    pd.qcut(train_raw['__price_doc'], q=10).astype(str)
    + train_raw['__churn'].astype(str))

# Split the training rows into two equal halves, stratified on the key above.
test1, test2 = train_test_split(train_raw, test_size=0.5, stratify=complex_stratify, random_state=47)

# Compute the metrics for each half; after the transpose there is one column
# per half and one row per metric.
metrics = pd.DataFrame(data=[calc_all_metrics(test1), calc_all_metrics(test2)], index=['test1', 'test2']).T
print(metrics)


                    test1      test2
total_profit     7925.770   7851.150
%profit_issued     37.400     36.800
%issued_loans      84.720     85.360
issued_loans    21179.510  21339.190
count_good       3357.000   3366.000
count_bad          43.000     48.000
%bad                1.300      1.400
churn_auc           0.967      0.966
price_nmsle        -0.186     -0.197
price_mape         -0.490     -0.523


In [61]:
#submission

In [None]:
# Target thresholds (notes only, not executable code):
# - churn_auc >= 0.92
# - price_nmsle >= -0.142
# - issue_amount >= 22000
# - total_profit >= 16000
# - %bad_loans <= 1.4


//// 

Minimum requirements
- churn_auc >= 0.92
- price_nmsle >= -0.142
- total_profit >= 4000
- %bad_loans <= 1.4



# 4. Prepare submission

In [106]:
# Refit the regressor on all training rows, then predict prices for the
# submission rows (negative predictions are clamped to zero).
reg.fit(X[reg_features], train_raw['__price_doc'])
submission['__price_predict'] = np.maximum(0, reg.predict(X_sub[reg_features]))

# Refit the classifier on all training rows, then take the second
# predict_proba column as the churn probability of each submission row.
clf.fit(X[clf_features], train_raw['__churn'])
submission['__churn_prob'] = clf.predict_proba(X_sub[clf_features])[:, 1]
submission['__priority'] = submission.apply(alg1, axis=1) # score each submission row with alg1

In [107]:
# Keep only the three columns that go into the submission file.
final_score = submission[['__price_predict', '__churn_prob', '__priority']]
print(final_score.columns.tolist())
print(final_score.shape)

# Sanity checks before writing the file: expected shape and column order,
# no negative prices, and more than two distinct churn probabilities
# (presumably to catch hard 0/1 labels submitted instead of probabilities).
assert final_score.shape == (9988, 3)
assert final_score.columns.tolist() == ['__price_predict', '__churn_prob', '__priority']
assert final_score['__price_predict'].min() >= 0.
assert len(final_score['__churn_prob'].unique()) > 2

# Write the file without the index column, then preview the first rows.
final_score.to_csv('5-Albert_Dovlo.csv', index=False)
final_score.head()

['__price_predict', '__churn_prob', '__priority']
(9988, 3)


Unnamed: 0,__price_predict,__churn_prob,__priority
0,9.747781,0.96304,0.0
1,5.256278,0.003944,5.51074
2,5.928853,0.027174,6.109706
3,4.104147,0.002241,4.26301
4,6.7276,0.001202,7.171577


# 5. All your experiments go below