In [12]:
import numpy as np
import pandas as pd

In [14]:
train_df = pd.read_csv('credit_scoring_train.csv', index_col='client_id')

In [16]:
test_df = pd.read_csv('credit_scoring_test.csv', index_col='client_id')

In [17]:
y = train_df['Delinquent90']

In [18]:
y

client_id
0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       0
29       0
        ..
74970    0
74971    0
74972    0
74973    0
74974    0
74975    0
74976    0
74977    0
74978    0
74979    0
74980    0
74981    0
74982    0
74983    0
74984    0
74985    0
74986    0
74987    0
74988    0
74989    0
74990    0
74991    0
74992    0
74993    0
74994    0
74995    0
74996    0
74997    0
74998    0
74999    0
Name: Delinquent90, Length: 75000, dtype: int64

In [19]:
train_df.drop('Delinquent90', axis=1, inplace=True)

In [21]:
train_df.head()

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028
1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679
2,2206.731199,55.5,21,1,,1,0,,0.348227
3,886.132793,55.3,3,0,0.0,0,0,,0.97193
4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435


In [22]:
train_df['NumDependents'].fillna(train_df['NumDependents'].median(), inplace=True)
train_df['Income'].fillna(train_df['Income'].median(), inplace=True)
test_df['NumDependents'].fillna(test_df['NumDependents'].median(), inplace=True)
test_df['Income'].fillna(test_df['Income'].median(), inplace=True)

In [24]:
from sklearn.tree import DecisionTreeClassifier

In [25]:
first_tree = DecisionTreeClassifier(max_depth=3, random_state=17)
first_tree.fit(train_df, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=17, splitter='best')

In [26]:
first_tree_pred = first_tree.predict(test_df)

In [27]:
def write_to_submission_file(predicted_labels, out_file,
                             target='Delinquent90', index_label="client_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(75000, 
                                                  predicted_labels.shape[0] + 75000),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [28]:
write_to_submission_file(first_tree_pred, 'credit_scoring_first_tree.csv')

In [29]:
first_tree_pred_probability = first_tree.predict_proba(test_df)[:, 1]

In [30]:
write_to_submission_file(first_tree_pred_probability, 'credit_scoring_first_tree.csv')

In [32]:
from sklearn.model_selection import GridSearchCV

In [34]:
%%time
tree_params = {'max_depth': list(range(3, 8)), 
               'min_samples_leaf': list(range(5, 13))}

locally_best_tree = GridSearchCV(first_tree, tree_params, cv = 5, n_jobs=-1)
locally_best_tree.fit(train_df, y)

CPU times: user 2.15 s, sys: 120 ms, total: 2.27 s
Wall time: 27.9 s


In [35]:
locally_best_tree.best_params_, round(locally_best_tree.best_score_, 3)

({'max_depth': 5, 'min_samples_leaf': 11}, 0.934)

In [36]:
tuned_tree_pred_probability = locally_best_tree.predict(test_df)
tuned_tree_pred_probability

array([0, 0, 0, ..., 0, 0, 0])

In [37]:
write_to_submission_file(tuned_tree_pred_probability, 'grid_search_tree.csv')

In [39]:
!ls -l *.csv

-rw-rw-r-- 1 kamilla kamilla  2096597 Мау 17 18:40 adult_test.csv
-rw-rw-r-- 1 kamilla kamilla  3835686 Мау 17 18:40 adult_train.csv
-rw-rw-r-- 1 kamilla kamilla    31107 Мау 17 18:40 beauty.csv
-rw-r--r-- 1 kamilla kamilla  2300412 Шіл  3 03:42 benchmark1.csv
-rw-r--r-- 1 kamilla kamilla  2046298 Шіл  5 17:27 credit_scoring_first_tree.csv
-rw-rw-r-- 1 kamilla kamilla  1812928 Мау 17 18:40 credit_scoring_sample.csv
-rw-rw-r-- 1 kamilla kamilla  3859157 Мау 17 18:40 credit_scoring_test.csv
-rw-rw-r-- 1 kamilla kamilla  3948401 Мау 17 18:40 credit_scoring_train.csv
-rw-rw-r-- 1 kamilla kamilla 88582391 Жел  1  2018 crypto-markets.csv
-rw-r--r-- 1 kamilla kamilla  2997180 Мау 21 09:44 diam.csv
-rw-r--r-- 1 kamilla kamilla   650023 Шіл  5 17:30 grid_search_tree.csv
-rw-rw-r-- 1 kamilla kamilla   813469 Шіл  5 10:28 howpop_test.csv
-rw-rw-r-- 1 kamilla kamilla 34032627 Шіл  5 10:29 howpop_train.csv
-rw-r--r-- 1 kamilla kamilla   451405 Мау 21 09:50 h_test.csv
-rw-r--r-- 1 kami

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
first_forest = RandomForestClassifier(random_state=17)
first_forest.fit(train_df, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=17, verbose=0,
                       warm_start=False)

In [42]:
first_forest_pred = first_forest.predict(test_df)
first_forest_pred

array([0, 1, 0, ..., 0, 0, 0])

In [43]:
write_to_submission_file(first_forest_pred, 'forest_w_maxdepth.csv')

In [44]:
forest_params = {'max_features': np.linspace(.3, 1, 7)}
locally_best_forest = GridSearchCV(first_forest,forest_params, n_jobs=-1, cv = 5)
locally_best_forest.fit(train_df, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=10, n_jobs=None,
                                              oob_score=False, random_state=17,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_fe

In [45]:
locally_best_forest.best_params_, round(locally_best_forest.best_score_, 3)

({'max_features': 0.3}, 0.932)

In [46]:
tuned_forest_pred = locally_best_forest.predict(test_df)

In [47]:
write_to_submission_file(tuned_forest_pred, 'tuned_forest_pred.csv')

In [48]:
!ls -l *.csv

-rw-rw-r-- 1 kamilla kamilla  2096597 Мау 17 18:40 adult_test.csv
-rw-rw-r-- 1 kamilla kamilla  3835686 Мау 17 18:40 adult_train.csv
-rw-rw-r-- 1 kamilla kamilla    31107 Мау 17 18:40 beauty.csv
-rw-r--r-- 1 kamilla kamilla  2300412 Шіл  3 03:42 benchmark1.csv
-rw-r--r-- 1 kamilla kamilla  2046298 Шіл  5 17:27 credit_scoring_first_tree.csv
-rw-rw-r-- 1 kamilla kamilla  1812928 Мау 17 18:40 credit_scoring_sample.csv
-rw-rw-r-- 1 kamilla kamilla  3859157 Мау 17 18:40 credit_scoring_test.csv
-rw-rw-r-- 1 kamilla kamilla  3948401 Мау 17 18:40 credit_scoring_train.csv
-rw-rw-r-- 1 kamilla kamilla 88582391 Жел  1  2018 crypto-markets.csv
-rw-r--r-- 1 kamilla kamilla  2997180 Мау 21 09:44 diam.csv
-rw-r--r-- 1 kamilla kamilla   650023 Шіл  5 17:31 forest_w_maxdepth.csv
-rw-r--r-- 1 kamilla kamilla   650023 Шіл  5 17:30 grid_search_tree.csv
-rw-rw-r-- 1 kamilla kamilla   813469 Шіл  5 10:28 howpop_test.csv
-rw-rw-r-- 1 kamilla kamilla 34032627 Шіл  5 10:29 howpop_train.csv
-rw-r-

In [49]:
pd.DataFrame(locally_best_forest.best_estimator_.feature_importances_)

Unnamed: 0,0
0,0.166343
1,0.161029
2,0.093172
3,0.033461
4,0.041417
5,0.058877
6,0.062481
7,0.150709
8,0.232511


In [50]:
final_forest = RandomForestClassifier(n_estimators=300, max_features = 3, random_state=42,n_jobs=-1, oob_score=True) # Ваш код здесь
final_forest.fit(train_df, y)
final_forest_pred = final_forest.predict_proba(test_df)[:, 1]
write_to_submission_file(final_forest_pred, 'credit_scoring_final_forest.csv')

In [51]:
!ls -l *.csv

-rw-rw-r-- 1 kamilla kamilla  2096597 Мау 17 18:40 adult_test.csv
-rw-rw-r-- 1 kamilla kamilla  3835686 Мау 17 18:40 adult_train.csv
-rw-rw-r-- 1 kamilla kamilla    31107 Мау 17 18:40 beauty.csv
-rw-r--r-- 1 kamilla kamilla  2300412 Шіл  3 03:42 benchmark1.csv
-rw-r--r-- 1 kamilla kamilla  1575121 Шіл  5 17:34 credit_scoring_final_forest.csv
-rw-r--r-- 1 kamilla kamilla  2046298 Шіл  5 17:27 credit_scoring_first_tree.csv
-rw-rw-r-- 1 kamilla kamilla  1812928 Мау 17 18:40 credit_scoring_sample.csv
-rw-rw-r-- 1 kamilla kamilla  3859157 Мау 17 18:40 credit_scoring_test.csv
-rw-rw-r-- 1 kamilla kamilla  3948401 Мау 17 18:40 credit_scoring_train.csv
-rw-rw-r-- 1 kamilla kamilla 88582391 Жел  1  2018 crypto-markets.csv
-rw-r--r-- 1 kamilla kamilla  2997180 Мау 21 09:44 diam.csv
-rw-r--r-- 1 kamilla kamilla   650023 Шіл  5 17:31 forest_w_maxdepth.csv
-rw-r--r-- 1 kamilla kamilla   650023 Шіл  5 17:30 grid_search_tree.csv
-rw-rw-r-- 1 kamilla kamilla   813469 Шіл  5 10:28 howpop_t