In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import f1_score, r2_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight

In [3]:
import xgboost as xgb, lightgbm as lgbm, catboost


def feature_predict(db_features, db_target):
    X_train, X_test, y_train, y_test = train_test_split(db_features,
                                                        db_target,  test_size=0.3, random_state=42)
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_predict = lr.predict(X_test)
    yearsjob_predict = feature_predict(db_features,db_target)
    years_predict = (round(i) for i in yearsjob_predict)
    r2_score(y_test, years_predict)
    return r2_score

In [4]:
#download f1_score

In [5]:
bank_db = pd.read_csv('D:/AI/_Python/Python_for_DS/course_project_train.csv')

In [6]:
bank_db_test = pd.read_csv('D:/AI/_Python/Python_for_DS/course_project_test.csv')

In [7]:
bank_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
Home Ownership                  7500 non-null object
Annual Income                   5943 non-null float64
Years in current job            7129 non-null object
Tax Liens                       7500 non-null float64
Number of Open Accounts         7500 non-null float64
Years of Credit History         7500 non-null float64
Maximum Open Credit             7500 non-null float64
Number of Credit Problems       7500 non-null float64
Months since last delinquent    3419 non-null float64
Bankruptcies                    7486 non-null float64
Purpose                         7500 non-null object
Term                            7500 non-null object
Current Loan Amount             7500 non-null float64
Current Credit Balance          7500 non-null float64
Monthly Debt                    7500 non-null float64
Credit Score                    5943 non-null float64
Credit Default                  7

**выбросы**

In [8]:
bank_db.loc[bank_db['Current Loan Amount'].between(50000000,100000000),
            ['Current Loan Amount']] = bank_db['Current Loan Amount'].median() 

In [9]:
bank_db_test.loc[bank_db_test['Current Loan Amount'].between(50000000,100000000),
                 ['Current Loan Amount']] = bank_db_test['Current Loan Amount'].median() 


In [10]:
bank_db.loc[bank_db['Maximum Open Credit'] >50000000, 'Maximum Open Credit'] = 50000000
bank_db_test.loc[bank_db['Maximum Open Credit'] >50000000, 'Maximum Open Credit'] = 50000000

In [11]:
bank_db.loc[bank_db['Maximum Open Credit'] >40000000, :] 

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
44,Have Mortgage,1334256.0,10+ years,0.0,17.0,15.4,50000000.0,1.0,35.0,0.0,take a trip,Short Term,33110.0,656450.0,34357.0,729.0,0
617,Home Mortgage,,3 years,0.0,13.0,27.5,50000000.0,0.0,,0.0,debt consolidation,Short Term,529892.0,1126814.0,5492.0,,0
2023,Rent,1342825.0,10+ years,0.0,12.0,16.3,50000000.0,0.0,,0.0,other,Short Term,166826.0,944547.0,9176.0,749.0,0
2617,Home Mortgage,3217802.0,10+ years,0.0,12.0,35.5,50000000.0,0.0,,0.0,other,Short Term,50116.0,1834450.0,20272.0,750.0,0
2763,Home Mortgage,3377440.0,10+ years,0.0,11.0,23.2,40923894.0,0.0,59.0,0.0,home improvements,Short Term,309573.0,1908550.0,5770.0,743.0,0


In [12]:
bank_db['Term'].replace({'Short Term':1, 'Long Term':0}, inplace=True)
bank_db_test['Term'].replace({'Short Term':1, 'Long Term':0}, inplace=True)

In [13]:
bank_db['Home Ownership'].replace({'Home Mortgage':0, 'Rent':1,
                                   'Own Home':2,'Have Mortgage':3 }, inplace=True)
bank_db_test['Home Ownership'].replace({'Home Mortgage':0, 'Rent':1,
                                   'Own Home':2,'Have Mortgage':3 }, inplace=True)

**filling in nulls**

In [14]:
bank_db_test['Years in current job'].mode()

0    10+ years
dtype: object

In [15]:
# Years in current job
bank_db['Years in current job'] = bank_db['Years in current job'].fillna('10+ years')
bank_db_test['Years in current job'] = bank_db_test['Years in current job'].fillna('10+ years')

In [16]:
bank_db['Years in current job'].replace({'< 1 year': '0'}, inplace=True)
bank_db_test['Years in current job'].replace({'< 1 year': '0'}, inplace=True)

In [17]:
#extracting years to int
bank_db.loc[bank_db['Years in current job'].notnull()
            , ['Years in current job']] = bank_db['Years in current job'].str.findall('([0-9]+)')

for i in range(len(bank_db['Years in current job'])):
    if type(bank_db.loc[i, 'Years in current job'])== list:
        bank_db.loc[i, 'Years in current job'] = ''.join(bank_db.loc[(i), 'Years in current job'])

In [18]:
bank_db_test.loc[bank_db_test['Years in current job'].notnull()
            , ['Years in current job']] = bank_db_test['Years in current job'].str.findall('([0-9]+)')

for i in range(len(bank_db_test['Years in current job'])):
    if type(bank_db_test.loc[i, 'Years in current job'])== list:
        bank_db_test.loc[i, 'Years in current job'] = ''.join(bank_db_test.loc[(i), 'Years in current job'])

In [19]:
bank_db['Years in current job'].astype(int)
bank_db_test['Years in current job'].astype(int)

0        4
1        1
2        3
3       10
4        6
        ..
2495    10
2496     2
2497     2
2498    10
2499    10
Name: Years in current job, Length: 2500, dtype: int32

In [20]:
#Months since last delinquent
bank_db['Months since last delinquent'] = bank_db['Months since last delinquent'].fillna(-1)
bank_db_test['Months since last delinquent'] = bank_db_test['Months since last delinquent'].fillna(-1)

In [21]:
# Bankruptcies
bank_db['Bankruptcies'] = bank_db['Bankruptcies'].fillna(0)
bank_db_test['Bankruptcies'] = bank_db_test['Bankruptcies'].fillna(0)

In [22]:
# Annual income
numeric_features_nonnull = ['Years in current job', 'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit', 'Number of Credit Problems',
                   'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt']

**trying prediction for Annual income nulls**

In [23]:
db_features = bank_db.loc[bank_db['Annual Income'].notnull(), numeric_features_nonnull]
db_target = pd.DataFrame(bank_db.loc[bank_db['Annual Income'].notnull(), 'Annual Income'])

In [24]:
X_train, X_test, y_train, y_test = train_test_split(db_features,
                                                        db_target,  test_size=0.3, random_state=42)

In [25]:
rfr_bank = rfr(max_depth=10, max_features=3, n_estimators=250, random_state=100)
#rfr_bank.fit(X_train_scaled, y_train.values.ravel())
rfr_bank.fit(X_train, y_train.values.ravel())

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features=3, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=250, n_jobs=None, oob_score=False,
                      random_state=100, verbose=0, warm_start=False)

In [26]:
#y_pred1 = rfr_bank.predict(X_test_scaled)
y_pred1 = rfr_bank.predict(X_test)

In [27]:
r2_score(y_test, y_pred1)

0.4021366183531164

In [28]:
#Annual Income nulls with median
bank_db['Annual Income'] = bank_db['Annual Income'].fillna(bank_db['Annual Income'].median())
bank_db_test['Annual Income'] = bank_db_test['Annual Income'].fillna(bank_db_test['Annual Income'].median())

**trying prediction for Credit Score with RFC**

In [29]:
#forming binary credit score in train and test
bank_db['Credit Score Bin'] = np.nan
bank_db.loc[(bank_db['Credit Score'] > 670) | (bank_db['Credit Score'] < 1000), 'Credit Score Bin'] = 0
bank_db.loc[(bank_db['Credit Score'] <= 670) | (bank_db['Credit Score'] >= 1000), 'Credit Score Bin'] = 1

bank_db_test['Credit Score Bin'] = np.nan
bank_db_test.loc[(bank_db_test['Credit Score'] > 670) | (bank_db_test['Credit Score'] < 1000), 'Credit Score Bin'] = 0
bank_db_test.loc[(bank_db_test['Credit Score'] <= 670) | (bank_db_test['Credit Score'] >= 1000), 'Credit Score Bin'] = 1

In [30]:
numeric_features_nonnull_2 = ['Annual Income', 'Number of Open Accounts', 'Years of Credit History', 
                              'Maximum Open Credit', 'Number of Credit Problems',
                              'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt']

In [31]:
#Credit Score in binary form as target
db_features_crsc = bank_db.loc[bank_db['Credit Score Bin'].notnull(), numeric_features_nonnull_2]
db_target_crsc = pd.DataFrame(bank_db.loc[bank_db['Credit Score Bin'].notnull(), 'Credit Score Bin'])

In [32]:
db_target_crsc['Credit Score Bin'].value_counts()

0.0    5176
1.0     767
Name: Credit Score Bin, dtype: int64

In [33]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(db_features_crsc, db_target_crsc, 
                                                    test_size=0.3, random_state=42)

**balancing classes in Credit Score Bin**

In [34]:
concat_to_balance = pd.concat([X_train_2, y_train_2], axis=1)

In [35]:
minor_class = concat_to_balance[concat_to_balance['Credit Score Bin'] == 1]
major_class = concat_to_balance[concat_to_balance['Credit Score Bin'] == 0]

In [36]:
len(minor_class)

558

In [37]:
len(major_class)

3602

In [38]:
#oversampling
minor_class_balanced = minor_class.copy()
for i in range(int(len(major_class)/len(minor_class)) * 4-1):
    sample = minor_class.sample(frac=0.25)
    minor_class_balanced = minor_class_balanced.append(sample, ignore_index=True)


In [39]:
df_oversampling = pd.concat([major_class, minor_class_balanced], axis=0, ignore_index=True, sort=False)
df_oversampling['Credit Score Bin'].value_counts()

1.0    3778
0.0    3602
Name: Credit Score Bin, dtype: int64

In [40]:
df_oversampling_x = df_oversampling.drop(columns='Credit Score Bin')
df_oversampling_y = df_oversampling['Credit Score Bin']

In [41]:
#RFC

In [42]:
rfc_bank = rfc(max_depth=10, max_features=3, n_estimators=250, random_state=100)
rfc_bank.fit(df_oversampling_x, df_oversampling_y.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=3,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [43]:
y_pred2 = rfc_bank.predict(X_test_2)

In [44]:
precision_score(y_test_2, y_pred2)

0.190625

In [45]:
f1_score(y_test_2, y_pred2)

0.23062381852551986

In [46]:
# filling in with a mode
bank_db['Credit Score Bin'] = bank_db['Credit Score Bin'].fillna(0)
bank_db_test['Credit Score Bin'] = bank_db_test['Credit Score Bin'].fillna(0)

In [47]:
bank_db['Credit Score Bin'] = bank_db['Credit Score Bin'].astype(int)
bank_db_test['Credit Score Bin'] = bank_db_test['Credit Score Bin'].astype(int)

In [48]:
bank_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 18 columns):
Home Ownership                  7500 non-null int64
Annual Income                   7500 non-null float64
Years in current job            7500 non-null object
Tax Liens                       7500 non-null float64
Number of Open Accounts         7500 non-null float64
Years of Credit History         7500 non-null float64
Maximum Open Credit             7500 non-null float64
Number of Credit Problems       7500 non-null float64
Months since last delinquent    7500 non-null float64
Bankruptcies                    7500 non-null float64
Purpose                         7500 non-null object
Term                            7500 non-null int64
Current Loan Amount             7500 non-null float64
Current Credit Balance          7500 non-null float64
Monthly Debt                    7500 non-null float64
Credit Score                    5943 non-null float64
Credit Default                  750

In [49]:
features_for_catboost_num = ['Annual Income', 'Years in current job', 'Number of Open Accounts',
                             'Years of Credit History', 'Maximum Open Credit',
                             'Number of Credit Problems', 'Months since last delinquent', 'Bankruptcies',
                             'Current Loan Amount', 'Current Credit Balance', 'Monthly Debt']
features_for_catboost_cat = ['Home Ownership','Term', 'Credit Score Bin']

In [50]:
df_for_train = bank_db.drop(columns=['Tax Liens','Purpose','Credit Score'])
df_for_test = bank_db_test.drop(columns=['Tax Liens','Purpose','Credit Score'])

In [51]:
compute_class_weight('balanced', np.array([0, 1]), df_for_train['Credit Default'])

array([0.69612029, 1.77472788])

**train and valid**

In [52]:
X = df_for_train.drop(columns='Credit Default')
y = df_for_train['Credit Default']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, shuffle=True, test_size=0.3, random_state=21)

**catboost**

In [53]:
model_catboost = catboost.CatBoostClassifier(max_depth=4, class_weights=[0.7, 1.77], silent=True, random_state=21, cat_features=features_for_catboost_cat)
model_catboost.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x2817857bcc0>

In [54]:
y_train_pred = model_catboost.predict(X_train)
y_valid_predict = model_catboost.predict(X_valid)

In [55]:
 print('TRAIN\n\n' + classification_report(y_train, y_train_pred))

TRAIN

              precision    recall  f1-score   support

           0       0.90      0.81      0.85      3771
           1       0.61      0.78      0.69      1479

    accuracy                           0.80      5250
   macro avg       0.76      0.79      0.77      5250
weighted avg       0.82      0.80      0.81      5250



In [56]:
 print('VALID\n\n' + classification_report(y_valid, y_valid_predict))

VALID

              precision    recall  f1-score   support

           0       0.83      0.74      0.78      1616
           1       0.48      0.61      0.54       634

    accuracy                           0.70      2250
   macro avg       0.66      0.68      0.66      2250
weighted avg       0.73      0.70      0.71      2250



In [57]:
y_test_predict = model_catboost.predict(df_for_test)

In [58]:
bank_db_test['Credit Default'] = y_test_predict

In [59]:
bank_db_test.loc[:, 'Credit Default'].to_pickle('D:/AI/_Python/Python_for_DS/bank_predictions.pkl')