In [17]:
import pandas as pd
import numpy as np
import seaborn as sns

from collections import Counter

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier

In [57]:
app_train = pd.read_csv('home-credit-default-risk/application_train.csv')
app_test = pd.read_csv('home-credit-default-risk/application_test.csv')
Y = app_train['TARGET']
# app_train.drop(columns=['TARGET'], inplace=True)

In [3]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
print('train: ', app_train.shape)
print('test: ', app_test.shape)

train:  (307511, 121)
test:  (48744, 121)


In [58]:
app_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

16 категориальных признаков

In [59]:
categorical_features = app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
# numerical_app_train

In [60]:
def detect_outliers(df, n, features):
    outlier_indices = []
    # итерируемся по фичам
    for col in features:
        # 1-й квартиль (25%)
        q1 = np.percentile(df[col], 25)
        # 3-й квартиль (75%)
        q3 = np.percentile(df[col],75)
        # межквартильный размах (IQR)
        iqr = q3 - q1
        outlier_step = 1.5 * iqr
        # создадим список индексов выбросов для признака col
        outlier_list_col = df[(df[col] < q1 - outlier_step) | (df[col] > q3 + outlier_step )].index
        outlier_indices.extend(outlier_list_col)
    # отбор индексов по количеству выбросов
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    return multiple_outliers

outliers_to_drop = detect_outliers(app_train, 7, app_train.select_dtypes(['float64', 'int64']).columns)

In [61]:
app_train.drop(index=outliers_to_drop, inplace=True)

In [62]:
# Функция для подсчета недостающих столбцов
def missing_values_table(df):
    
        # Всего недостает
        mis_val = df.isnull().sum()
        
        # Процент недостающих данных
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Таблица с результатами
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Переименование столбцов
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Сортировка про процентажу
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Инфо
        print ("В выбранном датафрейме " + str(df.shape[1]) + " столбцов.\n"      
            "Всего " + str(mis_val_table_ren_columns.shape[0]) +
              " столбцов с неполными данными.")
        
        # Возврат таблицы с данными
        return mis_val_table_ren_columns

In [63]:
missing_values = missing_values_table(app_train)


В выбранном датафрейме 122 столбцов.
Всего 67 столбцов с неполными данными.


In [64]:
app_train[missing_values.index].dtypes.value_counts()

float64    61
object      6
dtype: int64

В тренеровочном датасете 61 стобец числовых переменных с пропусками
6 столбцов с пропусками по категориальным переменным

In [65]:
app_train[missing_values.index].select_dtypes('object').isnull().sum()

FONDKAPREMONT_MODE     209094
WALLSMATERIAL_MODE     155462
HOUSETYPE_MODE         153430
EMERGENCYSTATE_MODE    144936
OCCUPATION_TYPE         95911
NAME_TYPE_SUITE          1270
dtype: int64

In [66]:
# app_train[missing_values.index].select_dtypes('float64').isnull().sum()

In [67]:
app_train.shape, app_test.shape

((305512, 122), (48744, 121))

In [45]:
# dataset = pd.concat([app_train, app_test])

Признаки с 2 различными значениями закодируем с помощью LabelEncoder.
Остальные признаки закодируем с помощью OneHotEncoder.

In [68]:
le = LabelEncoder()

for col in categorical_features[categorical_features==2].index:
    app_train[col] = le.fit_transform(app_train[col].astype('str'))
    app_test[col] = le.fit_transform(app_test[col].astype('str'))
#     dataset[col] = le.fit_transform(dataset[col].astype('str'))

In [69]:
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)
# dataset = pd.get_dummies(dataset)

In [70]:
# Выравнивание - сохранятся только столбцы. имеющиеся в обоих датафреймах
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
# app_train = dataset.loc[:app_train.shape[0], :]
# app_test = dataset.loc[app_train.shape[0]:, :]
print('Формат тренировочной выборки: ', app_train.shape)
print('Формат тестовой выборки: ', app_test.shape)


Формат тренировочной выборки:  (305512, 238)
Формат тестовой выборки:  (48744, 238)


In [71]:
# imputer = KNNImputer()
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

In [72]:
imputer.fit(app_train)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)

In [73]:
train = imputer.transform(app_train)
test = imputer.transform(app_test)

In [74]:
scaler.fit(train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [75]:
train = scaler.transform(train)
test = scaler.transform(test)

In [80]:
Y.drop(index=outliers_to_drop, inplace=True)

KeyError: '[   323    416    536 ... 306710 306719 307002] not found in axis'

In [81]:
from sklearn.linear_model import LogisticRegression

# Создаем модель
log_reg = LogisticRegression(C = 0.0001)

# Тренируем модель
log_reg.fit(train, Y)

log_reg_pred = log_reg.predict_proba(test)[:, 1]

submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = log_reg_pred

submit.to_csv('log_reg_baseline.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.054079
1,100005,0.209407
2,100013,0.059725
3,100028,0.049038
4,100038,0.138323


In [26]:
from sklearn.ensemble import RandomForestClassifier

# Создадим классификатор
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50)

# Тренировка на тернировочных данных
random_forest.fit(train, Y)

# Предсказание на тестовых данных
predictions = random_forest.predict_proba(test)[:, 1]

# Создание датафрейма для загрузки
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = predictions

# Сохранение
submit.to_csv('random_forest_baseline.csv', index = False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost = GradientBoostingClassifier(random_state=50)

grad_boost.fit(train, Y)

# Предсказание на тестовых данных
predictions = grad_boost.predict_proba(test)[:, 1]

# Создание датафрейма для загрузки
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = predictions

# Сохранение
submit.to_csv('grad_boosting_baseline.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [33]:
lgbm = LGBMClassifier(random_state=50)

lgbm.fit(train, Y)

predictions = lgbm.predict_proba(test)[:, 1]

submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = predictions

# Сохранение
submit.to_csv('lgbm_baseline.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)