#### Импорт всех нужных библиотек

In [242]:
import lightgbm as lgb
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, precision_score, recall_score
import joblib
import pandas as pd
import numpy as np

#### Загрузка датасета

**Описание датасета:** Страховые компании берут на себя риски клиентов. Управление рисками является очень важным аспектом страховой отрасли. Страховщики учитывают каждый поддающийся количественной оценке фактор для разработки профилей высоких и низких страховых рисков. Страховщики собирают огромное количество информации о страхователях и анализируют данные. В этом проекте нужно будет проанализировать имеющиеся данные и предсказать, применять ли санкции к страхованию или нет.

In [163]:
dataset = pd.read_csv('dataset.csv')

In [164]:
dataset.head()

Unnamed: 0,ID,Age,Agency,Agency Type,Commision (in value),Destination,Distribution Channel,Duration,Gender,Net Sales,Product Name,Claim
0,45341,28,C2B,Airlines,28.13,SINGAPORE,Online,34,F,112.5,Silver Plan,1
1,12958,37,JZI,Airlines,12.95,PHILIPPINES,Online,53,F,37.0,Basic Plan,0
2,18233,27,EPX,Travel Agency,0.0,UNITED STATES,Online,28,,13.0,Cancellation Plan,0
3,31742,36,EPX,Travel Agency,0.0,SAUDI ARABIA,Online,1,,34.0,Cancellation Plan,0
4,14381,26,CWT,Travel Agency,23.76,THAILAND,Online,33,,39.6,Rental Vehicle Excess Insurance,0


#### Предобработка данных

In [165]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62288 entries, 0 to 62287
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ID                    62288 non-null  int64  
 1   Age                   62288 non-null  int64  
 2   Agency                62288 non-null  object 
 3   Agency Type           62288 non-null  object 
 4   Commision (in value)  62288 non-null  float64
 5   Destination           62288 non-null  object 
 6   Distribution Channel  62288 non-null  object 
 7   Duration              62288 non-null  int64  
 8   Gender                22713 non-null  object 
 9   Net Sales             62288 non-null  float64
 10  Product Name          62288 non-null  object 
 11  Claim                 62288 non-null  int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 5.7+ MB


In [166]:
dataset.describe()

Unnamed: 0,ID,Age,Commision (in value),Duration,Net Sales,Claim
count,62288.0,62288.0,62288.0,62288.0,62288.0,62288.0
mean,32844.953458,39.666324,12.829703,60.958804,50.717064,0.200006
std,18065.417216,14.014652,23.498745,114.32533,63.166715,0.400008
min,0.0,0.0,0.0,-2.0,-389.0,0.0
25%,17579.0,33.0,0.0,10.0,20.0,0.0
50%,33446.5,36.0,1.88,25.0,29.7,0.0
75%,48532.25,43.0,14.44,59.0,58.0,0.0
max,63323.0,118.0,262.76,4881.0,682.0,1.0


**1.** В колонке Duration минимальное значние "-2", так как продолжительность поездки не может быть меньше 0. Такие данные надо выбросить. Но и максимаьлная продолжительность в 4881 дней, кажется очень большой. Давайте установим максимальное значение "Duration" в 1000 дней.

**2.** Также добавим колонку категориальный признак: сгруппируем клиентов по возрасту ("ребенок", "взрослый", "пожилового возраста"). 

**3.** В колонке пола есть очень много пропусков, который практически никак не заполнить. Поэтому удалим ее.

In [167]:
def age_convert(age):
    result = ''
    if(age <= 21):
        result = 'Child'
    elif(age <= 60):
        result = 'Adult'
    else:
        result = 'Senior'
    return result

In [168]:
def data_pre_processing(df):
    
    df['Age Group'] = df['Age'].map(lambda x: age_convert(x))

    df.drop('Gender',axis=1, inplace=True)
    
    df.loc[df.Duration < 0, 'Duration'] = df['Duration'].median()

    df.loc[df.Duration > 1000, 'Duration'] = 1000
    
    df = pd.get_dummies(df, columns=['Agency', 'Agency Type', 'Destination', 'Distribution Channel', 'Product Name', 'Age Group'], drop_first=True)
    
    return df

In [169]:
dataset = data_pre_processing(dataset)

In [170]:
dataset.head()

Unnamed: 0,ID,Age,Commision (in value),Duration,Net Sales,Claim,Agency_ART,Agency_C2B,Agency_CBH,Agency_CCR,...,Product Name_Silver Plan,Product Name_Single Trip Travel Protect Gold,Product Name_Single Trip Travel Protect Platinum,Product Name_Single Trip Travel Protect Silver,Product Name_Spouse or Parents Comprehensive Plan,Product Name_Ticket Protector,Product Name_Travel Cruise Protect,Product Name_Value Plan,Age Group_Child,Age Group_Senior
0,45341,28,28.13,34,112.5,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,12958,37,12.95,53,37.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18233,27,0.0,28,13.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31742,36,0.0,1,34.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14381,26,23.76,33,39.6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [176]:
X = dataset.drop('Claim', axis = 1)
y = dataset['Claim']

#### Реализация стримингового прочтения файлов

Попробуем воссоздать потом реальных данных. Для этого разобьем данные на батчи и будем их по порядку считывать.

In [261]:
def streaming_reading(X_train, y_train, batch_size=5000):
    X = []
    y = []
    current_line = 0
    train_data, train_label = shuffle(X_train, y_train, random_state=0)
    train_data = train_data.to_numpy()
    for row, target in zip(train_data, train_label):
        X.append(row)
        y.append(target)

        current_line += 1
        if current_line >= batch_size:
            X, y = np.array(X), np.array(y)
            yield X, y
            X, y = [], []
            current_line = 0

#### Реализация lightgbm 

In [266]:
def IncrementaLightGbm(X, y):  
    gbm = None

    params = {
        'task': 'train',
        'application': 'binary',  
        'boosting_type': 'gbdt', 
        'learning_rate': 0.05,  
        'tree_learner': 'serial',
        'metric': ['binary_logloss', 'auc'], 
        'max_bin': 255,
    }
    streaming_train_iterators = streaming_reading(X, y, batch_size=5000)

    for i, data in enumerate(streaming_train_iterators):
        X_batch = data[0]
        y_batch = data[1]
        X_train, X_test, y_train, y_test = train_test_split(X_batch, y_batch, test_size=0.1, random_state=0)
        y_train = y_train.ravel()
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

        gbm = lgb.train(params,
                        lgb_train,
                        num_boost_round=1000,
                        valid_sets=lgb_eval,
                        init_model=gbm, 
                        early_stopping_rounds=10,
                        verbose_eval=False,
                        keep_training_booster=True)  

        print("{} time".format(i))  
        score_train = dict([(score[1], score[2]) for score in gbm.eval_train()])
        print('The score of the current model in the training set is: logloss=%.4f, auc=%.4f, \n'
              % (score_train['binary_logloss'], score_train['auc']))

    return gbm

#### Запуск обучения

In [267]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2)
gbm = IncrementaLightGbm(train_X, train_y)
pred_y = gbm.predict(test_X)
pred_classes = np.where(pred_y > 0.5, 1, 0)
print(f'F1 score: {f1_score(test_y, pred_classes)}')
print('------------------------------------------')
print(f'Precision: {precision_score(test_y, pred_classes)}')
print('------------------------------------------')
print(f'Recall: {recall_score(test_y, pred_classes)}')

joblib.dump(gbm, 'loan_model.pkl')
gbm = joblib.load('loan_model.pkl')

[LightGBM] [Info] Number of positive: 894, number of negative: 3606
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1106
[LightGBM] [Info] Number of data points in the train set: 4500, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.198667 -> initscore=-1.394649
[LightGBM] [Info] Start training from score -1.394649
0 time
The score of the current model in the training set is: logloss=0.2857, auc=0.9300, 

[LightGBM] [Info] Number of positive: 867, number of negative: 3633
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 4500, number of used features: 60
1 time
The score of the current model in the training set is: logloss=0.2185, auc=0.9649, 

[LightGBM] [Info] Number of positive: 899, number 