In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Build a {feature name -> description} lookup from the semicolon-separated
# data dictionary file.
columns_desc = (
    pd.read_csv("./data/columns.csv", delimiter=";")
    .set_index("Feature Name (For participants)")["Feature Description (For participants)"]
    .to_dict()
)

# Echo the mapping as the cell output.
columns_desc

{'abon_id': 'Идентификатор Абонента',
 'target': 'Таргет',
 'Balance_uah': 'Баланс',
 'TM_ID': 'Тариф',
 'lt': 'Срок жизни абонента',
 'CALCULATION_METHOD_ID': 'Тип расчета',
 'loc_cnt_events': 'Общее количество событий',
 'loc_is_obl_center': 'Количество событий в областном центре',
 'loc_market_share': 'Доля рынка Vodafone в регионе',
 'bs_of_succ_but_drop_m1': 'Были ли обрывы соединений',
 'bs_of_succ_m1': 'Успешные события',
 'bs_of_unsucc_attemp_equip_m1': 'Неуспешные события из-за терминала абонента',
 'bs_of_unsucc_low_balance_m1': 'Неуспешные события из-за низкого баланса',
 'bs_of_attemps_all_m1': 'Количество попыток звонков',
 'bs_of_recall_m1': 'Количество перезвонов',
 'bs_succ_rate': '% успешных событий',
 'bs_drop_call_rate': '% обрывов звонков',
 'bs_drop_rate': '% обрывов передачи данных',
 'bs_recall_rate': '% перезвонов',
 'tsoa_direct_cnt': 'Количество событий контакта с Vodafone в прямом канале',
 'tsoa_call_cnt': 'Количество событий контакта с Vodafone в ЦОА',
 'ts

In [3]:
def describe_series(pd_series, descriptions=None):
    """Summarise one column as a single-row DataFrame.

    The row is indexed by the series name (index name ``"Name"``) and holds
    the output of ``Series.describe()`` plus a ``median`` column, explicit
    ``25%`` / ``75%`` quantiles and a ``Desc`` column with the feature
    description.

    Args:
        pd_series: Named pandas Series to summarise.
        descriptions: Optional ``{feature name: description}`` mapping.
            Defaults to the notebook-level ``columns_desc`` dictionary.

    Returns:
        A one-row ``pandas.DataFrame``.
    """
    if descriptions is None:
        # Fall back to the feature dictionary loaded at the top of the notebook.
        descriptions = columns_desc

    # describe() returns a Series; transpose it so each statistic is a column.
    description_df = pd.DataFrame(pd_series.describe()).T

    # Add the median and (re)compute the quartiles explicitly.
    description_df["median"] = pd_series.median()
    description_df["25%"] = pd_series.quantile(0.25)
    description_df["75%"] = pd_series.quantile(0.75)

    # Columns missing from the mapping (e.g. engineered features) get an
    # empty description instead of raising KeyError.
    description_df["Desc"] = descriptions.get(pd_series.name, "")

    description_df.index.name = "Name"

    return description_df

In [4]:
# Load the pickled data bundle.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only open bundles that come from a trusted source.
with open("./data/churn_model_pd2.pcl", "rb") as f:
    bdl_data = pickle.load(f)

In [5]:
# The bundle unpacks into three parts: general info, train data and test data.
gen_info, train_data, test_data = bdl_data

# Each split unpacks into four objects: info, fe, bnum and dpi.
# NOTE(review): what info / bnum / dpi contain is not shown in this notebook.
train_info, df_train_fe, df_train_bnum, df_train_dpi = train_data
test_info, df_test_fe, df_test_bnum, df_test_dpi = test_data

# Keep untouched copies of the fe frames so later cells can start again from
# the raw data after df_train_fe has been modified.
df_train_fe_origin = df_train_fe.copy()
df_test_fe_origin = df_test_fe.copy()

In [59]:
# from ydata_profiling import ProfileReport

# profile_report = ProfileReport(df_train_fe, title="Churn clients", minimal=True)
# profile_report.to_file("reports/df_train_fe.html")

In [60]:
# import sweetviz as sw

# sw_report = sw.analyze(df_train_fe, target_feat="target", pairwise_analysis="off")
# sw_report.show_html("reports/df_train_fe_sw.html")

In [18]:
# Remove columns with an unsupported type (totally blank series).

UNSUPPORTED_COLUMNS = [
    'device_has_gprs',
    'bs_of_succ_m1',
    'bs_drop_call_rate',
    'bs_succ_rate',
    'bs_drop_rate',
    'bs_of_recall_m1',
    'bs_of_attemps_all_m1',
    'bs_of_unsucc_low_balance_m1',
    'bs_recall_rate',
    'bs_of_unsucc_attemp_equip_m1',
    'bs_of_succ_but_drop_m1',
]

# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
df_train_fe = df_train_fe.drop(columns=UNSUPPORTED_COLUMNS, errors="ignore")

Series([], Name: count, dtype: int64)

In [62]:
# Drop frauds: keep rows whose MV_FRAUD_BLOCK flag is not 1, then discard the
# flag column itself.
df_train_fe = df_train_fe.loc[df_train_fe["MV_FRAUD_BLOCK"] != 1].drop(columns=["MV_FRAUD_BLOCK"])

In [63]:
# NEW COLUMN: device_brand (one categorical label derived from the one-hot brand flags)
# MISSING VALUES (median): device_price

DEVICE_COLUMNS = [
    "device_brand_samsung",
    "device_brand_nokia",
    "device_brand_lenovo",
    "device_brand_apple",
    "device_brand_huawei",
    "device_brand_lg",
    "device_brand_xiaomi",
    "device_brand_meizu",
    "device_brand_prestigio",
    "device_brand_sony",
    "device_brand_other",
]

# Treat a missing brand flag as "flag not set".
df_train_fe[DEVICE_COLUMNS] = df_train_fe[DEVICE_COLUMNS].fillna(0)

# idxmax returns the FIRST column when several share the maximum, so a row
# with no flag set (all zeros) would otherwise be labelled "samsung".
# Such rows get the explicit label "unknown" instead.
has_brand_flag = df_train_fe[DEVICE_COLUMNS].max(axis=1) > 0
df_train_fe["device_brand"] = (
    df_train_fe[DEVICE_COLUMNS]
    .idxmax(axis=1)
    # Literal (non-regex) prefix removal: "device_brand_apple" -> "apple".
    .str.replace("device_brand_", "", regex=False)
    .where(has_brand_flag, "unknown")
)

In [64]:
# Median of every numeric column within each device brand; used as fill values below.
grouped_medians = df_train_fe.groupby("device_brand").median(numeric_only=True)

# Columns whose gaps are filled with the median of the row's device brand.
columns_to_fill = ["device_price", "imei_mean_price", "imei_max_price", "device_height_mm"]

for column in columns_to_fill:
    # Rows where this column is missing.
    is_missing = df_train_fe[column].isnull()
    # Look up each such row's brand median and write it into the gap.
    df_train_fe.loc[is_missing, column] = df_train_fe.loc[is_missing, "device_brand"].map(
        grouped_medians[column]
    )

In [65]:
# Threshold, in percent of missing values, above which a column is selected.
# With 0, every column that has at least one missing value qualifies.
MISSING_INDEX = 0

# NOTE: despite the name, this selection also includes float64 columns.
columns_with_int_dtype = df_train_fe.select_dtypes(include=["int64", "int32", "float64"]).columns
# Percentage of missing values per column ...
highly_missing_columns = df_train_fe.isnull().mean() * 100
# ... reduced to the names of the columns above the threshold.
highly_missing_columns = highly_missing_columns[highly_missing_columns > MISSING_INDEX].index

# Numeric columns that contain missing values.
highly_missing_int_columns = set(highly_missing_columns) & set(columns_with_int_dtype)

# (Disabled) summary of the affected columns built with describe_series.
# data = pd.DataFrame()

# for col in highly_missing_int_columns:
#     series_to_concat = describe_series(df_train_fe[col])
#     data = pd.concat([data, series_to_concat])

# print(data)

# Replace the remaining missing values in those numeric columns with 0.
df_train_fe[list(highly_missing_int_columns)] = df_train_fe[list(highly_missing_int_columns)].fillna(0)

In [66]:
from sklearn import preprocessing
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
import lightgbm as lgbm

from datetime import datetime, timedelta

# Version tag embedded in the file name of every saved model.
VERSION = 1.0


def save_model(model=None, features=None):
    """Pickle ``(model, features)`` to ``models/<ClassName>_<VERSION>.pickle``.

    Args:
        model: Object to persist. Nothing is written when it is ``None``.
        features: Feature names stored next to the model so that prediction
            can select the same columns; defaults to an empty list.
    """
    # Compare against None explicitly: truthiness of a model object is not a
    # reliable "was a model passed" test (objects defining __len__ or __bool__
    # can be falsy or raise).
    if model is None:
        return

    import os  # local import: os is not among the notebook's top-level imports

    if features is None:
        # Fresh list per call instead of a shared mutable default argument.
        features = []

    name = f"{type(model).__name__}_{VERSION}.pickle"

    # Create the output directory on first use.
    os.makedirs("models", exist_ok=True)
    with open(f"models/{name}", "wb") as file:
        pickle.dump((model, features), file)
    print("Save", name)


def load(file_name):
    """Unpickle and return the object stored at ``models/<file_name>``.

    Only use this on files from a trusted source: unpickling can run
    arbitrary code.
    """
    model_path = f"models/{file_name}"
    with open(model_path, "rb") as handle:
        return pickle.load(handle)


def train(dataframe):
    """Fit a LightGBM binary classifier on ``dataframe`` and save it to disk.

    Args:
        dataframe: Frame holding the feature columns plus a ``target`` column.

    Returns:
        The fitted ``LGBMClassifier``. The model and its feature list are also
        persisted through ``save_model``.
    """
    # Work on a copy so that popping the target leaves the caller's frame intact.
    features = dataframe.copy()
    labels = features.pop("target")

    classifier = lgbm.LGBMClassifier(
        objective="binary",
        metric="binary_logloss",
        scale_pos_weight=94,
        lambda_l1=3.4925825074775134e-08,
        lambda_l2=1.541663754329288e-05,
        num_leaves=110,
        verbose=-1,
    )
    classifier.fit(features, labels)

    # Store the training column order together with the model.
    save_model(classifier, list(features.columns))

    return classifier


def predict(dataframe, version=VERSION):
    """Predict labels for ``dataframe`` with a saved LGBMClassifier.

    Args:
        dataframe: Frame containing at least the feature columns stored with
            the model.
        version: Version tag of the saved model to load; the file read is
            ``models/LGBMClassifier_<version>.pickle``.

    Returns:
        The output of the loaded model's ``predict`` on the selected columns.
    """
    # Honour the requested version instead of a hard-coded file name.
    model, features = load(f"LGBMClassifier_{version}.pickle")

    # Selecting with the stored feature list yields a new frame restricted to
    # those columns, so no defensive copy of the input is needed.
    return model.predict(dataframe[features])

In [72]:
import importlib

# Import (or re-import) steps.set_missings so that edits made to the module on
# disk take effect without restarting the kernel.
importlib.reload(importlib.import_module("steps.set_missings"))

from steps.set_missings import set_missings

# Apply set_missings to the untouched copies of the train and test frames.
# NOTE(review): this starts from the *_origin copies, so the changes made to
# df_train_fe in earlier cells do not feed into these frames — confirm intended.
df_train_fe_processed = set_missings(df_train_fe_origin)
df_test_fe_processed = set_missings(df_test_fe_origin)

In [73]:
# Fit the classifier on the processed training frame; train() also pickles the
# model under models/.
lgbm_model = train(df_train_fe_processed)

Save LGBMClassifier_1.0.pickle


In [74]:
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import confusion_matrix

# Separate the features from the target column for both splits.
X_train = df_train_fe_processed.drop("target", axis=1)
y_train = df_train_fe_processed.target

X_test = df_test_fe_processed.drop("target", axis=1)
y_test = df_test_fe_processed.target

# predict() loads the pickled model from disk rather than using lgbm_model.
y_train_pred = predict(X_train)
y_test_pred = predict(X_test)


def specificity_score(y_true, y_pred):
    """Return the true-negative rate ``TN / (TN + FP)`` for binary 0/1 labels.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2, so the four-way unpack
    also works when only one class occurs in ``y_true`` / ``y_pred``.
    """
    tn, fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp)


def fpr_score(y_true, y_pred):
    """Return the false-positive rate ``FP / (TN + FP)`` for binary 0/1 labels.

    ``labels=[0, 1]`` pins the confusion matrix to 2x2, so the four-way unpack
    also works when only one class occurs in ``y_true`` / ``y_pred``.
    """
    tn, fp, _fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return fp / (tn + fp)


# Report each metric on the training split first, then on the test split.
for label, score_fn, truth, predicted in (
    ("accuracy train", accuracy_score, y_train, y_train_pred),
    ("accuracy test", accuracy_score, y_test, y_test_pred),
    ("precision train", precision_score, y_train, y_train_pred),
    ("precision test", precision_score, y_test, y_test_pred),
    ("fpr train", fpr_score, y_train, y_train_pred),
    ("fpr test", fpr_score, y_test, y_test_pred),
):
    print(label, score_fn(truth, predicted))

accuracy train 0.8527780370404938
accuracy test 0.8228466666666666
precision train 0.30267146646456994
precision test 0.2339799206742687
fpr train 0.15727176258608533
fpr test 0.17582878724297105
