In [120]:
%load_ext autoreload
%autoreload 2

import optuna
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import shap
import numpy as np
import scipy
import seaborn as sns

from steps.prepare_data import load_processed_data, load_split_processed_data
from utils.model import predict, load_model, predict_booster, predict_booster_model
from steps.load_data import LoadData

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
data_loader = LoadData()

# Work on a copy so that the frame held by data_loader is not modified by this notebook.
df_train_bnum = data_loader.df_train_bnum.copy()

# Attach the churn label to every bnum row through the subscriber id.
# Rows whose abon_id has no entry in df_train_fe get NaN in "target".
abon_target = data_loader.df_train_fe[["target", "abon_id"]]
df_train_bnum["target"] = df_train_bnum["abon_id"].map(abon_target.set_index("abon_id")["target"])

# Share of each target value; printed explicitly because it is not the cell's last expression.
print(df_train_bnum["target"].value_counts(normalize=True))

# Per-bnum activity counters analysed throughout the notebook.
activity_columns = [
    "call_cnt_out",
    "call_cnt_in",
    "call_dur_out",
    "call_dur_in",
    "cnt_sms_out",
    "cnt_sms_in",
]

In [None]:
# Short service numbers to look up with fetch_phone_information.
# Each number is listed once so that a lookup loop does not fetch it twice.
short_numbers = [
    "111",
    "1020",
    "275",
    "3700",
    "277",
    "5010",
    "3135",
    "273",
    "3133",
    "112",
    "7777",
    "1545",
    "4224",
    "1525",
    "30094",
    "1648",
    "2233",
    "555",
    "1551",
    "30043",
    "2250",
    "1050",
    "225",
    "1535",
    "5000",
    "4666",
]

In [None]:
import requests
import json


def fetch_phone_information(short_number: str, timeout: float = 10.0) -> str:
    """Return the title of a short number from the vodafone.ua short-numbers API.

    Args:
        short_number: Short number to look up, e.g. "111".
        timeout: Seconds to wait for the HTTP response. Without it
            ``requests.get`` can block forever on an unresponsive server.

    Returns:
        The ``title`` of the first item in the response's ``data`` list, or the
        string "Not found" when the request fails or the payload does not have
        the expected shape.
    """
    base_url = "https://www.vodafone.ua/api/short_numbers/?short_number={variable}&Voice=on&SMS=on"
    url = base_url.format(variable=short_number)

    try:
        response = requests.get(url, timeout=timeout)
        payload = json.loads(response.content.decode("utf-8"))
        return payload["data"][0]["title"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Only lookup failures are treated as "Not found": network errors, invalid
        # JSON / bad encoding (both ValueError subclasses) and unexpected payload
        # shapes. KeyboardInterrupt and SystemExit are deliberately not swallowed.
        return "Not found"

In [None]:
# short_numbers_dict = {}

# for number in short_numbers:
#     short_numbers_dict[number] = fetch_phone_information(number)

# NOTE(review): the code that builds short_numbers_dict is commented out above, so on a
# fresh kernel the expression below raises NameError; it only works with leftover kernel state.
short_numbers_dict

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time


def get_phonebook_header(phone_number):
    """Search telefonnyjdovidnyk.com.ua for a phone number and return the result heading.

    A headless Chrome session opens the site, submits ``phone_number`` through
    the form field named "search", waits a fixed 3 seconds and parses the
    resulting page.

    Args:
        phone_number: Number typed into the site's search field.

    Returns:
        ``{"header": <text>}`` with the text of the first element of class
        "phonebook-header" (or, if there is none, of the first ``<h3>``);
        ``{"error": "phonebook-header or h3 not found"}`` when neither exists.
    """
    url = "https://www.telefonnyjdovidnyk.com.ua/"

    # Set up headless Chrome
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    try:
        driver.get(url)

        # Find the search input, enter the phone number, and submit the form
        search_input = driver.find_element("name", "search")
        search_input.send_keys(phone_number)
        search_input.submit()

        # Wait for the page to load (you may need to adjust the sleep time)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always shut the browser down, even if navigation or the element lookup
        # raised; otherwise every failed call leaves a Chrome process behind.
        driver.quit()

    phonebook_header = soup.find(class_="phonebook-header")

    # If not found, look for the first h3 tag
    if not phonebook_header:
        phonebook_header = soup.find("h3")

    if phonebook_header:
        header_text = phonebook_header.get_text(strip=True)
        return {"header": header_text}
    else:
        return {"error": "phonebook-header or h3 not found"}

In [None]:
# Empty mapping of phone number -> lookup result; the commented-out loop in the next cell
# would fill it with get_phonebook_header results.
some_numbers_dict = {}

In [None]:
# for number in some_numbers:
#     print("Fetching", number)

#     if some_numbers_dict.get(number) and some_numbers_dict.get(number) != "phonebook-header or h3 not found":
#         print("Already fetched", some_numbers_dict[number])
#         continue

#     some_numbers_dict[number] = get_phonebook_header(number)
#     print("Fetched", some_numbers_dict[number])

In [None]:
# Hard-coded mapping of phone number -> {"header": <page heading>}.
# NOTE(review): the values have the shape returned by get_phonebook_header, so this is
# presumably a pasted snapshot of an earlier scraping run — confirm before relying on it.
# Some headings are generic page text or news headlines rather than an owner name.
some_numbers_dict = {
    "380800210800": {"header": "Аферисти Київ"},
    "380800500500": {"header": "РАЙФФАЙЗЕН БАНК АВАЛЬ, ПАТ, ДНІПРОПЕТРОВСЬКАОБЛАС"},
    "380800305555": {"header": "Телефонне шахрайство: у МВС розповіли, хто і як виманює гроші в українців"},
    "380800500850": {"header": "КРЕДОБАНК,АТ"},
    "380800307010": {"header": "БАНК ВОСТОК,ПАТ"},
    "380800502050": {"header": "SenseBank"},
    "380800504450": {"header": "ЕКСПОРТНО-ІМПОРТНИЙ БАНК УКРАЇНИ, ПАТ, ЧЕРНІВЕЦЬКАФІЛІЯ"},
    "380800502030": {"header": "ІДЕЯ БАНК,АТ"},
    "380800507700": {"header": "БАНК КРЕДИТ ДНІПРО,АТ"},
    "380800504400": {"header": "МЕГАБАНК,АТ"},
    "380800309000": {"header": "УКРГАЗБАНК, АКЦІОНЕРНИЙБАНК"},
    "380800307030": {"header": "ПІВДЕННИЙ, ПУБЛІЧНЕ АТ АКЦІОНЕРНИЙБАНК"},
    "380442474040": {"header": "КИЇВТЕПЛОЕНЕРГО,КП"},
    "380442907290": {"header": "ПЕРШИЙ УКРАЇНСЬКИЙ МІЖНАРОДНИЙБАНК"},
    "380443630133": {"header": "ОЩАДБАНК, УКРАЇНСЬКИЙ КОМЕРЦІЙНИЙБАНК"},
    "380442020202": {
        "header": "У Черкасах телефонні шахраї заробляють десятки тисяч гривень на довірливості містян"
    },
    "380443519998": {"header": "Інтерес до цього номера за останній місяць"},
    "380443519999": {"header": "ЮНІКАСТ ІНВЕСТ,ТОВ"},
    "380442050020": {"header": "СІНЕВО УКРАЇНА,ТОВ"},
    "380442329925": {"header": "ЗАКАЗ. ЮА,ТОВ"},
    "380442021588": {"header": "КИЇВ ЕНЕРГО,АТ"},
    "380442224222": {"header": "СПОРТ ЛАЙФ УКРАЇНА,ТОВ"},
    "380442470786": {"header": "STIL EVROTOV"},
    "380444590740": {"header": "КИЇВОБЛЕНЕРГО"},
    "380444908888": {"header": "РАЙФФАЙЗЕН БАНК АВАЛЬ,АТ"},
    "380444950405": {"header": "Як не стати жертвою телефонних шахраїв"},
    "380444944320": {"header": "Інтерес до цього номера за останній місяць"},
    "380444900500": {"header": "ОТП БАНК,АТ"},
    "380445038080": {"header": "Це виртуальний оператор"},
    "380445276363": {"header": "Як не стати жертвою телефонних шахраїв. Пояснюють експерти"},
    "380445370222": {"header": "Телефонне платіжне шахрайство –#ШахрайГудбай"},
    "380445001717": {
        "header": "У Черкасах телефонні шахраї заробляють десятки тисяч гривень на довірливості містян"
    },
    "380445000303": {"header": "ХАРДТЕХСЕРВІС,ТОВ"},
    "380442901988": {"header": "Шахрайство в мережі та телефонне шахрайство"},
    "380442221111": {"header": "Як діяти українцям, коли телефонують шахраї"},
    "380442204404": {
        "header": "Обережно: телефонне шахрайство! Чернігівців «розводять» на гроші під час пошуку роботи"
    },
    "380442220333": {
        "header": "КІБЕРПОЛІЦІЯ ЗАСТЕРІГАЄ ВІД ТЕЛЕФОННИХ ШАХРАЇВ, ЯКІ ВИДАЮТЬ СЕБЕ ЗА СПІВРОБІТНИКІВ БАНКУ"
    },
    "380442009010": {"header": "Мошенники “разводят” предпринимателей"},
    "380487059090": {"header": "Інтерес до цього номера за останній місяць"},
    "380487300030": {"header": "ODREX, МЕДИЧНИЙДІМ"},
    "380487055555": {"header": "Телефонне шахрайство: у МВС розповіли, хто і як виманює гроші в українців"},
    "380567934826": {"header": "STANTSIYA DNIPROPETROVSKFILIA"},
    "380567878104": {"header": "На Рівненщині 1,5 року обмеження волі за телефонне шахрайство отримав волинянин"},
    "380567900999": {"header": "Інтерес до цього номера за останній місяць"},
    "380577075757": {"header": "ХАРКІВВОДОКАНАЛ,КП"},
    "380577209720": {"header": "МАКСНЕТ"},
    "380577665104": {"header": "ХАРКІВМІСЬКГАЗ,ПАТ"},
    "380666163133": {"header": "На Рівненщині за зухвале телефонне шахрайство піде під суд житель Миколаївщини"},
    "380670107104": {"header": "ХАРКІВМІСЬКГАЗ,ПАТ"},
    "380674660466": {"header": "Київстар"},
    "380677880884": {"header": "Що треба знати про мобільне шахрайство"},
    "380931710000": {"header": "ЦЕНТР ЦИФРАЛ СЕРВІС ЗАПОРІЖЖЯ,ТОВ"},
    "380931771611": {"header": "ЛАЗЕРХАУЗ,ТОВ"},
    "380952331515": {"header": "ЗАПОРІЗЬКІ ЦИФРОВІ КОМУНІКАЦІЇ,ТОВ"},
    "380957321212": {
        "header": "Які види шахрайства розповсюджені на Житомирщині і як не стати жертвою шахраїв, розповіли у…"
    },
    "380958788181": {
        "header": "КІБЕРПОЛІЦІЯ ЗАСТЕРІГАЄ ВІД ТЕЛЕФОННИХ ШАХРАЇВ, ЯКІ ВИДАЮТЬ СЕБЕ ЗА СПІВРОБІТНИКІВ БАНКУ"
    },
    "380962907290": {"header": "ПЕРШИЙ УКРАЇНСЬКИЙ МІЖНАРОДНИЙБАНК"},
    "380984500609": {"header": "НОВА ПОШТА,КОМПАНІЯ"},
    "380995055757": {
        "header": "НАЦІОНАЛЬНИЙ БАНК УКРАЇНИ ЗАПУСТИВ ІНФОРМАЦІЙНУ КАМПАНІЮ З ПРОТИДІЇ ШАХРАЙСТВУ #ШАХРАЙГУДБАЙ"
    },
    "380997344444": {"header": "ФРЕГАТ ІСП,ТОВ"},
}

# Number of phone numbers with a stored heading.
len(some_numbers_dict)

In [121]:
# df_train_bnum.bnum.value_counts().sort_values(ascending=False).index.to_list()

# Category vocabularies: each list holds `bnum` values (sender names or numbers) that
# assign_bnum_category maps to the category of the same name. Matching is done with
# `Series.isin`, i.e. exact string equality, so near-identical spellings
# (e.g. "ukreximbank" / "ukreximban") are listed as separate entries.
# A value that appears in several lists ends up in the category that comes LAST in
# bnum_categories_dict (e.g. "creditdnepr" is also in credits_companies,
# "linkedin" in messengars_companies, "metro" in shops_companies).

# Banks.
bank_companies = [
    "privatbank",
    "oschadbank",
    "raiffeisen",
    "monobank",
    "mono",
    "alfabank",
    "pumb",
    "otp bank",
    "ukrgasbank",
    "a-bank",
    "tascombank",
    "ukreximbank",
    "bankvostok",
    "pivdenny",
    "globusbank",
    "ukreximban",
    "poltavaban",
    "bank bls",
    "accordbank",
    "creditdnepr",
    "megabank",
    "concordbank",
    "radabank",
    "procredit",
    "industrial",
    "pravex ban",
    "rws bank",
    "bankago",
    "credobank",
    "forwardban",
    "sportbank",
    "idea bank",
    "mtb bank",
    "piraeusbank",
    "bankforward",
    "alf-ua.com",
    "kredobank",
    "sichbank",
    "izibank",
    "pumb onlin",
    "ideabank",
    "cagricole",
]

# Job boards and education/career services.
work_companies = [
    "rabota.ua",
    "work.ua",
    "jooble",
    "linkedin",
    "itstep",
    "skillup",
    "rabota",
    "start-work",
]

# Grocery retailers.
grocery_companies = [
    "varus",
    "silpo",
    "fozzy",
    "myasomarke",
    "metro",
    "tavria_v",
    "eko market",
    "фрукты",
]

# Postal senders. "novaposhta" is also listed in delivery_companies, which is applied
# later in bnum_categories_dict and therefore takes precedence for that value.
post_companies = [
    "380984500609",
    "novaposhta",
]

# Taxi services, including three numeric short codes.
taxy_companies = [
    "taxi-838",
    "uklon",
    "bolt",
    "uber",
    "shark taxi",
    "taxi 3040",
    "opti-579",
    "ontaxi",
    "taxi 2288",
    "taxi vezi",
    "taxi 323",
    "taxi 571",
    "taxi-808",
    "eco taxi",
    "taxi-280",
    "taximer",
    "taxi 309",
    "taxi 777",
    "taxi-653",
    "taxi 959",
    "2233",
    "3135",
    "3133",
]

# Sender names attributed to credit / micro-loan / money services.
# Entries are matched by exact equality (`Series.isin` in assign_bnum_category), so each
# spelling is listed once; exact duplicates ("credit_plus", "credit7.ua", "creditexpr")
# were removed because they have no effect on the match.
# NOTE(review): several entries look like false positives of the keyword search in
# check_word (e.g. "multicast", "casual", "docassist", "foodpicasso") — confirm before
# using this list as a production feature vocabulary.
credits_companies = [
    "credit_plus",
    "credit7.ua",
    "mycredit",
    "ze.kredit",
    "moneyveo",
    "mycredit.ua",
    "creditdnepr",
    "e-groshi",
    "credit7",
    "clickcredit",
    "soscredit",
    "crediton",
    "creditexpr",
    "creditdnep",
    "creditplu",
    "creditdebt",
    "zecredit",
    "kredit",
    "alexcredit",
    "shgroshi",
    "creditpod",
    "money4you",
    "dinero",
    "kreditytut",
    "creditpod-0",
    "clickcredi",
    "kredit4u",
    "e-wings",
    "minizaem",
    "money24",
    "moneyboom",
    "moneyextra",
    "moneylove",
    "zss",
    "creditplus",
    "k-kapital",
    "egroshicom",
    "skarbcomua",
    "credit_plu",
    "miloan.ua",
    "mycredit-u",
    "mycredit-ua",
    "mycredit.u",
    "ua-mycredi",
    "ua-mycredit",
    "mycreditua",
    "ccloan",
    "loanyua",
    "ze.kred1t",
    "selficredi",
    "mistercash",
    "selficredit",
    "sos credit",
    "zecredi",
    "dengi dozp",
    "otp credit",
    "ze_kred1t",
    "sloncredit",
    "bingocash",
    "cash point",
    "cashberry",
    "slon credi",
    "domonet",
    "yoomoney",
    "creditpod-",
    "slon credit",
    "moneta-z",
    "bankacredi",
    "cash-kf",
    "advcash",
    "money.4.yo",
    "moneyexpert",
    "money-4-yo",
    "aviracredi",
    "money-4-you",
    "icredit",
    "creditbox",
    "money.4.you",
    "credit",
    "zekredi",
    "el caso",
    "creditup",
    "monetka",
    "dengivsim",
    "credit365",
    "moneyexper",
    "aviracredit",
    "kredit plu",
    "crediglbl",
    "bestcredit",
    "neocredit",
    "cashhelp",
    "telmone",
    "microcash",
    "novikredyt",
    "extramoney",
    "clycredit",
    "creditcafe",
    "ewacash",
    "kg-money",
    "creditik",
    "casharing",
    "case 24",
    "kreditstar",
    "forzacredit",
    "webmoney.ua",
    "forzacredi",
    "kredit plus",
    "ecase",
    "caseshop",
    "webmoney.u",
    "macincase",
    "verocash",
    "techno-cas",
    "novikredyty",
    "cash24",
    "onlycredit",
    "portmone",
    "mrmoney",
    "globalcredi",
    "credit-pro",
    "cashdesk",
    "stormoney",
    "credit2u",
    "opencredit",
    "smartmoney",
    "kvk-cash",
    "hotcredit",
    "techno-case",
    "cash365",
    "bestcredits",
    "bankacredit",
    "cash-ua",
    "sweet mone",
    "kredit1",
    "cashinua",
    "sweetmoney",
    "tviy cash",
    "moneyua",
    "catcredit",
    "money.spac",
    "topcredit",
    "sweet money",
    "pvks-kredit",
    "fotocaseua",
    "cashinsky",
    "easy cash",
    "yourmoney",
    "credbox",
    "kreditavans",
    "credit24",
    "casinoin",
    "globalcred",
    "caseller",
    "n1casino",
    "recredit",
    "happycredi",
    "uitracredit",
    "onecase",
    "seacredit",
    "smartcredi",
    "moneysmash",
    "pvks-kredi",
    "gdcashmere",
    "cool credi",
    "mlcrocredit",
    "turbocash",
    "allrightcas",
    "bystro.cas",
    "kreditavan",
    "blago cash",
    "ultracredi",
    "e-cash",
    "creditmax",
    "money_poin",
    "icases.ua",
    "dengi24",
    "hit_cash",
    "credit ok",
    "intercash",
    "monese",
    "elitcredit",
    "ify.credit",
    "dorcas",
    "moneyglad",
    "ifycredit",
    "uakredit",
    "bystro.cash",
    "kreditor",
    "dengidengi",
    "cashbe",
    "micredit",
    "casemaniac",
    "crazy case",
    "silver_cas",
    "sens credit",
    "mlcrocredi",
    "casey",
    "atom case",
    "cascata",
    "cashbox",
    "luckycash",
    "creditavir",
    "glad4money",
    "casekey",
    "kredit vsem",
    "multicast",
    "ultracashua",
    "hypno-casa",
    "avanscredit",
    "money_help",
    "casofficial",
    "incredo",
    "jeanscasual",
    "4-cases",
    "cool credit",
    "simplemone",
    "moneyup",
    "unicredit",
    "secretcase",
    "silvermone",
    "moneyjar",
    "nillkincase",
    "oncredit",
    "pancredit",
    "the-credit",
    "kredit_112",
    "flashcash",
    "creditbot",
    "creditorxx",
    "money_flas",
    "elitcases",
    "money_flash",
    "globalmone",
    "selectmoney",
    "prostomone",
    "kredenscafe",
    "moneyvalue",
    "sens credi",
    "money_star",
    "1case",
    "credit4u",
    "uitracredi",
    "mistermoney",
    "credit bot",
    "micrediton",
    "bit_money",
    "monet home",
    "tuskcasino",
    "casofficia",
    "prostomoney",
    "ultracredit",
    "reficredit",
    "ab.case",
    "money club",
    "moneysend",
    "creditnice1",
    "creditlite",
    "personcase",
    "bananacred",
    "likemoney",
    "creditavira",
    "pocketmone",
    "new_money",
    # -------- TEST Dataset ----------
    "cash.ua",
    "cashalot",
    "money.space",
    "incasso",
    "mistermone",
    "kredit-0",
    "zaxidkredit",
    "skymoney",
    "veo_cash",
    "kredit vse",
    "starcash",
    "simplemoney",
    "casual",
    "happycredit",
    "pocketmoney",
    "easy money",
    "takeurmoney",
    "888casino",
    "shipmoney",
    "mobilcase",
    "ify-credit",
    "foodpicasso",
    "silvermoney",
    "maxicredit",
    "vivid mone",
    "bezcredito",
    "squadcast",
    "mcmoney",
    "silver_cash",
    "case2case",
    "globalmoney",
    "nillkincas",
    "lucyscasino",
    "kredo-shop",
    "whitecredi",
    "micash",
    "cashpro",
    "cash4u",
    "kreditsous",
    "docassist",
    "bezcreditov",
    "dengivdolg",
    "whitecredit",
    "nicecredit",
    "creditnice",
    "microcredit",
    "youcash",
    "money_point",
    "credltmone",
    "pumb online",
]


# Food-delivery / restaurant senders. Matching is by exact equality, so each spelling
# only needs to appear once; the repeated "sushimaste" / "sushimaster" entries were dropped.
food_delivery_companies = [
    "dominos",
    "sushi wok",
    "sushimaster",
    "sushi icons",
    "pizza 33",
    "sushimaste",
    "glovo",
    "budusushi",
    "smilefood",
    "sushi-poin",
    "sushi boss",
]


# Pharmacies, laboratories and medical services (several entries differ only by a
# trailing character, e.g. "med-servic" / "med-service").
health_companies = [
    "e-health",
    "apteka911",
    "synevo",
    "helsi",
    "medcard24",
    "24/7 лікар",
    "med-servic",
    "med-service",
    "medcity.ua",
    "liki24.com",
    "apteka24.ua",
    "aptekanetu",
    "apteka d.s.",
    "podorozhnyk",
    "med-servlc",
    "apteka nc",
    "podorozhny",
    "med-servlce",
    "likar.info",
    "medcity",
    "likar",
    "apteka d.s",
    "leleka",
]

# Parcel / delivery services. "novaposhta" is also in post_companies; this list comes
# later in bnum_categories_dict, so it is the one that wins for that value.
delivery_companies = [
    "novaposhta",
    "ukrposhta",
    "meestua",
    "global24",
]

# Mobile-operator senders, grouped under the "competitors" category.
# NOTE(review): "vodafone ua" is listed here as a competitor — confirm this is intended.
competitors_companies = [
    "kyivstar",
    "vodafone ua",
    "lifecell",
    "kyivdigital",
]

# Fixed-line / internet / TV providers.
competitors_provider_companies = [
    "ukrtelecom",
    "triolan",
    "datagroup",
    "langate",
    "fregat.com",
    "volia",
    "vega",
    "viasat",
]

# Retail and miscellaneous commercial senders.
# "mycredit.ua" used to be listed here as well as in credits_companies. Because
# shops_companies comes later in bnum_categories_dict, assign_bnum_category labelled that
# credit sender as a shop, so it has been removed from this list.
# NOTE(review): "metro" is also in grocery_companies and resolves to this category;
# "domino's" sits here while "dominos" is in food_delivery_companies — confirm.
shops_companies = [
    "rozetka",
    "prom.ua",
    "allo",
    "olx",
    "makeup",
    "citrus.ua",
    "stylus",
    "eldorado",
    "comfy",
    "foxtrot.ua",
    "epicentrk",
    "colins",
    "eva",
    "yakaboo",
    "exist.ua",
    "metro",
    "eva-mozayk",
    "epic games",
    "bonjour",
    "citrus",
    "dzvlnok",
    "domino's",
    "fast box",
    "gift-servi",
    "kids-room",
    "kioto",
    "lemon.box",
    "link.dating",
    "mall",
    "medav",
    "ohlala",
    "photo-room",
    "pond",
    "rozetka, магазин",
    "shopster",
    "sofa_dream",
    "stay.cafe",
    "студия",
    "студия.меб",
    "ушастик",
    "холдинг",
    "цифровой",
    "шары",
    "shop_zakaz",
    "top-shop",
    "віці",
    "інтернет",
    "юа_магазин",
]

# Messenger, social-network and online-platform senders.
# "linkedin" is also in work_companies; this list is applied later in
# bnum_categories_dict, so it is the one that wins for that value.
messengars_companies = [
    "viber",
    "whatsapp",
    "telegram",
    "facebook",
    "google",
    "instagram",
    "tiktok",
    "snapchat",
    "linkedin",
    "zoom",
    "discord",
    "twitter",
]

# Cyber-police sender names (full and shortened spelling).
cyberpolice_companies = [
    "cyberpolice",
    "cyberpolic",
]

# Short code used for the "vodafone_support" category.
vodafone_support = [
    "111",
]

# Short codes used for the "vodafone_survey" category.
vodafone_survey = [
    "273",
    "275",
    "277",
]

# Short code used for the "vodafone_new_customer" category.
vodafone_new_customer = [
    "222",
]

# Short codes and sender name used for the "vodafone_services" category.
# "30094" was listed twice; the duplicate only caused a repeated lookup in the
# fetch_phone_information loop and has been removed.
vodafone_services = [
    "30094",
    "7777",
    "5010",
    "1020",
    "2828",
    "30042",
    "vodafone u",
]

# Full phone numbers labelled as fraud-related. Most of them map to fraud-themed
# headings in some_numbers_dict.
# NOTE(review): "380443519998" has only a generic heading there, while "380487055555"
# and "380445001717" have fraud-themed headings but are not listed here — confirm.
fraud_numbers = [
    "380800305555",
    "380442020202",
    "380444950405",
    "380445276363",
    "380445370222",
    "380442220333",
    "380442009010",
    "380442204404",
    "380442901988",
    "380567878104",
    "380666163133",
    "380677880884",
    "380957321212",
    "380958788181",
    "380443519998",
    "380995055757",
    "380800210800",
]

# Bank phone numbers (full numbers plus the short codes "729" and "3700").
bank_numbers = [
    "380800500500",
    "380800500850",
    "380800307010",
    "380800502050",
    "380800504450",
    "380800502030",
    "380800507700",
    "380800504400",
    "380800309000",
    "380800307030",
    "380442907290",
    "380443630133",
    "380444908888",
    "380444900500",
    "380962907290",
    "729",
    "3700",
]

# Phone numbers attributed to competitors.
# NOTE(review): "380505022250" has no entry in some_numbers_dict — source unknown.
competitors_numbers = [
    "380674660466",
    "380997344444",
    "380505022250",
]

# Emergency numbers and the "dsns ukr" sender name.
rescue_numbers = [
    "112",
    "102",
    "103",
    "dsns ukr",
]

# Sender name used for the "verify_numbers" category.
verify_numbers = [
    "verify",
]

# Phone-manufacturer sender names.
phone_companies = [
    "xiaomi",
    "apple",
    "samsung",
    "huawei",
]

In [122]:
# Category name -> list of bnum values belonging to it.
# assign_bnum_category walks this dict in insertion order and overwrites earlier matches,
# so when a bnum value is present in several lists the LAST category here wins.
bnum_categories_dict = {
    "bank_companies": bank_companies,
    "work_companies": work_companies,
    "grocery_companies": grocery_companies,
    "post_companies": post_companies,
    "taxy_companies": taxy_companies,
    "credits_companies": credits_companies,
    "food_delivery_companies": food_delivery_companies,
    "health_companies": health_companies,
    "delivery_companies": delivery_companies,
    "competitors_companies": competitors_companies,
    "competitors_provider_companies": competitors_provider_companies,
    "shops_companies": shops_companies,
    "messengars_companies": messengars_companies,
    "cyberpolice_companies": cyberpolice_companies,
    "vodafone_support": vodafone_support,
    "vodafone_survey": vodafone_survey,
    "vodafone_services": vodafone_services,
    "vodafone_new_customer": vodafone_new_customer,
    "fraud_numbers": fraud_numbers,
    "bank_numbers": bank_numbers,
    "competitors_numbers": competitors_numbers,
    "rescue_numbers": rescue_numbers,
    "verify_numbers": verify_numbers,
    "phone_companies": phone_companies,
}

# The lists below group CATEGORY NAMES (keys of bnum_categories_dict) into coarser topics.

finance_topic = [
    "bank_companies",
    "bank_numbers",
    "credits_companies",
]

vodafone_topic = [
    "vodafone_support",
    "vodafone_survey",
    "vodafone_services",
    "vodafone_new_customer",
]

# Includes "other", the default category of rows that matched no list, so uncategorised
# rows are assigned to casual_topic.
casual_topic = [
    "work_companies",
    "grocery_companies",
    "post_companies",
    "taxy_companies",
    "food_delivery_companies",
    "health_companies",
    "delivery_companies",
    "shops_companies",
    "fraud_numbers",
    "rescue_numbers",
    "verify_numbers",
    "phone_companies",
    "cyberpolice_companies",
    "other",
]

competitors_topic = [
    "competitors_companies",
    "competitors_provider_companies",
    "competitors_numbers",
]

messengars_topic = ["messengars_companies"]


# Topic name -> list of category names; read by assign_bnum_category to fill "bnum_topic".
bnum_topics = {
    "finance_topic": finance_topic,
    "vodafone_topic": vodafone_topic,
    "casual_topic": casual_topic,
    "competitors_topic": competitors_topic,
    "messengars_topic": messengars_topic,
}


def assign_bnum_category(dataframe, bnum_categories_dict, bnum_topics_dict=None):
    """Label every row of ``dataframe`` with a bnum category, topic and two boolean flags.

    The frame is modified in place (and also returned). Four columns are written:

    * ``bnum_category`` - key of ``bnum_categories_dict`` whose list contains the row's
      ``bnum`` value (exact match), or "other". If a value is present in several
      lists, the category that comes last in the dict wins.
    * ``bnum_topic`` - key of ``bnum_topics_dict`` whose list contains the row's
      category, or "other".
    * ``vodafone`` - True for the four vodafone_* categories.
    * ``casual`` - True for the everyday-service categories listed below and for "other".

    Args:
        dataframe: Frame with a ``bnum`` column.
        bnum_categories_dict: Mapping of category name -> iterable of bnum values.
        bnum_topics_dict: Mapping of topic name -> iterable of category names.
            Defaults to the module-level ``bnum_topics`` so existing two-argument
            calls keep working.

    Returns:
        The same ``dataframe`` object with the four columns added or overwritten.
    """
    if bnum_topics_dict is None:
        bnum_topics_dict = bnum_topics

    dataframe["bnum_category"] = "other"
    dataframe["bnum_topic"] = "other"

    # Later categories overwrite earlier ones for bnum values present in several lists.
    for category, numbers in bnum_categories_dict.items():
        dataframe.loc[dataframe["bnum"].isin(numbers), "bnum_category"] = category

    for topic, categories in bnum_topics_dict.items():
        dataframe.loc[dataframe["bnum_category"].isin(categories), "bnum_topic"] = topic

    vodafone_all = ["vodafone_support", "vodafone_survey", "vodafone_services", "vodafone_new_customer"]

    # Narrower than casual_topic: fraud, rescue, verify, phone and cyberpolice
    # categories are not counted as "casual" here.
    casual = [
        "work_companies",
        "grocery_companies",
        "post_companies",
        "taxy_companies",
        "food_delivery_companies",
        "health_companies",
        "delivery_companies",
        "shops_companies",
        "other",
    ]

    # `isin` already yields booleans, so no np.where(..., True, False) wrapper is needed.
    dataframe["vodafone"] = dataframe["bnum_category"].isin(vodafone_all)
    dataframe["casual"] = dataframe["bnum_category"].isin(casual)

    return dataframe


# test_df = assign_bnum_category(df_train_bnum, bnum_categories_dict)

In [None]:
# The 100 most frequent bnum values still labelled "other", as {bnum: row count}.
uncategorised_rows = df_train_bnum.loc[df_train_bnum["bnum_category"] == "other"]
uncategorised_rows["bnum"].value_counts().sort_values(ascending=False).head(100).to_dict()

In [None]:
import re


# Money/credit related stems in Latin and Cyrillic spelling, matched case-insensitively.
_CREDIT_STEMS = re.compile(r"грош|mone|деньг|deng|cas|cred|кред|kred", re.IGNORECASE)


def check_word(word):
    """Return True if ``word`` contains any of the money/credit stems, else False.

    The stems are matched as plain substrings, so unrelated words that contain
    "cas" (e.g. "multicast") also match.
    """
    return _CREDIT_STEMS.search(word) is not None


# All bnum values still labelled "other", most frequent first.
other_rows = df_train_bnum[df_train_bnum.bnum_category == "other"]
others_bnums = other_rows.bnum.value_counts().sort_values(ascending=False).index.to_list()

# Keep the ones that contain a money/credit stem (candidates for credits_companies).
credit_phrases = [phrase for phrase in others_bnums if check_word(phrase)]

credit_phrases

In [None]:
# Split the bnum rows by churn label. Rows whose target is NaN (abon_id without a
# label) match neither condition and land in neither frame.
is_churn = df_train_bnum["target"].eq(1)
is_not_churn = df_train_bnum["target"].eq(0)

churn_true = df_train_bnum.loc[is_churn]
churn_false = df_train_bnum.loc[is_not_churn]

In [None]:
# Ten most common bnum categories among churned rows (target == 1), as a share of rows.
churn_true.bnum_category.value_counts(normalize=True).sort_values(ascending=False).head(10)

In [None]:
# Ten most common bnum categories among non-churned rows (target == 0), as a share of rows.
churn_false.bnum_category.value_counts(normalize=True).sort_values(ascending=False).head(10)

In [None]:
def build_feature_share_df(dataframe, feature_name, agg="sum"):
    """Aggregate ``feature_name`` per bnum category and express it as a percentage.

    Args:
        dataframe: Frame with a ``bnum_category`` column and the ``feature_name`` column.
        feature_name: Column to aggregate.
        agg: Aggregation applied within each category (default "sum").

    Returns:
        A frame with columns ``bnum_category``, ``feature_name`` (the aggregated
        value) and ``<feature_name>_share`` (the value as a percentage of the
        column total), ordered by the aggregated value, largest first.
    """
    share_column = f"{feature_name}_share"

    per_category = dataframe.groupby("bnum_category")[[feature_name]].agg(agg)
    per_category = per_category.sort_values(by=feature_name, ascending=False).reset_index()

    grand_total = per_category[feature_name].sum()
    per_category[share_column] = (per_category[feature_name] / grand_total) * 100

    return per_category


def bnum_category_grouped_feature_histplot(dataframe, feature_name, agg="sum", **params):
    """Draw the per-category percentage share of ``feature_name`` as a histogram.

    The shares come from ``build_feature_share_df``; each category contributes one
    bar whose height is its ``<feature_name>_share`` value (passed as the weight).

    Args:
        dataframe: Frame with ``bnum_category`` and ``feature_name`` columns.
        feature_name: Column whose share is plotted.
        agg: Aggregation forwarded to ``build_feature_share_df``.
        **params: Extra keyword arguments forwarded to ``sns.histplot``.

    Returns:
        Whatever ``sns.histplot`` returns.
    """
    shares = build_feature_share_df(dataframe, feature_name, agg=agg)

    fixed_kwargs = {
        "data": shares,
        "x": "bnum_category",
        "weights": f"{feature_name}_share",
        "discrete": True,
        "kde": False,
    }

    return sns.histplot(**fixed_kwargs, **params)

In [None]:
# Compare per-category feature shares (aggregated with SUM) between churned and non-churned rows

fig, axs = plt.subplots(3, 2, figsize=(20, 16))

# axs.flat walks the grid row by row, one panel per activity column.
for ax, feature in zip(axs.flat, activity_columns):
    plt.sca(ax)  # the histplot helper draws on the current axes
    bnum_category_grouped_feature_histplot(churn_false, feature, color="blue", label="Churn False", fill=True)
    bnum_category_grouped_feature_histplot(churn_true, feature, color="red", label="Churn True", fill=False)

    # Adding labels and title
    ax.tick_params(axis="x", rotation=90)
    ax.set_xlabel("bnum_category")
    ax.set_ylabel(f"Percentage {feature}")
    ax.set_title(f"Normalized Histogram of '{feature}' (by SUM in bnum category)")
    ax.legend()

plt.subplots_adjust(hspace=0.5)
plt.tight_layout()
plt.show()

### Vodafone Services


In [None]:
# Print the API title of every entry in vodafone_services ("Not found" on failure).
for short_code in vodafone_services:
    print(short_code, fetch_phone_information(short_code))

In [None]:
churn_true_vodafone_services = churn_true[churn_true.bnum_category == "vodafone_services"]

# Sum the numeric columns per bnum. numeric_only=True keeps string columns such as
# bnum_category / bnum_topic out of the aggregation: with the pandas 2.x default they
# would be concatenated row by row (or raise on mixed types).
churn_true_vodafone_services_agg = churn_true_vodafone_services.groupby("bnum").sum(numeric_only=True)

churn_true_vodafone_services_agg

# INSIGHT: cnt_sms_in. Something wrong with 1020


In [None]:
# Share of incoming SMS per vodafone_services bnum among churned rows, largest first.
df = (
    churn_true_vodafone_services_agg["cnt_sms_in"]
    .reset_index()
    .assign(share=lambda frame: frame["cnt_sms_in"] / frame["cnt_sms_in"].sum())
)

df.sort_values("share", ascending=False)

# INSIGHT: cnt_sms_out. Something wrong with 5010


In [None]:
# Share of outgoing SMS per vodafone_services bnum among churned rows, largest first.
df = (
    churn_true_vodafone_services_agg["cnt_sms_out"]
    .reset_index()
    .assign(share=lambda frame: frame["cnt_sms_out"] / frame["cnt_sms_out"].sum())
)

df.sort_values("share", ascending=False)

### Vodafone Support


In [None]:
# Print the API title of every entry in vodafone_support ("Not found" on failure).
for short_code in vodafone_support:
    print(short_code, fetch_phone_information(short_code))

In [None]:
churn_true_vodafone_support = churn_true[churn_true.bnum_category == "vodafone_support"]

# Sum only the numeric columns per bnum; string columns (bnum_category, bnum_topic)
# are excluded instead of being concatenated by the pandas 2.x default.
churn_true_vodafone_support_agg = churn_true_vodafone_support.groupby("bnum").sum(numeric_only=True)

churn_true_vodafone_support_agg

### Vodafone ALL


In [None]:
# Churned rows whose category is one of the four vodafone_* categories.
churn_true_vodafone = churn_true[churn_true.vodafone]

# Sum only the numeric columns per bnum; string columns (bnum_category, bnum_topic)
# are excluded instead of being concatenated by the pandas 2.x default.
churn_true_vodafone_agg = churn_true_vodafone.groupby("bnum").sum(numeric_only=True)

churn_true_vodafone_agg

# INSIGHT: cnt_sms_in. Something wrong with 1020 and 277


In [None]:
# Share of incoming SMS per Vodafone bnum among churned rows, largest first.
df = (
    churn_true_vodafone_agg["cnt_sms_in"]
    .reset_index()
    .assign(share=lambda frame: frame["cnt_sms_in"] / frame["cnt_sms_in"].sum())
)

df.sort_values("share", ascending=False)

# INSIGHT: cnt_sms_out. Something wrong with 5010 and 277


In [None]:
# Share of outgoing SMS per Vodafone bnum among churned rows, largest first.
df = (
    churn_true_vodafone_agg["cnt_sms_out"]
    .reset_index()
    .assign(share=lambda frame: frame["cnt_sms_out"] / frame["cnt_sms_out"].sum())
)

df.sort_values("share", ascending=False)

### Compare DIFF between churn and bnum_category


In [None]:
def build_compare_share_dataframe(bnum_df, feature_name):
    """Compare the per-category share of ``feature_name`` between churned and non-churned rows.

    Shares are computed separately for ``target == 1`` and ``target == 0`` with
    ``build_feature_share_df`` and joined on ``bnum_category``. The join is an outer
    join: a category that occurs in only one of the two groups is kept and gets a
    share of 0 in the group where it is absent (an inner join silently dropped it).

    Args:
        bnum_df: Frame with ``target``, ``bnum_category`` and ``feature_name`` columns.
        feature_name: Column whose share is compared.

    Returns:
        A frame with columns ``bnum_category``, ``<feature_name>_share_CHURN``,
        ``<feature_name>_share_NOT_CHURN`` and ``diff`` (churn share minus
        non-churn share, in percentage points), sorted by ``diff`` descending.
    """
    churn_col = f"{feature_name}_share_CHURN"
    not_churn_col = f"{feature_name}_share_NOT_CHURN"

    churn_true = bnum_df[(bnum_df["target"] == 1)]
    churn_false = bnum_df[(bnum_df["target"] == 0)]

    churn_true_with_share = build_feature_share_df(churn_true, feature_name).drop(columns=[feature_name])
    churn_false_with_share = build_feature_share_df(churn_false, feature_name).drop(columns=[feature_name])

    df = churn_true_with_share.merge(
        churn_false_with_share,
        on="bnum_category",
        how="outer",
        suffixes=("_CHURN", "_NOT_CHURN"),
        indicator=True,
    )

    # Fill only the shares of categories that were missing from one side; NaN shares
    # produced by the share computation itself are left untouched.
    df.loc[df["_merge"] == "right_only", churn_col] = 0.0
    df.loc[df["_merge"] == "left_only", not_churn_col] = 0.0
    df = df.drop(columns="_merge")

    df["diff"] = df[churn_col] - df[not_churn_col]

    return df.sort_values("diff", ascending=False)

### INSIGHT "cnt_sms_in" action

#### NOT CHURN

1. not churn abonents have bigger amount of `other`
2. not churn abonents have bigger amount of: `bank_companies, taxy_companies, health_companies, delivery_companies, rescue_numbers, shops_companies, cyberpolice_companies, grocery_companies, bank_numbers, competitors_provider_companies, competitors_companies,
work_companies, food_delivery_companies, fraud_numbers`. Більше звʼязків з телефоном (вища активність) -> менша ймовірність відтоку такого абонента

#### CHURN

1. churn clients have much bigger `vodafone_services` . Це може бути повʼязано з надмірною рекламою
2. churn clients have much bigger `messengars_companies` . Це також може бути повʼязано з надмірною рекламою, але через месенджери
3. churn clients have bigger `vodafone_survey` (також може впливати на відплив)
4. churn clients have bigger `credits_companies`
5. churn clients have bigger `phone_companies`
6. churn clients have bigger `verify_numbers`


In [None]:
# Per-category share of cnt_sms_in for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "cnt_sms_in")

### INSIGHT "cnt_sms_out" action

#### NOT CHURN

1. not churn abonents have much bigger share of `taxy_companies` out sms
2. not churn abonents have bigger share of `competitors_numbers, fraud_numbers, bank_numbers` out sms
3. not churn abonents have bigger amount of `vodafone_survey`. Можливо клієнти що проходять survey більш лояльні
4. not churn abonents have bigger `vodafone_new_customer` out sms. Скоріш за все тільки прибувші клієнти

#### CHURN

1. churn clients have bigger `vodafone_services` out sms. Можливо це повʼязано з 5010 сервісом
2. churn clients have bigger `other` out sms
3. churn clients have bigger amount of `vodafone_support` out sms. Можливо абоненти стикаються з проблемами та йдуть
4. churn clients have bigger amount of `rescue_numbers`. Можливо щось трапляється


In [None]:
# Per-category share of cnt_sms_out for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "cnt_sms_out")

### INSIGHT "call_cnt_in" action

#### NOT CHURN

1. not churn clients have bigger amount of `competitors_numbers`. Можливо реклама від конкурентів може зменшити бажання клієнта перейти
2. not churn clients have bigger amount `fraud_numbers` of call_cnt_in. Можливо це повʼязано з тим що абонент багато де вспливає в шахрайських базах через те що номер старий, або активно його використовує
3.

#### CHURN

1. churn clients have `much` bigger amount `bank_numbers` call cnt in. Можливо люди яких шукає банк більш схильні змінювати номера телефонів


In [None]:
# Per-category share of call_cnt_in for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "call_cnt_in")

### INSIGHT "call_cnt_out" action

#### NOT CHURN

1. not churn clients have `much` bigger share of `other` category. Maybe they use it actively
2. not churn clients have bigger `competitors_numbers`. Можливо в цих юзерів вже є номери конкурентів
3. not churn clients have bigger `taxy_companies` . Активно використовують ці номера

#### CHURN

1. churn abonents have bigger `vodafone_new_customer` share. Можливо, коли нові клієнти часто отримують негативний досвід на початку, вони потім йдуть
2. churn abonents have bigger `fraud_numbers` share. Не знаю як пояснити
3. churn abonents have bigger `rescue_numbers` share. Можливо з клієнтами щось трапляється і вони перестають користуватись послугами
4. churn abonents have `much` bigger share of `vodafone_support` . Скоріш за все клієнти незадоволені сервісом, і саппорт не може їм з цим допомогти


In [None]:
# Per-category share of call_cnt_out for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "call_cnt_out")

### INSIGHT "call_dur_out" action

#### NOT CHURN

1. not churn clients have `much` higher amount of `other`. Клієнти активно користуються номером телефона
2. not churn clients have bigger amount of `competitors_numbers`. Можливо в них вже є інші номери



#### CHURN

1. churn clients have `much` bigger `vodafone_support`. Клієнти чимось незадоволені, довго не можуть вирішити свою проблему. Може час очікування
2. churn clients have `much` bigger `rescue_numbers`
3. churn clients have bigger amount of `fraud_numbers` Можливо клієнти що схильні до відтоку довше говорять з шахраями
4. churn clients have bigger amount of `vodafone_new_customer` . Нові клієнти стикаються з якимись труднощами
5. churn clients have bigger share of `vodafone_survey`. Можливо варто змінити кількість та тривалість survey дзвінків


In [None]:
# Per-category share of call_dur_out for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "call_dur_out")

### INSIGHT "call_dur_in" action

#### NOT CHURN

1. Not churn clients have bigger amount of `other`
2. Not churn clients have bigger amount of `competitors_numbers` Можливо вже є номери конкурентів
3. Not churn clients have bigger amount of `fraud_numbers` . Поки не можу сказати чому

#### CHURN

1. churn clients have `much` bigger amount of `bank_numbers` call dur in. Можливо клієнти, від яких щось хоче банк, частіше змінюють телефон


In [None]:
# Per-category share of call_dur_in for churned vs. non-churned rows, sorted by the
# difference (churn share minus non-churn share).
build_compare_share_dataframe(df_train_bnum, "call_dur_in")

In [None]:
group0 = churn_false["bnum_category"]
group1 = churn_true["bnum_category"]

# Overlay the category distribution of both churn groups. stat="density" normalises
# each group separately, so the two are comparable despite different row counts.
plt.figure(figsize=(10, 6))

sns.histplot(group0, label="Target 0", color="blue", stat="density", alpha=0.5, fill=True)
sns.histplot(group1, label="Target 1", color="red", stat="density", alpha=0.5, fill=False)

# The axis labels name what is actually plotted: categories on x, density on y.
plt.xlabel("bnum_category")
plt.ylabel("Density")
plt.xticks(rotation=90)
plt.legend()

plt.show()

In [None]:
def compare_boolean_property(dataframe, feature_name, only_true=False):
    """Plot and return the value distribution of a feature for each target group.

    Args:
        dataframe: Frame containing a ``target`` column (rows with 0 and 1 are
            compared) and the ``feature_name`` column.
        feature_name: Column whose normalised value counts are compared.
        only_true: When True, only the row whose index value equals ``True`` is plotted.

    Returns:
        DataFrame with ``Churn_False`` / ``Churn_True`` proportion columns, indexed by
        the feature values observed in the ``target == 0`` rows. Values that occur only
        in the ``target == 1`` rows are not included, and values missing there are NaN.
        The full frame is returned even when ``only_true`` limits the plot.
    """
    bar_width = 0.4

    churn_false_metrics = dataframe[dataframe["target"] == 0][feature_name].value_counts(normalize=True)
    churn_true_metrics = dataframe[dataframe["target"] == 1][feature_name].value_counts(normalize=True)

    df = pd.DataFrame(
        {"Churn_False": churn_false_metrics, "Churn_True": churn_true_metrics}, index=churn_false_metrics.index
    )

    plt_df = df

    if only_true:
        plt_df = df[df.index == True]

    x = range(len(plt_df.index))

    plt.bar(x, plt_df["Churn_False"], width=bar_width, label="Churn False", align="center")
    plt.bar([p + bar_width for p in x], plt_df["Churn_True"], width=bar_width, label="Churn True", align="center")

    plt.xlabel(feature_name)
    plt.ylabel("Proportion")
    plt.title(f"Comparison of {feature_name} Proportions in Each Group")
    # Ticks sit between each pair of bars and are labelled with the feature value;
    # passing the index values as tick *positions* would put them at 0/1 instead.
    plt.xticks([p + bar_width / 2 for p in x], [str(value) for value in plt_df.index])
    plt.legend()
    plt.show()

    return df

In [None]:
# Compare the proportion of rows flagged `vodafone` between the two target groups (True value only).
compare_boolean_property(df_train_bnum, "vodafone", only_true=True)

In [None]:
# Compare the proportion of rows flagged `casual` between the two target groups (True value only).
compare_boolean_property(df_train_bnum, "casual", only_true=True)

## Build features


In [126]:
# Topic names (the keys of `bnum_topics`, defined outside this section); passed as the
# `categories` argument by the `*_bnum_topic_metrics` functions in this cell.
bnum_topics_list = list(bnum_topics.keys())


def calculate_bnum_feature_sum_metrics(group, feature_name, categories, group_feature_name):
    """Sum a feature per label and express each sum as a percentage of the total.

    Args:
        group: DataFrame holding ``feature_name`` and ``group_feature_name`` columns.
        feature_name: Column that is summed.
        categories: Labels of ``group_feature_name`` to report; labels absent from
            ``group`` are reported as 0, labels outside this list are ignored.
        group_feature_name: Column whose values are the labels.

    Returns:
        Dict with ``"<label>_<feature_name>_sum"`` entries followed by
        ``"<label>_<feature_name>_sum_share"`` entries (share in percent of the
        summed total over ``categories``; 0 when that total is not positive).
    """
    per_label = group.groupby(group_feature_name).agg({feature_name: "sum"}).reset_index()
    per_label = per_label[per_label[group_feature_name].isin(categories)]

    # Left-merge onto the requested labels so each one gets a row, in the given order.
    template = pd.DataFrame({group_feature_name: categories})
    totals = template.merge(per_label, on=group_feature_name, how="left").fillna(0)

    grand_total = totals[feature_name].sum()
    totals["share"] = totals[feature_name] / grand_total * 100 if grand_total > 0 else 0

    by_column = totals.set_index(group_feature_name).to_dict()

    metrics = {f"{label}_{feature_name}_sum": value for label, value in by_column[feature_name].items()}
    metrics.update(
        {f"{label}_{feature_name}_sum_share": value for label, value in by_column["share"].items()}
    )
    return metrics


def calculate_bnum_feature_count_metrics(group, feature_name, categories, group_feature_name):
    """Count the non-null values of a feature per label.

    Args:
        group: DataFrame holding ``feature_name`` and ``group_feature_name`` columns.
        feature_name: Column whose values are counted (pandas ``count`` aggregation).
        categories: Labels of ``group_feature_name`` to report; labels absent from
            ``group`` are reported as 0, labels outside this list are ignored.
        group_feature_name: Column whose values are the labels.

    Returns:
        Dict mapping ``"<label>_<feature_name>_count"`` to the count, in the order
        of ``categories``.
    """
    observed = group.groupby(group_feature_name).agg({feature_name: "count"}).reset_index()
    observed = observed[observed[group_feature_name].isin(categories)]

    # Left-merge onto the requested labels so each one gets a row, in the given order.
    template = pd.DataFrame({group_feature_name: categories})
    counts_by_label = (
        template.merge(observed, on=group_feature_name, how="left")
        .fillna(0)
        .set_index(group_feature_name)
        .to_dict()[feature_name]
    )

    return {f"{label}_{feature_name}_count": amount for label, amount in counts_by_label.items()}


def _bnum_sum_and_count_metrics(group, feature_name, categories, group_feature_name):
    """Combine the sum/share metrics and the count metrics of one activity column.

    Shared body of every ``*_bnum_category_metrics`` / ``*_bnum_topic_metrics``
    function below; the sum/share entries come first, then the count entries.
    """
    return {
        **calculate_bnum_feature_sum_metrics(
            group,
            feature_name,
            categories=categories,
            group_feature_name=group_feature_name,
        ),
        **calculate_bnum_feature_count_metrics(
            group,
            feature_name,
            categories=categories,
            group_feature_name=group_feature_name,
        ),
    }


def call_cnt_out_bnum_category_metrics(group):
    """Sum, share and count metrics of ``call_cnt_out`` per selected ``bnum_category``."""
    categories = [
        "other",
        "competitors_numbers",
        "taxy_companies",
        "vodafone_new_customer",
        "fraud_numbers",
        "rescue_numbers",
        "vodafone_support",
    ]

    return _bnum_sum_and_count_metrics(group, "call_cnt_out", categories, "bnum_category")


def call_cnt_in_bnum_category_metrics(group):
    """Sum, share and count metrics of ``call_cnt_in`` per selected ``bnum_category``."""
    categories = [
        "competitors_numbers",
        "fraud_numbers",
        "bank_numbers",
    ]

    return _bnum_sum_and_count_metrics(group, "call_cnt_in", categories, "bnum_category")


def call_dur_out_bnum_category_metrics(group):
    """Sum, share and count metrics of ``call_dur_out`` per selected ``bnum_category``."""
    categories = [
        "other",
        "competitors_numbers",
        "vodafone_support",
        "rescue_numbers",
        "fraud_numbers",
        "vodafone_new_customer",
        "vodafone_survey",
    ]

    return _bnum_sum_and_count_metrics(group, "call_dur_out", categories, "bnum_category")


def call_dur_in_bnum_category_metrics(group):
    """Sum, share and count metrics of ``call_dur_in`` per selected ``bnum_category``."""
    categories = [
        "other",
        "competitors_numbers",
        "fraud_numbers",
        "bank_numbers",
    ]

    return _bnum_sum_and_count_metrics(group, "call_dur_in", categories, "bnum_category")


def cnt_sms_out_bnum_category_metrics(group):
    """Sum, share and count metrics of ``cnt_sms_out`` per selected ``bnum_category``."""
    categories = [
        "taxy_companies",
        "competitors_numbers",
        "fraud_numbers",
        "bank_numbers",
        "vodafone_survey",
        "vodafone_new_customer",
        "vodafone_services",
        "other",
        "vodafone_support",
        "rescue_numbers",
    ]

    return _bnum_sum_and_count_metrics(group, "cnt_sms_out", categories, "bnum_category")


def cnt_sms_in_bnum_category_metrics(group):
    """Sum, share and count metrics of ``cnt_sms_in`` per selected ``bnum_category``."""
    categories = [
        "other",
        "bank_companies",
        "vodafone_services",
        "messengars_companies",
        "vodafone_survey",
        "credits_companies",
        "phone_companies",
        "verify_numbers",
    ]

    return _bnum_sum_and_count_metrics(group, "cnt_sms_in", categories, "bnum_category")


def call_cnt_out_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``call_cnt_out`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "call_cnt_out", bnum_topics_list, "bnum_topic")


def call_cnt_in_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``call_cnt_in`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "call_cnt_in", bnum_topics_list, "bnum_topic")


def call_dur_out_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``call_dur_out`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "call_dur_out", bnum_topics_list, "bnum_topic")


def call_dur_in_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``call_dur_in`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "call_dur_in", bnum_topics_list, "bnum_topic")


def cnt_sms_out_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``cnt_sms_out`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "cnt_sms_out", bnum_topics_list, "bnum_topic")


def cnt_sms_in_bnum_topic_metrics(group):
    """Sum, share and count metrics of ``cnt_sms_in`` per ``bnum_topic`` in ``bnum_topics_list``."""
    return _bnum_sum_and_count_metrics(group, "cnt_sms_in", bnum_topics_list, "bnum_topic")


def bnum_category_metrics(group):
    """Share and row count of selected ``bnum_category`` values within ``group``.

    Args:
        group: DataFrame with a ``bnum_category`` column.

    Returns:
        Dict with, for every tracked category, ``"<category>_category_act_share"``
        (normalised value count, 0 when the category is absent) followed by
        ``"<category>_category_act_count"`` (number of matching rows).
    """
    tracked = (
        "credits_companies",
        "messengars_companies",
        "vodafone_support",
        "vodafone_survey",
        "vodafone_services",
        "bank_numbers",
        "phone_companies",
    )

    category_column = group["bnum_category"]
    shares = category_column.value_counts(normalize=True)

    metrics = {}
    for name in tracked:
        metrics.update(
            {
                f"{name}_category_act_share": shares.get(name, 0),
                f"{name}_category_act_count": len(group[category_column == name]),
            }
        )

    return metrics


def bnum_topic_metrics(group):
    """Share and row count of every topic in ``bnum_topics`` within ``group``.

    Args:
        group: DataFrame with a ``bnum_topic`` column.

    Returns:
        Dict with, for each name yielded by iterating the module-level ``bnum_topics``,
        ``"<topic>_topic_act_share"`` (normalised value count, 0 when absent) followed
        by ``"<topic>_topic_act_count"`` (number of matching rows).
    """
    topic_column = group["bnum_topic"]
    shares = topic_column.value_counts(normalize=True)

    metrics = {}
    for topic in bnum_topics:
        metrics.update(
            {
                f"{topic}_topic_act_share": shares.get(topic, 0),
                f"{topic}_topic_act_count": len(group[topic_column == topic]),
            }
        )

    return metrics


def high_level_spicific_metrics(group):
    """Share and row count of the ``vodafone`` and ``casual`` flags within ``group``.

    Args:
        group: DataFrame whose ``vodafone`` and ``casual`` columns can be used
            directly as boolean row masks.

    Returns:
        Dict with ``"<flag>_act_share"`` (normalised count of ``True``, 0 when
        ``True`` never occurs) and ``"<flag>_act_count"`` (number of rows selected
        by the flag) for ``vodafone`` and then ``casual``.
    """
    metrics = {}

    for flag in ("vodafone", "casual"):
        flag_column = group[flag]
        metrics[f"{flag}_act_share"] = flag_column.value_counts(normalize=True).get(True, 0)
        metrics[f"{flag}_act_count"] = len(group[flag_column])

    return metrics


def build_user_bnum_metrics_group(group):
    """Assemble every bnum metric of one group into a single Series.

    Args:
        group: DataFrame for one group; ``abon_id`` is read from its first row.

    Returns:
        ``pd.Series`` that starts with ``abon_id`` and then holds the entries of
        each metric builder in the order listed below (a later builder overwrites
        an earlier one if they ever produce the same key).
    """
    builders = (
        high_level_spicific_metrics,
        bnum_topic_metrics,
        bnum_category_metrics,
        # Bnum topic metrics
        call_cnt_out_bnum_topic_metrics,
        call_cnt_in_bnum_topic_metrics,
        call_dur_out_bnum_topic_metrics,
        call_dur_in_bnum_topic_metrics,
        cnt_sms_out_bnum_topic_metrics,
        cnt_sms_in_bnum_topic_metrics,
        # Bnum category metrics
        call_cnt_out_bnum_category_metrics,
        call_cnt_in_bnum_category_metrics,
        call_dur_out_bnum_category_metrics,
        call_dur_in_bnum_category_metrics,
        cnt_sms_out_bnum_category_metrics,
        cnt_sms_in_bnum_category_metrics,
    )

    metrics = {"abon_id": group.iloc[0].abon_id}
    for build in builders:
        metrics.update(build(group))

    return pd.Series(metrics)

In [127]:
import pandas as pd

# Load data
data_loader = LoadData()
# NOTE: only the first 1000 rows are used here, so the resulting features cover a sample of the data.
df_train_bnum = data_loader.df_train_bnum.copy().head(1000)

# Build the abon_id -> target lookup once (without in-place mutation) and reuse it for both frames below.
abon_target = data_loader.df_train_fe[["target", "abon_id"]].set_index("abon_id")
target_by_abon = abon_target["target"]

# Assign target
df_train_bnum["target"] = df_train_bnum["abon_id"].map(target_by_abon)
df_train_bnum["abon_id"] = df_train_bnum["abon_id"].astype(int)

# Progress counters; nothing in this cell reads them — presumably used by a progress
# printout defined elsewhere in the notebook (verify before removing).
task = 0
task_total = len(df_train_bnum.abon_id.unique())

# Assign bnum category
df_train_bnum = assign_bnum_category(df_train_bnum, bnum_categories_dict)

# Build one row of metrics per abon_id
df_new = df_train_bnum.groupby("abon_id").apply(build_user_bnum_metrics_group).reset_index(drop=True)

# Format abon_id
df_new["abon_id"] = df_new["abon_id"].astype(int)

# Assign target
df_new["target"] = df_new["abon_id"].map(target_by_abon)

df_new

Progress: 0.54%
Progress: 1.08%
Progress: 1.62%
Progress: 2.16%
Progress: 2.70%
Progress: 3.24%
Progress: 3.78%
Progress: 4.32%
Progress: 4.86%
Progress: 5.41%
Progress: 5.95%
Progress: 6.49%
Progress: 7.03%
Progress: 7.57%
Progress: 8.11%
Progress: 8.65%
Progress: 9.19%
Progress: 9.73%
Progress: 10.27%
Progress: 10.81%
Progress: 11.35%
Progress: 11.89%
Progress: 12.43%
Progress: 12.97%
Progress: 13.51%
Progress: 14.05%
Progress: 14.59%
Progress: 15.14%
Progress: 15.68%
Progress: 16.22%
Progress: 16.76%
Progress: 17.30%
Progress: 17.84%
Progress: 18.38%
Progress: 18.92%
Progress: 19.46%
Progress: 20.00%
Progress: 20.54%
Progress: 21.08%
Progress: 21.62%
Progress: 22.16%
Progress: 22.70%
Progress: 23.24%
Progress: 23.78%
Progress: 24.32%
Progress: 24.86%
Progress: 25.41%
Progress: 25.95%
Progress: 26.49%
Progress: 27.03%
Progress: 27.57%
Progress: 28.11%
Progress: 28.65%
Progress: 29.19%
Progress: 29.73%
Progress: 30.27%
Progress: 30.81%
Progress: 31.35%
Progress: 31.89%
Progress: 32.43

  df_new = df_train_bnum.groupby("abon_id").apply(build_user_bnum_metrics_group).reset_index(drop=True)


Unnamed: 0,abon_id,vodafone_act_share,vodafone_act_count,casual_act_share,casual_act_count,finance_topic_topic_act_share,finance_topic_topic_act_count,vodafone_topic_topic_act_share,vodafone_topic_topic_act_count,casual_topic_topic_act_share,...,vodafone_survey_cnt_sms_in_sum_share,bank_companies_cnt_sms_in_count,credits_companies_cnt_sms_in_count,messengars_companies_cnt_sms_in_count,other_cnt_sms_in_count,phone_companies_cnt_sms_in_count,verify_numbers_cnt_sms_in_count,vodafone_services_cnt_sms_in_count,vodafone_survey_cnt_sms_in_count,target
0,1545052,0.0,0.0,0.500000,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1545235,0.0,0.0,0.666667,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,1549591,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1558772,0.0,0.0,0.333333,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1558921,0.0,0.0,0.000000,0.0,0.333333,1.0,0.0,0.0,0.666667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,1701250,0.0,0.0,0.200000,1.0,0.400000,2.0,0.0,0.0,0.400000,...,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
181,1701558,0.0,0.0,0.500000,2.0,0.250000,1.0,0.0,0.0,0.750000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
182,1702032,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,1702599,0.0,0.0,0.800000,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [129]:
# Display the `df_new_refactored` frame; it is not assigned in the visible cells —
# presumably the output of the refactored feature builder (verify it is defined on a fresh kernel).
df_new_refactored

Unnamed: 0,abon_id,vodafone_act_share,vodafone_act_count,casual_act_share,casual_act_count,finance_topic_topic_act_share,finance_topic_topic_act_count,vodafone_topic_topic_act_share,vodafone_topic_topic_act_count,casual_topic_topic_act_share,...,vodafone_survey_cnt_sms_in_sum_share,bank_companies_cnt_sms_in_count,credits_companies_cnt_sms_in_count,messengars_companies_cnt_sms_in_count,other_cnt_sms_in_count,phone_companies_cnt_sms_in_count,verify_numbers_cnt_sms_in_count,vodafone_services_cnt_sms_in_count,vodafone_survey_cnt_sms_in_count,target
0,1545052,0.0,0.0,0.500000,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1545235,0.0,0.0,0.666667,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,1549591,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1558772,0.0,0.0,0.333333,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1558921,0.0,0.0,0.000000,0.0,0.333333,1.0,0.0,0.0,0.666667,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,1701250,0.0,0.0,0.200000,1.0,0.400000,2.0,0.0,0.0,0.400000,...,0.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
181,1701558,0.0,0.0,0.500000,2.0,0.250000,1.0,0.0,0.0,0.750000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
182,1702032,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,1702599,0.0,0.0,0.800000,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [130]:
# Display the frame built by the original implementation, for comparison with `df_new_refactored`.
df_new_origin

Unnamed: 0,abon_id,vodafone_act_share,vodafone_act_count,casual_act_share,casual_act_count,finance_topic_topic_act_share,finance_topic_topic_act_count,vodafone_topic_topic_act_share,vodafone_topic_topic_act_count,casual_topic_topic_act_share,...,verify_numbers_cnt_sms_in_sum_share,other_cnt_sms_in_count,bank_companies_cnt_sms_in_count,vodafone_services_cnt_sms_in_count,messengars_companies_cnt_sms_in_count,vodafone_survey_cnt_sms_in_count,credits_companies_cnt_sms_in_count,phone_companies_cnt_sms_in_count,verify_numbers_cnt_sms_in_count,target
0,1545052,0.0,0.0,0.500000,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1545235,0.0,0.0,0.666667,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1549591,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1558772,0.0,0.0,0.333333,1.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1558921,0.0,0.0,0.000000,0.0,0.333333,1.0,0.0,0.0,0.666667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,1701250,0.0,0.0,0.200000,1.0,0.400000,2.0,0.0,0.0,0.400000,...,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
181,1701558,0.0,0.0,0.500000,2.0,0.250000,1.0,0.0,0.0,0.750000,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
182,1702032,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
183,1702599,0.0,0.0,0.800000,4.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [195]:
# Sorted union of the column names: a bare `set` has no stable order, so the column
# layout of the comparison would otherwise change from one kernel run to the next.
columns = sorted(set(df_new_origin.columns) | set(df_new_refactored.columns))

# `compare` lists only the cells that are not exactly equal between the two frames.
df_new_origin[columns].compare(df_new_refactored[columns])

Unnamed: 0_level_0,vodafone_services_cnt_sms_in_sum_share,vodafone_services_cnt_sms_in_sum_share,bank_companies_cnt_sms_in_sum_share,bank_companies_cnt_sms_in_sum_share,credits_companies_cnt_sms_in_sum_share,credits_companies_cnt_sms_in_sum_share,other_cnt_sms_in_sum_share,other_cnt_sms_in_sum_share,casual_topic_cnt_sms_in_sum_share,casual_topic_cnt_sms_in_sum_share,...,phone_companies_cnt_sms_in_sum_share,phone_companies_cnt_sms_in_sum_share,vodafone_survey_cnt_sms_in_sum_share,vodafone_survey_cnt_sms_in_sum_share,messengars_topic_cnt_sms_in_sum_share,messengars_topic_cnt_sms_in_sum_share,vodafone_topic_cnt_sms_in_sum_share,vodafone_topic_cnt_sms_in_sum_share,finance_topic_cnt_sms_in_sum_share,finance_topic_cnt_sms_in_sum_share
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,...,self,other,self,other,self,other,self,other,self,other
12,,,,,,,,,,,...,,,,,10.043106,10.043106,32.471137,32.471137,32.471137,32.471137
15,,,13.39707,13.39707,,,,,,,...,,,,,,,,,,
20,,,,,,,,,,,...,,,,,,,,,5.2948,5.2948
40,,,25.208694,25.208694,,,72.909655,72.909655,,,...,,,,,,,,,,
78,,,,,,,,,,,...,,,,,,,13.174663,13.174663,,
108,,,,,,,,,,,...,,,,,,,,,68.500713,68.500713
110,,,,,,,,,58.181037,58.181037,...,,,,,,,,,,
150,,,24.238371,24.238371,21.227573,21.227573,54.534055,54.534055,,,...,,,,,,,,,,
157,17.643362,17.643362,10.845362,10.845362,,,40.713965,40.713965,,,...,,,23.522087,23.522087,,,,,,
170,8.989384,8.989384,44.736845,44.736845,,,20.516074,20.516074,34.355024,34.355024,...,4.711928,4.711928,,,18.476823,18.476823,7.892098,7.892098,39.276055,39.276055


In [196]:
# Spot-check one cell flagged by `compare` (row 157): the two printed values
# differ only in the last floating-point digit.
print(df_new_origin.iloc[157].vodafone_services_cnt_sms_in_sum_share)
print(df_new_refactored.iloc[157].vodafone_services_cnt_sms_in_sum_share)

17.643361968827094
17.643361968827097
