# **TELCO CHURN PREDICTION**

## **A. Packages used in this project**

In [6]:
# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## **B. DATASET**

In [69]:
# load dataset
# Dataset locations are read from environment variables, which load_dotenv()
# populates from a local .env file when one is present.
from dotenv import load_dotenv
import os
load_dotenv()

# Resolve both paths up front and fail with a clear message if one is missing,
# instead of handing None to pd.read_excel and getting an obscure error.
churn_dataset = os.getenv("churn_dataset")
lis_dataset = os.getenv("lis_dataset_1")
missing_env_vars = [
    var_name
    for var_name, var_value in (("churn_dataset", churn_dataset), ("lis_dataset_1", lis_dataset))
    if not var_value
]
if missing_env_vars:
    raise ValueError(
        "Missing environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the environment or in a .env file before running this cell."
    )

# churn_dataset
# raw_churn keeps the sheet exactly as loaded; the pre-processing cells derive
# df_churn from it, so they can be re-run without reloading the file.
raw_churn = pd.read_excel(churn_dataset, sheet_name="Sheet1")
df_churn = raw_churn.copy()

# LIS_dataset
df_lis = pd.read_excel(lis_dataset, sheet_name="lis_telco")


## **C. DATA PRE-PROCESSING**

### **C.1 CHURN DATASET**

#### **C.1.1 DATA FORMATTING**

* Pada dataset raw_churn, kolom TGL_PSB dan TGL_PS masih bertipe numerik, sehingga perlu ditransformasi ke dalam tipe date.

In [59]:
# Date handling
# Parse the TGL_PSB and TGL_PS columns as datetimes using the YYYYMMDD layout.
# The conversion is applied to a copy, so raw_churn itself is not modified.
df_churn = raw_churn.copy()
for date_col in ('TGL_PSB', 'TGL_PS'):
    df_churn[date_col] = pd.to_datetime(df_churn[date_col], format='%Y%m%d')

#### **C.1.2 DATA CLASSIFICATION**

* Mapping kategori Length of Stay pada df_churn['LENGTH OF STAY CAT'] adalah sebagai berikut:
  - < 6 Bulan = 1-6 months
  - 6 - 12 Bulan = 7-12 months 
  - 1-2 Tahun = 13-24 months
  - 2-3 Tahun = 25-36 months
  - 3-4 Tahun = 37-48 months
  - 4-5 Tahun = 49-60 months
  - &gt; 5 tahun = > 60 months


In [60]:
# Goal: give every row with non-null TGL_PSB and TGL_PS a "LENGTH OF STAY CAT" value.

# MONTH_DIFF = calendar months from TGL_PSB to TGL_PS.
# Only the year and month parts are compared; the day of month is ignored.
years_apart = df_churn['TGL_PS'].dt.year - df_churn['TGL_PSB'].dt.year
months_apart = df_churn['TGL_PS'].dt.month - df_churn['TGL_PSB'].dt.month
df_churn['MONTH_DIFF'] = years_apart * 12 + months_apart

# The mapping itself is done by map_length_of_stay, defined in the
# python_function folder (module length_of_stay_map).
import sys
sys.path.append('../python_function')
from length_of_stay_map import map_length_of_stay

df_churn['LENGTH OF STAY CAT'] = map_length_of_stay(df_churn['MONTH_DIFF'])

# Sanity check: list the dates of any row labelled 'Unidentified'
# (presumably the mapper's fallback label - confirm in length_of_stay_map).
is_unidentified = df_churn['LENGTH OF STAY CAT'].eq('Unidentified')
print(df_churn.loc[is_unidentified, ['TGL_PSB', 'TGL_PS']])

Empty DataFrame
Columns: [TGL_PSB, TGL_PS]
Index: []


### **C.2 LIS DATASET**

#### **C.2.1 DATA CLEANING**

Pertama, kita akan membersihkan dataset dengan langkah-langkah berikut:
  - membuang kolom yang tidak penting
  

In [None]:
# Columns of the LIS sheet that are dropped as unnecessary.
lis_columns_to_drop = [
    'KW_IH', 'CITEM', 'PRODTYPE', 'IS_INDIHOME', 'CEK_WICO', 'PERIOD', 'PERIOD_PS',
    'UNIT2', 'CEK_LGEST', 'INET_BASIC', 'LGEST', 'PACK_NAME', 'PLBLCL', 'Rev (Rp M)',
    'TREMS_REV_P', 'ADDON', 'ALAMAT', 'AMBANG2', 'ASSET_ROWID', 'BA_ROWID', 'BLN_TGK',
    'CA_ROWID', 'Calculation2', 'CEK_CGEST', 'CEK_2P', 'CEK_3P', 'CEK_HSIE', 'CEK_HSSP',
    'CEK_HSSP (copy)', 'CEK_LOY', 'CEK_P_HSI', 'CEK_TO', 'CGEST', 'CITEM_EXT',
    'EKOSISTEM', 'EMAIL', 'F_PRIORITY', 'F_PRIORITY_DESC', 'GROUP_HSIE', 'GROUP_PRO_HW',
    'GRP_PRODUK', 'HARGA_HSI', 'HARGA_ONT', 'HSIE RECOM', 'HSSP RECOM', 'IHOME_INV',
    'INET_OTHERS', 'IPSTATIC', 'JKW1', 'JKW2', 'JKW3', 'JKW4', 'KET_MIGRASI', 'KET_RESL',
    'KET_SELLER', 'KW_IH2', 'KW_POTS', 'KW_UM_TXT', 'KW_UMUR', 'L_AMBANG', 'L_KW_UMUR',
    'LINECATS_ITEM_ID', 'LINKAR', 'LOY_PROGRAM', 'NAMA', 'NAMA_AM', 'ND_REFERENCE',
    'NDOS', 'NIK_AM', 'NIPNAS', 'NO_HP', 'P_DIGITAL', 'PKW1', 'PKW2', 'PKW3', 'PKW4',
    'PRIO_MIG', 'PRIORITAS_MIG', 'PRIORITY', 'PRODUK2', 'REALM', 'RECOM_DIGI',
    'REV (Jt)', 'REV INET(M)', 'REV VOICE (M)', 'REV_TXT', 'REV(M)',
    'REVENUE_ADDON_INET', 'REVENUE_ADDON_TV', 'REVENUE_INET', 'REVENUE_TV', 'RID',
    'ROOT_ASSET_ID', 'SELISIH', 'SELISIH2', 'SPEED', 'TAG_OTHER', 'TAHUN_PS', 'TECHNO',
    'titik', 'TOT_TGK', 'TREMS_REV', 'TREMS_REV_REF', 'UNIT (copy)', 'UMUR PLG (Th)',
]

# Work on a copy of df_lis so the loaded frame stays intact; drop() returns a new
# frame and raises KeyError if any listed column is absent.
raw_lis = df_lis.copy().drop(columns=lis_columns_to_drop)
