In [113]:
import sys
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install seaborn

In [114]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

# show multiple outputs in jupyter cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [145]:
# Importing Pandas and NumPy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

In [116]:
df = pd.read_csv("telecom_churn_data.csv")
df.shape

(99999, 226)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Columns: 226 entries, mobile_number to sep_vbc_3g
dtypes: float64(179), int64(35), object(12)
memory usage: 172.4+ MB


In [118]:
df.columns

Index(['mobile_number', 'circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou',
       'loc_ic_t2o_mou', 'last_date_of_month_6', 'last_date_of_month_7',
       'last_date_of_month_8', 'last_date_of_month_9', 'arpu_6',
       ...
       'sachet_3g_9', 'fb_user_6', 'fb_user_7', 'fb_user_8', 'fb_user_9',
       'aon', 'aug_vbc_3g', 'jul_vbc_3g', 'jun_vbc_3g', 'sep_vbc_3g'],
      dtype='object', length=226)

In [119]:
df["total_rech_good_phase"] = (df.total_rech_num_6 + df.total_rech_num_7)/2
high_value_cutoff = np.percentile(df["total_rech_good_phase"], 70)
high_value_cutoff

8.5

In [120]:
df = df.loc[df.total_rech_good_phase >= high_value_cutoff]
df.shape

(30658, 227)

In [121]:
df['churn'] = df.total_ic_mou_9 + df.total_og_mou_9 + df.vol_2g_mb_9 + df.vol_3g_mb_9
df.churn = df.churn.apply(lambda x: 1 if x==0 else 0)
df.churn.value_counts()

0    27720
1     2938
Name: churn, dtype: int64

In [122]:
cols = [c for c in df.columns if c[-2:] == '_9']
df.drop(axis=1, columns =cols, inplace=True)
cols

['last_date_of_month_9',
 'arpu_9',
 'onnet_mou_9',
 'offnet_mou_9',
 'roam_ic_mou_9',
 'roam_og_mou_9',
 'loc_og_t2t_mou_9',
 'loc_og_t2m_mou_9',
 'loc_og_t2f_mou_9',
 'loc_og_t2c_mou_9',
 'loc_og_mou_9',
 'std_og_t2t_mou_9',
 'std_og_t2m_mou_9',
 'std_og_t2f_mou_9',
 'std_og_t2c_mou_9',
 'std_og_mou_9',
 'isd_og_mou_9',
 'spl_og_mou_9',
 'og_others_9',
 'total_og_mou_9',
 'loc_ic_t2t_mou_9',
 'loc_ic_t2m_mou_9',
 'loc_ic_t2f_mou_9',
 'loc_ic_mou_9',
 'std_ic_t2t_mou_9',
 'std_ic_t2m_mou_9',
 'std_ic_t2f_mou_9',
 'std_ic_t2o_mou_9',
 'std_ic_mou_9',
 'total_ic_mou_9',
 'spl_ic_mou_9',
 'isd_ic_mou_9',
 'ic_others_9',
 'total_rech_num_9',
 'total_rech_amt_9',
 'max_rech_amt_9',
 'date_of_last_rech_9',
 'last_day_rch_amt_9',
 'date_of_last_rech_data_9',
 'total_rech_data_9',
 'max_rech_data_9',
 'count_rech_2g_9',
 'count_rech_3g_9',
 'av_rech_amt_data_9',
 'vol_2g_mb_9',
 'vol_3g_mb_9',
 'arpu_3g_9',
 'arpu_2g_9',
 'night_pck_user_9',
 'monthly_2g_9',
 'sachet_2g_9',
 'monthly_3g_9',
 

In [123]:
df.shape

(30658, 174)

In [124]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
df.isnull().sum()
pd.reset_option('all')

mobile_number                   0
circle_id                       0
loc_og_t2o_mou                 12
std_og_t2o_mou                 12
loc_ic_t2o_mou                 12
last_date_of_month_6            0
last_date_of_month_7           22
last_date_of_month_8          139
arpu_6                          0
arpu_7                          0
arpu_8                          0
onnet_mou_6                   214
onnet_mou_7                   250
onnet_mou_8                  1052
offnet_mou_6                  214
offnet_mou_7                  250
offnet_mou_8                 1052
roam_ic_mou_6                 214
roam_ic_mou_7                 250
roam_ic_mou_8                1052
roam_og_mou_6                 214
roam_og_mou_7                 250
roam_og_mou_8                1052
loc_og_t2t_mou_6              214
loc_og_t2t_mou_7              250
loc_og_t2t_mou_8             1052
loc_og_t2m_mou_6              214
loc_og_t2m_mou_7              250
loc_og_t2m_mou_8             1052
loc_og_t2f_mou


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [134]:
pd.set_option("display.max_rows", None, "display.max_columns", None)
df.isnull().sum()
pd.reset_option('all')

mobile_number                   0
arpu_6                          0
arpu_7                          0
arpu_8                          0
onnet_mou_6                     0
onnet_mou_7                     0
onnet_mou_8                     0
offnet_mou_6                    0
offnet_mou_7                    0
offnet_mou_8                    0
roam_ic_mou_6                   0
roam_ic_mou_7                   0
roam_ic_mou_8                   0
roam_og_mou_6                   0
roam_og_mou_7                   0
roam_og_mou_8                   0
loc_og_t2t_mou_6                0
loc_og_t2t_mou_7                0
loc_og_t2t_mou_8                0
loc_og_t2m_mou_6                0
loc_og_t2m_mou_7                0
loc_og_t2m_mou_8                0
loc_og_t2f_mou_6                0
loc_og_t2f_mou_7                0
loc_og_t2f_mou_8                0
loc_og_t2c_mou_6                0
loc_og_t2c_mou_7                0
loc_og_t2c_mou_8                0
loc_og_mou_6                    0
loc_og_mou_7  


: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [126]:
df.circle_id.unique()

df.loc_og_t2o_mou.unique()
df.std_og_t2o_mou.unique()
df.loc_ic_t2o_mou.unique()

df.last_date_of_month_6.unique()
df.last_date_of_month_7.unique()
df.last_date_of_month_8.unique()

df.std_og_t2c_mou_6.unique()
df.std_og_t2c_mou_7.unique()
df.std_og_t2c_mou_8.unique()

df.std_ic_t2o_mou_6.unique()
df.std_ic_t2o_mou_7.unique()
df.std_ic_t2o_mou_8.unique()

df.drop(axis=1, columns =['circle_id', 'loc_og_t2o_mou', 'std_og_t2o_mou', 'loc_ic_t2o_mou',
                         'last_date_of_month_6', 'last_date_of_month_7', 'last_date_of_month_8',
                         'std_og_t2c_mou_6', 'std_og_t2c_mou_7', 'std_og_t2c_mou_8'], inplace=True)

array([109], dtype=int64)

array([ 0., nan])

array([ 0., nan])

array([ 0., nan])

array(['6/30/2014'], dtype=object)

array(['7/31/2014', nan], dtype=object)

array(['8/31/2014', nan], dtype=object)

array([ 0., nan])

array([ 0., nan])

array([ 0., nan])

array([ 0., nan])

array([ 0., nan])

array([ 0., nan])

In [135]:
df.shape

(29350, 164)

In [133]:
df = df.loc[~df.onnet_mou_6.isnull()]
df = df.loc[~df.onnet_mou_7.isnull()]
df = df.loc[~df.onnet_mou_8.isnull()]

In [136]:
df.date_of_last_rech_8.describe()

count         29223
unique           31
top       8/31/2014
freq           8085
Name: date_of_last_rech_8, dtype: object

In [142]:
df['date_of_last_rech_6'] =  pd.to_datetime(df['date_of_last_rech_6'], format='%m/%d/%Y')
df['date_of_last_rech_7'] =  pd.to_datetime(df['date_of_last_rech_7'], format='%m/%d/%Y')
df['date_of_last_rech_8'] =  pd.to_datetime(df['date_of_last_rech_8'], format='%m/%d/%Y')

In [143]:
# df['date_of_last_rech_8'].fillna('7/31/2014')
# df['date_of_last_rech_7'].fillna('6/30/2014')


In [153]:
df['days_since_last_rech'] = datetime(2014, 9, 1, 12, 0, 0) - df['date_of_last_rech_8']

In [156]:
df['days_since_last_rech'] = df['days_since_last_rech'].apply(lambda x: float(x.days))

AttributeError: 'float' object has no attribute 'days'

In [157]:
df['days_since_last_rech']

3        1.0
6        4.0
8        2.0
17       7.0
21       1.0
        ... 
99970    9.0
99980    1.0
99984    5.0
99987    2.0
99988    3.0
Name: days_since_last_rech, Length: 29350, dtype: float64