In [1]:
import numpy as np
import pandas as pd
import wrangle

In [2]:
#load the tenure data for 2-year contract holders through the project's wrangle module
df = wrangle.get_telco_tenure()

In [3]:
#peek at data
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [4]:
#gets rows and cols
df.shape

(1695, 4)

In [5]:
# Turn empty or whitespace-only strings into NaN so they are counted as missing,
# then check the null counts and the column dtypes.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = df.replace(r'^\s*$', np.nan, regex=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1685 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


In [6]:
#count the missing values in each column
df.isna().sum()

customer_id         0
monthly_charges     0
tenure              0
total_charges      10
dtype: int64

In [7]:
#total_charges has 10 nulls, let's see which rows they are

In [8]:
# boolean mask: True for every row whose total_charges is missing
null_values = df['total_charges'].isna()

In [9]:
#every customer with a missing total charge has a tenure of 0; presumably their first payment has not posted yet
df[null_values]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
234,1371-DWPAZ,56.05,0,
416,2520-SGTTA,20.0,0,
453,2775-SEFEE,61.9,0,
505,3115-CZMZD,20.25,0,
524,3213-VVOLG,25.35,0,
678,4075-WKNIU,73.35,0,
716,4367-NUYAO,25.75,0,
726,4472-LVYGI,52.55,0,
941,5709-LVOEQ,80.85,0,
1293,7644-OMVMY,19.85,0,


In [10]:
# For the scope of this project we assume these payments will post, so everyone
# in this subset is given a tenure of 1 and total_charges equal to their monthly charge.

In [11]:
# Fill missing total_charges with the customer's monthly charge.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form on df['col'] is chained assignment, which
# newer pandas warns about and which does not update df under Copy-on-Write.
df['total_charges'] = df['total_charges'].fillna(df['monthly_charges'])

In [12]:
df.isnull().sum()

customer_id        0
monthly_charges    0
tenure             0
total_charges      0
dtype: int64

In [13]:
#lets see if those nulls were filled correctly
df[null_values]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
234,1371-DWPAZ,56.05,0,56.05
416,2520-SGTTA,20.0,0,20.0
453,2775-SEFEE,61.9,0,61.9
505,3115-CZMZD,20.25,0,20.25
524,3213-VVOLG,25.35,0,25.35
678,4075-WKNIU,73.35,0,73.35
716,4367-NUYAO,25.75,0,25.75
726,4472-LVYGI,52.55,0,52.55
941,5709-LVOEQ,80.85,0,80.85
1293,7644-OMVMY,19.85,0,19.85


In [14]:
#total_charges already shows as float64 in df.info() above; the conversion below is kept as a safeguard

In [15]:
#convert the column to a numeric dtype; errors='coerce' turns any value that cannot be parsed into NaN
df['total_charges'] = pd.to_numeric(df['total_charges'],errors='coerce')

In [16]:
#lets see if it worked
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


In [17]:
# customers with a tenure of 0 are moved to a tenure of 1
zero_tenure = df['tenure'].eq(0)
df.loc[zero_tenure, 'tenure'] = 1

In [18]:
#these changes are now a part of the data frame
df[null_values]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
234,1371-DWPAZ,56.05,1,56.05
416,2520-SGTTA,20.0,1,20.0
453,2775-SEFEE,61.9,1,61.9
505,3115-CZMZD,20.25,1,20.25
524,3213-VVOLG,25.35,1,25.35
678,4075-WKNIU,73.35,1,73.35
716,4367-NUYAO,25.75,1,25.75
726,4472-LVYGI,52.55,1,52.55
941,5709-LVOEQ,80.85,1,80.85
1293,7644-OMVMY,19.85,1,19.85


In [19]:
#we will use these cleaning practices in wrangle.py; let's see it in action

In [20]:
newdf = wrangle.get_telco_tenure()

In [21]:
newdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1685 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


In [22]:
#null_values was computed from df; it is reused here on newdf, which relies on both frames sharing the same row index
newdf[null_values]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
234,1371-DWPAZ,56.05,0,
416,2520-SGTTA,20.0,0,
453,2775-SEFEE,61.9,0,
505,3115-CZMZD,20.25,0,
524,3213-VVOLG,25.35,0,
678,4075-WKNIU,73.35,0,
716,4367-NUYAO,25.75,0,
726,4472-LVYGI,52.55,0,
941,5709-LVOEQ,80.85,0,
1293,7644-OMVMY,19.85,0,


In [23]:
#after df

In [24]:
#run the cleaning function from wrangle.py on the freshly loaded frame
after_df = wrangle.clean_telco_tenure(newdf)

In [25]:
after_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


In [26]:
#the rows that were null before cleaning now show total_charges equal to monthly_charges and a tenure of 1
after_df[null_values]

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
234,1371-DWPAZ,56.05,1,56.05
416,2520-SGTTA,20.0,1,20.0
453,2775-SEFEE,61.9,1,61.9
505,3115-CZMZD,20.25,1,20.25
524,3213-VVOLG,25.35,1,25.35
678,4075-WKNIU,73.35,1,73.35
716,4367-NUYAO,25.75,1,25.75
726,4472-LVYGI,52.55,1,52.55
941,5709-LVOEQ,80.85,1,80.85
1293,7644-OMVMY,19.85,1,19.85


In [27]:
#split data

In [29]:
#split the cleaned frame into train, validate and test sets using wrangle.telco_split
train, validate, test = wrangle.telco_split(after_df)

train(949, 4),validate(407, 4),test(339, 4)
