In [1]:
# importing libraries
import numpy as np
import pandas as pd

In [2]:
# Load the raw customer-care records from 'Customer Care.csv' into a DataFrame.
# The path is relative, so the file is resolved against the kernel's working directory.
df = pd.read_csv('Customer Care.csv')

In [3]:
# Preview the first rows (head() defaults to five) to eyeball the column names
# and the raw, uncleaned formatting of each field.
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column,Unnamed: 8,Unnamed: 9
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True,,
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False,,
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True,,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True,,
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True,,


In [4]:
# Print a structural summary of the frame: number of rows, per-column non-null
# counts, dtypes and memory usage. Handy for spotting columns that are empty
# or that contain missing values before any cleaning is done.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CustomerID         21 non-null     int64  
 1   First_Name         21 non-null     object 
 2   Last_Name          20 non-null     object 
 3   Phone_Number       19 non-null     object 
 4   Address            21 non-null     object 
 5   Paying Customer    21 non-null     object 
 6   Do_Not_Contact     17 non-null     object 
 7   Not_Useful_Column  21 non-null     bool   
 8   Unnamed: 8         0 non-null      float64
 9   Unnamed: 9         0 non-null      float64
dtypes: bool(1), float64(2), int64(1), object(6)
memory usage: 1.6+ KB


In [5]:
# Discard the columns that carry no usable information for the call list.
# `columns=` names the axis explicitly rather than relying on axis=1.
df = df.drop(columns=['Not_Useful_Column', 'Unnamed: 8', 'Unnamed: 9'])
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No


In [6]:
# Report how many rows are exact repeats of an earlier row (all columns equal).
print(f'Dataframe contains {df.duplicated().sum()} duplicate records.')

Dataframe contains 1 duplicate records.


In [7]:
# Keep only the first occurrence of each fully duplicated row
# (all columns are compared; the surviving rows keep their original index labels).
df = df.drop_duplicates(subset=None, keep='first')

In [8]:
# Trim stray '/', '.' and '_' characters from both ends of every last name.
# Characters in the middle of a name are left untouched.
df['Last_Name'] = df['Last_Name'].str.strip(to_strip='/._')
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No


In [9]:
# Step 1: remove every character that is not a letter or a digit from Phone_Number.
# Step 2: turn the leftover 'Na' marker into a real missing value (a raw 'N/a',
#         for example, would have lost its slash in step 1 and become 'Na').
df['Phone_Number'] = (
    df['Phone_Number']
    .replace(to_replace='[^a-zA-Z0-9]', value='', regex=True)
    .replace(to_replace='Na', value=np.nan)
)
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,1235455421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,1236439775,93 West Main Street,No,Yes
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,1235432345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,8766783469,123 Dragons Road,Y,No


In [10]:
# Reformat phone numbers to the standard XXX-XXX-XXXX layout.
#
# Only values made of exactly ten digits are reformatted; everything else
# (missing values, too-short/too-long numbers, entries containing letters)
# becomes NaN instead of being sliced into a malformed string such as '123-45-'.
# Existing dashes are removed first so that re-running this cell on already
# formatted numbers leaves them unchanged.
df['Phone_Number'] = (
    df['Phone_Number']
    .astype(str)                                  # NaN becomes 'nan', rejected below
    .str.replace('-', '', regex=False)
    .str.replace(r'^([0-9]{3})([0-9]{3})([0-9]{4})$', r'\1-\2-\3', regex=True)
    # keep only well-formed results; every other value is replaced by NaN
    .where(lambda s: s.str.fullmatch(r'[0-9]{3}-[0-9]{3}-[0-9]{4}'))
)
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Y,No


In [11]:
# Split 'Address' on ', ' into 'Street_Address', 'State' and 'Zip_Code'.
#
# * n=2 caps the split at three pieces, so an address containing extra commas
#   cannot produce a fourth column (anything after the second separator stays
#   in the last piece).
# * reindex(columns=range(3)) pads with empty (NaN) columns when no address in
#   the data has all three parts.
# Together these guarantee exactly three columns, so the assignment below
# cannot fail with a column-count mismatch.
df[['Street_Address', 'State', 'Zip_Code']] = (
    df['Address']
    .str.split(', ', n=2, expand=True)  # expand=True returns the pieces as separate columns
    .reindex(columns=range(3))
)
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,No,Yes,93 West Main Street,,
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,N,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,Y,No,123 Dragons Road,,


In [12]:
# 'Address' has been split into separate columns, so the combined column is removed.
df = df.drop(columns='Address')
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
2,1003,Walter,White,706-695-0392,N,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Y,No,123 Dragons Road,,


In [13]:
# Normalise the two flag columns:
#   * 'N/a' becomes a real missing value (NaN)
#   * the abbreviations 'N' / 'Y' become 'No' / 'Yes'
# The abbreviation mapping is an explicit dict. Passing two sets (as before)
# pairs the values by set iteration order, which is not guaranteed, so 'N'
# could end up mapped to 'Yes'.
df[['Paying Customer', 'Do_Not_Contact']] = df[['Paying Customer', 'Do_Not_Contact']].replace('N/a', np.nan)
df[['Paying Customer', 'Do_Not_Contact']] = df[['Paying Customer', 'Do_Not_Contact']].replace({'N': 'No', 'Y': 'Yes'})
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
2,1003,Walter,White,706-695-0392,No,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Yes,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,


In [14]:
# Remove every customer whose 'Do_Not_Contact' flag is 'Yes'.
# Rows where the flag is 'No' or missing are kept.
df = df.drop(index=df.index[df['Do_Not_Contact'].eq('Yes')])
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
2,1003,Walter,White,706-695-0392,No,,298 Drugs Driveway,,
4,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,
6,1007,Jeff,Winger,,No,No,1209 South Street,,
7,1008,Sherlock,Holmes,876-678-3469,No,No,98 Clue Drive,,


In [15]:
# Discard rows that have no phone number.
df = df.dropna(axis='index', subset=['Phone_Number'])
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
2,1003,Walter,White,706-695-0392,No,,298 Drugs Driveway,,
4,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,
7,1008,Sherlock,Holmes,876-678-3469,No,No,98 Clue Drive,,
9,1010,Peter,Parker,123-545-5421,Yes,No,25th Main Street,New York,


In [16]:
# The row drops above leave gaps in the index labels; renumber the rows from 0.
# drop = True discards the old index instead of keeping it as a new column.
df = df.reset_index(drop = True)
df.head()

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zip_Code
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1003,Walter,White,706-695-0392,No,,298 Drugs Driveway,,
2,1005,Jon,Snow,876-678-3469,Yes,No,123 Dragons Road,,
3,1008,Sherlock,Holmes,876-678-3469,No,No,98 Clue Drive,,
4,1010,Peter,Parker,123-545-5421,Yes,No,25th Main Street,New York,
