## Requirement: Provide clean data for only for customers who can be contacted.

In [77]:
import pandas as pd

### importing and reading dataset

In [78]:
df = pd.read_excel(r'Customer Call List.xlsx')
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


### Data Cleaning:

#### finding duplicates and dropping duplicates

In [79]:
df.duplicated(keep='first').sum()

1

In [80]:
df[df.duplicated(keep=False)]

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
19,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True
20,1020,Anakin,Skywalker,876|678|3469,"910 Tatooine Road, Tatooine",Yes,N,True


In [81]:
df.drop_duplicates(keep='first', inplace=True)

In [82]:
df.duplicated().sum()

0

#### dropping redundant columns

In [83]:
df.drop(columns = 'Not_Useful_Column', inplace = True)

In [84]:
df.columns

Index(['CustomerID', 'First_Name', 'Last_Name', 'Phone_Number', 'Address',
       'Paying Customer', 'Do_Not_Contact'],
      dtype='object')

#### cleaning Last_Name column

In [85]:
df['Last_Name'] = df['Last_Name'].str.strip('./_')

In [86]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes
2,1003,Walter,White,7066950392,298 Drugs Driveway,N,
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes
6,1007,Jeff,Winger,,1209 South Street,No,No
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No


#### cleaning and standardizing Phone_Number column

In [87]:
df['Phone_Number'] = df['Phone_Number'].str.replace(r'\W', '', regex=True)

In [88]:
df['Phone_Number']

0     1235455421
1     1236439775
2            NaN
3     1235432345
4     8766783469
5     3047622467
6            NaN
7     8766783469
8             Na
9     1235455421
10           NaN
11           NaN
12    1235432345
13    8766783469
14    3047622467
15    1235455421
16    1236439775
17           NaN
18            Na
19    8766783469
Name: Phone_Number, dtype: object

In [89]:
df['Phone_Number'] = df['Phone_Number'].astype(str)

In [90]:
df['Phone_Number'] = df['Phone_Number'].apply(lambda x: x[0:3] + '-' + x[3:6] + '-' + x[6:10])

In [91]:
df['Phone_Number']

0     123-545-5421
1     123-643-9775
2            nan--
3     123-543-2345
4     876-678-3469
5     304-762-2467
6            nan--
7     876-678-3469
8             Na--
9     123-545-5421
10           nan--
11           nan--
12    123-543-2345
13    876-678-3469
14    304-762-2467
15    123-545-5421
16    123-643-9775
17           nan--
18            Na--
19    876-678-3469
Name: Phone_Number, dtype: object

In [92]:
df['Phone_Number'] = df['Phone_Number'].str.replace('nan--', '')
df['Phone_Number'] = df['Phone_Number'].str.replace('Na--', '')

In [93]:
df['Phone_Number']

0     123-545-5421
1     123-643-9775
2                 
3     123-543-2345
4     876-678-3469
5     304-762-2467
6                 
7     876-678-3469
8                 
9     123-545-5421
10                
11                
12    123-543-2345
13    876-678-3469
14    304-762-2467
15    123-545-5421
16    123-643-9775
17                
18                
19    876-678-3469
Name: Phone_Number, dtype: object

#### splitting Address column

In [94]:
df[ ['Street_Address', 'State', 'Zipcode'] ] = df['Address'].str.split(',', expand=True)

In [95]:
df.drop(columns = 'Address', inplace=True)

In [96]:
df.columns

Index(['CustomerID', 'First_Name', 'Last_Name', 'Phone_Number',
       'Paying Customer', 'Do_Not_Contact', 'Street_Address', 'State',
       'Zipcode'],
      dtype='object')

In [97]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying Customer,Do_Not_Contact,Street_Address,State,Zipcode
0,1001,Frodo,Baggins,123-545-5421,Yes,No,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,No,Yes,93 West Main Street,,
2,1003,Walter,White,,N,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Yes,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Y,No,123 Dragons Road,,
5,1006,Ron,Swanson,304-762-2467,Yes,Yes,768 City Parkway,,
6,1007,Jeff,Winger,,No,No,1209 South Street,,
7,1008,Sherlock,Holmes,876-678-3469,N,No,98 Clue Drive,,
8,1009,Gandalf,,,Yes,,123 Middle Earth,,
9,1010,Peter,Parker,123-545-5421,Yes,No,25th Main Street,New York,


#### standardizing Paying Customer and Do_Not_Contact column

In [98]:
df['Paying Customer'] = df['Paying Customer'].replace(to_replace = 'Yes', value = 'Y')
df['Paying Customer'] = df['Paying Customer'].replace(to_replace = 'No', value = 'N')

In [99]:
df['Paying Customer']

0       Y
1       N
2       N
3       Y
4       Y
5       Y
6       N
7       N
8       Y
9       Y
10      Y
11      Y
12      Y
13      Y
14      N
15      N
16      Y
17      Y
18    N/a
19      Y
Name: Paying Customer, dtype: object

In [100]:
df['Do_Not_Contact'] = df['Do_Not_Contact'].replace(to_replace = 'Yes', value = 'Y')
df['Do_Not_Contact'] = df['Do_Not_Contact'].replace(to_replace = 'No', value = 'N')

In [101]:
df['Do_Not_Contact']

0       N
1       Y
2     NaN
3       Y
4       N
5       Y
6       N
7       N
8     NaN
9       N
10      N
11    NaN
12      N
13      N
14      N
15      N
16      N
17    NaN
18      Y
19      N
Name: Do_Not_Contact, dtype: object

In [102]:
df.rename(columns = {'Paying Customer' : 'Paying_Customer'}, inplace=True)

In [103]:
df.columns

Index(['CustomerID', 'First_Name', 'Last_Name', 'Phone_Number',
       'Paying_Customer', 'Do_Not_Contact', 'Street_Address', 'State',
       'Zipcode'],
      dtype='object')

#### finding and handling null values

In [104]:
df.isnull().sum()

CustomerID          0
First_Name          0
Last_Name           1
Phone_Number        0
Paying_Customer     0
Do_Not_Contact      4
Street_Address      0
State              14
Zipcode            19
dtype: int64

In [105]:
df = df.fillna('')

In [106]:
df.isnull().sum()

CustomerID         0
First_Name         0
Last_Name          0
Phone_Number       0
Paying_Customer    0
Do_Not_Contact     0
Street_Address     0
State              0
Zipcode            0
dtype: int64

In [107]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying_Customer,Do_Not_Contact,Street_Address,State,Zipcode
0,1001,Frodo,Baggins,123-545-5421,Y,N,123 Shire Lane,Shire,
1,1002,Abed,Nadir,123-643-9775,N,Y,93 West Main Street,,
2,1003,Walter,White,,N,,298 Drugs Driveway,,
3,1004,Dwight,Schrute,123-543-2345,Y,Y,980 Paper Avenue,Pennsylvania,18503.0
4,1005,Jon,Snow,876-678-3469,Y,N,123 Dragons Road,,
5,1006,Ron,Swanson,304-762-2467,Y,Y,768 City Parkway,,
6,1007,Jeff,Winger,,N,N,1209 South Street,,
7,1008,Sherlock,Holmes,876-678-3469,N,N,98 Clue Drive,,
8,1009,Gandalf,,,Y,,123 Middle Earth,,
9,1010,Peter,Parker,123-545-5421,Y,N,25th Main Street,New York,


#### filtering data as per requirement

In [111]:
#missing values in Do_Not_Contact column are going to be assumed and filled with a 'N'
df['Do_Not_Contact'] = df['Do_Not_Contact'].replace(to_replace = '', value = 'N')

In [112]:
df['Do_Not_Contact']

0     N
1     Y
2     N
3     Y
4     N
5     Y
6     N
7     N
8     N
9     N
10    N
11    N
12    N
13    N
14    N
15    N
16    N
17    N
18    Y
19    N
Name: Do_Not_Contact, dtype: object

In [113]:
for x in df.index:
    if df.loc[x, 'Do_Not_Contact'] == 'Y':
        df.drop(x, inplace=True)

In [114]:
df['Do_Not_Contact']

0     N
2     N
4     N
6     N
7     N
8     N
9     N
10    N
11    N
12    N
13    N
14    N
15    N
16    N
17    N
19    N
Name: Do_Not_Contact, dtype: object

In [122]:
#dropping customers with no Phone_Number as they cannot be contacted
import numpy as np
df['Phone_Number'] = df['Phone_Number'].replace('', np.nan)
df.dropna(subset = ['Phone_Number'], inplace=True)

In [123]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying_Customer,Do_Not_Contact,Street_Address,State,Zipcode
0,1001,Frodo,Baggins,123-545-5421,Y,N,123 Shire Lane,Shire,
4,1005,Jon,Snow,876-678-3469,Y,N,123 Dragons Road,,
7,1008,Sherlock,Holmes,876-678-3469,N,N,98 Clue Drive,,
9,1010,Peter,Parker,123-545-5421,Y,N,25th Main Street,New York,
12,1013,Don,Draper,123-543-2345,Y,N,2039 Main Street,,
13,1014,Leslie,Knope,876-678-3469,Y,N,343 City Parkway,,
14,1015,Toby,Flenderson,304-762-2467,N,N,214 HR Avenue,,
15,1016,Ron,Weasley,123-545-5421,N,N,2395 Hogwarts Avenue,,
16,1017,Michael,Scott,123-643-9775,Y,N,121 Paper Avenue,Pennsylvania,
19,1020,Anakin,Skywalker,876-678-3469,Y,N,910 Tatooine Road,Tatooine,


#### resetting index

In [124]:
df = df.reset_index(drop = True)

In [125]:
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Paying_Customer,Do_Not_Contact,Street_Address,State,Zipcode
0,1001,Frodo,Baggins,123-545-5421,Y,N,123 Shire Lane,Shire,
1,1005,Jon,Snow,876-678-3469,Y,N,123 Dragons Road,,
2,1008,Sherlock,Holmes,876-678-3469,N,N,98 Clue Drive,,
3,1010,Peter,Parker,123-545-5421,Y,N,25th Main Street,New York,
4,1013,Don,Draper,123-543-2345,Y,N,2039 Main Street,,
5,1014,Leslie,Knope,876-678-3469,Y,N,343 City Parkway,,
6,1015,Toby,Flenderson,304-762-2467,N,N,214 HR Avenue,,
7,1016,Ron,Weasley,123-545-5421,N,N,2395 Hogwarts Avenue,,
8,1017,Michael,Scott,123-643-9775,Y,N,121 Paper Avenue,Pennsylvania,
9,1020,Anakin,Skywalker,876-678-3469,Y,N,910 Tatooine Road,Tatooine,


#### exporting the cleaned dataframe as excel worksheet

In [126]:
df.to_excel('Cleaned Customer Call List.xlsx', index = True)