Importing a CSV file into a pandas DataFrame

In [2]:

import pandas as pd
# Path of the CSV file to load (relative to the notebook's working directory).
file = 'customer_data_mini.csv'

# Load the CSV into a DataFrame.
# - header=0: the first line of the file holds the column names.
# - on_bad_lines='skip': malformed rows are dropped without a warning. This
#   replaces error_bad_lines=False / warn_bad_lines=False, which (together
#   with tupleize_cols) are no longer accepted by current pandas releases.
# - skip_blank_lines=True: empty lines are ignored instead of becoming NaN rows.
fileread = pd.read_csv(
    file,
    sep=',',
    header=0,
    encoding=None,
    parse_dates=True,
    on_bad_lines='skip',
    skip_blank_lines=True,
)
fileread.head()

Unnamed: 0,birth date,customer loyalty level,first name,last name,ssn,street_address,city,state,postcode,company,job,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


Use Pandas to rename column headers

In [3]:
# Inspect the current column labels of the loaded frame.
fileread.columns

Index(['birth date', 'customer loyalty level', 'first name', 'last name',
       'ssn', 'street_address', 'city', 'state', 'postcode', 'company', 'job',
       'work_phone', 'work_street_address', 'work_city', 'work_state',
       'work_postcode', 'marketing_score'],
      dtype='object')

In [4]:
# Rename the headers that contain spaces or terse names to descriptive
# snake_case labels; columns not listed here keep their current name.
fileread.rename(
    columns=dict([
        ('birth date', 'date_of_birth'),
        ('customer loyalty level', 'customer_loyalty_level'),
        ('first name', 'first_name'),
        ('last name', 'last_name'),
        ('ssn', 'social_security_number'),
        ('postcode', 'zipcode'),
        ('job', 'position'),
    ]),
    inplace=True,
)
# Show the labels after renaming.
fileread.columns

Index(['date_of_birth', 'customer_loyalty_level', 'first_name', 'last_name',
       'social_security_number', 'street_address', 'city', 'state', 'zipcode',
       'company', 'position', 'work_phone', 'work_street_address', 'work_city',
       'work_state', 'work_postcode', 'marketing_score'],
      dtype='object')

Filling in missing values in Pandas

In [5]:
# Replace all missing values in the frame with the string 'Missing'
fileread.fillna(value='Missing', inplace=True)
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,Missing
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89


In [6]:
# Swap the 'Missing' marker in marketing_score for 0.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a selected column may operate on a copy and leave the frame unchanged.
fileread['marketing_score'] = fileread['marketing_score'].replace('Missing', 0)
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


In [7]:
# Any value that is still missing anywhere in the frame becomes 0
fileread.fillna(value=0, inplace=True)
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


In [8]:
# Common practice - replace missing values with the mean of their column.
# numeric_only=True limits the mean to numeric columns; without it, recent
# pandas versions raise a TypeError because the frame also holds text columns.
fileread.fillna(fileread.mean(numeric_only=True), inplace=True)

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


In [9]:
# Replace all missing values in the marketing_score column with that column's mean.
# The filled column is assigned back to the frame: fillna(..., inplace=True)
# on a selected column may operate on a copy and leave the frame unchanged.
fileread['marketing_score'] = fileread['marketing_score'].fillna(
    fileread['marketing_score'].mean()
)

In [10]:
# Display the frame after the fill operations above.
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,o'brien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles (FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,",wisozk",992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


Removing punctuation, if any, using Pandas

In [11]:
import string   # importing the string package

# Set of every ASCII punctuation character; a set is used so the membership
# test inside remove_punctuation is a constant-time lookup.
exclude = set(string.punctuation)


def remove_punctuation(x):
    """
    Helper function to remove punctuation characters from a string
    x: any string; values that cannot be processed as text (e.g. None or a
       float) are returned unchanged
    """
    try:
        return ''.join(ch for ch in x if ch not in exclude)
    except TypeError:
        # Non-iterable or non-text cell: leave the value as it is.
        # Only TypeError is caught so unrelated errors are not hidden.
        return x

# Strip punctuation from every entry of the last_name column.
fileread['last_name'] = fileread['last_name'].apply(remove_punctuation)

fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,obrien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles FKA wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,wisozk,992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


Removing white spaces in Pandas

In [12]:
def remove_whitespace(x):
    """
    Helper function to remove any blank space from a string
    x: any string; values that cannot be processed as text (e.g. None or a
       float) are returned unchanged
    """
    try:
        # split() with no arguments breaks on any run of whitespace, so
        # re-joining the pieces drops spaces, tabs and newlines alike.
        return "".join(x.split())
    except (AttributeError, TypeError):
        # AttributeError: x has no split(); TypeError: the pieces are not str.
        # Only these are caught so unrelated errors are not hidden.
        return x

# Collapse all whitespace out of every entry of the last_name column.
fileread['last_name'] = fileread['last_name'].apply(remove_whitespace)
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,obrien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skilesFKAwonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,wisozk,992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0


Removing a string from within a string using Pandas

In [16]:
def remove_internal_abbreviations(s, thing_to_replace, replacement_string):
    """
    Helper function to replace things such as 'FKA' inside a string
    s: the string to replace a value in; values that cannot be processed as
       text (e.g. None or a float) are returned unchanged
    thing_to_replace: what you want to replace in the given string
    replacement_string: the string to use as a replacement
    """
    try:
        return s.replace(thing_to_replace, replacement_string)
    except (AttributeError, TypeError):
        # AttributeError: s has no replace(); TypeError: the arguments are not
        # accepted by it. Only these are caught so unrelated errors surface.
        return s

# Replace the "FKA" marker inside each last_name with a hyphen.
# The helper is applied to the last_name column directly (extra arguments are
# forwarded through args) instead of iterating over whole rows, since only
# this one column is read.
fileread['last_name'] = fileread['last_name'].apply(
    remove_internal_abbreviations, args=("FKA", "-")
)
fileread

Unnamed: 0,date_of_birth,customer_loyalty_level,first_name,last_name,social_security_number,street_address,city,state,zipcode,company,position,work_phone,work_street_address,work_city,work_state,work_postcode,marketing_score
0,2/15/54,not at all,cole,obrien,6439,51585 Kertzmann Common Apt. 186,Landanburgh,New Jersey,85110,"Hand, Schaden and Skiles",English as a foreign language teacher,1-162-398-4593x071,811 Oberbrunner Manors Suite 489,Port Fatefort,Hawaii,85170-9403,67.0
1,5/7/58,moderate,lise,heidenreich,689 24 9939,176 Larson Plains,Rogahnhaven,Georgia,85680,"Rempel, Rutherford and Swift",Tax adviser,1-050-479-1235x5911,960 Margene Point Apt. 319,West Shirltown,Louisiana,83296,0.0
2,19XX-10-23,moderate,zilpha,skiles-wonderman,Missing,37254 Lubowitz Radial Apt. 240,New Vernita,Florida,13914,Conroy-Nicolas,Research scientist (life sciences),1-600-545-7463x08047,2409 Michelle Neck,Jadynmouth,Alaska,67050-2117,19.0
3,01/26/0056,highly loyal,damion,wisozk,992245832,205 Hahn Stream,Port Jobeshire,Connecticut,4915,"Swaniawski, Bins and Stanton","Optician, dispensing",270.519.3236,076 Amil Wells Suite 336,Ernserhaven,Wyoming,57206-3600,89.0
