In [114]:
import pandas as pd

## Extractions

In [115]:
travels_data = pd.read_csv('https://drive.google.com/uc?id=1muwnik-uFGTKBdHmcQN5z68rD7qmdG-b')

In [116]:
travels_data.shape #.shape returns the dimensionality of the data i.e 418 rows/entries and 6 columns 

(418, 6)

In [117]:
travels_data.info() #.info() returns information of the datatypes and structure of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Employee               418 non-null    object 
 1   Destination            418 non-null    object 
 2   Travel Start Date      418 non-null    object 
 3   Travel End Date        418 non-null    object 
 4   Actual Total Expenses  418 non-null    float64
 5   Purpose Of Travel      415 non-null    object 
dtypes: float64(1), object(5)
memory usage: 19.7+ KB


In [118]:
# Things Identified in the dataset:

# Date is an object
# Replace null values in purpose of travel column
# Split States and City
# Seperate name into first and last name
# Seperate date values into day, month and year


In [119]:
travels_data

Unnamed: 0,Employee,Destination,Travel Start Date,Travel End Date,Actual Total Expenses,Purpose Of Travel
0,Aaron Salter,"OCEAN CITY, MD",2017-05-31,2017-06-02,644.10,Conference
1,Abigail Ratnofsky,"SAN DIEGO, CA",2016-01-24,2016-01-29,2962.70,Conference
2,Adam Kisthardt,"ORLANDO, FL",2018-10-04,2018-10-09,1891.19,Conference
3,Adam P. Jones,"OCEAN CITY, MD",2016-06-19,2016-06-21,324.36,Conference
4,Alan Butsch,"SAN DIEGO, CA",2019-11-19,2019-11-22,2141.51,Conference
...,...,...,...,...,...,...
413,Warren Jensen!,"OCEAN CITY, MD",2017-11-06,2017-11-09,502.45,Conference
414,Warp Jensen,"OCEAN CITY, MD",2019-03-25,2019-03-26,78.00,Conference
415,Whitney Kujawa,"OCEAN CITY, MD",2017-11-06,2017-11-09,567.00,Conference
416,William Kinna,"CHARLOTTE, NC",2017-05-21,2017-05-26,2163.67,Conference


In [120]:
travels_dataset = travels_data.copy() # This is to create backup copy of the dataset before performing transformations

In [121]:
travels_data.drop_duplicates()

Unnamed: 0,Employee,Destination,Travel Start Date,Travel End Date,Actual Total Expenses,Purpose Of Travel
0,Aaron Salter,"OCEAN CITY, MD",2017-05-31,2017-06-02,644.10,Conference
1,Abigail Ratnofsky,"SAN DIEGO, CA",2016-01-24,2016-01-29,2962.70,Conference
2,Adam Kisthardt,"ORLANDO, FL",2018-10-04,2018-10-09,1891.19,Conference
3,Adam P. Jones,"OCEAN CITY, MD",2016-06-19,2016-06-21,324.36,Conference
4,Alan Butsch,"SAN DIEGO, CA",2019-11-19,2019-11-22,2141.51,Conference
...,...,...,...,...,...,...
413,Warren Jensen!,"OCEAN CITY, MD",2017-11-06,2017-11-09,502.45,Conference
414,Warp Jensen,"OCEAN CITY, MD",2019-03-25,2019-03-26,78.00,Conference
415,Whitney Kujawa,"OCEAN CITY, MD",2017-11-06,2017-11-09,567.00,Conference
416,William Kinna,"CHARLOTTE, NC",2017-05-21,2017-05-26,2163.67,Conference


In [122]:
travels_data.shape

(418, 6)

## Cleaning

In [123]:
travels_data['Travel Start Date'] = pd.to_datetime(travels_data['Travel Start Date'])

In [124]:
travels_data['Travel Start Date'].info() #This converts the datatype to date

<class 'pandas.core.series.Series'>
RangeIndex: 418 entries, 0 to 417
Series name: Travel Start Date
Non-Null Count  Dtype         
--------------  -----         
418 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 3.4 KB


In [125]:
def remove_white_space(arg): #This func is to remove white space from the string datatype 'Travel Start Date' before conversion if needed
    if type(arg) == str:
        return arg.strip()
    return arg


In [126]:
# remove_white_space(travels_data['Travel Start Date']) #A way to apply the func but below is a better way in pandas

In [127]:
travels_data['Travel Start Date'] = travels_data['Travel Start Date'].apply(remove_white_space)

In [128]:
travels_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Employee               418 non-null    object        
 1   Destination            418 non-null    object        
 2   Travel Start Date      418 non-null    datetime64[ns]
 3   Travel End Date        418 non-null    object        
 4   Actual Total Expenses  418 non-null    float64       
 5   Purpose Of Travel      415 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 19.7+ KB


In [129]:
#Repeating the process for Travel End Date
#First, apply the remove_white_space func should it exist in the data before converting to avoid error
travels_data['Travel End Date'] = travels_data['Travel End Date'].apply(remove_white_space)
travels_data['Travel End Date'] = pd.to_datetime(travels_data['Travel End Date'])

In [130]:
travels_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Employee               418 non-null    object        
 1   Destination            418 non-null    object        
 2   Travel Start Date      418 non-null    datetime64[ns]
 3   Travel End Date        418 non-null    datetime64[ns]
 4   Actual Total Expenses  418 non-null    float64       
 5   Purpose Of Travel      415 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 19.7+ KB


In [131]:
travels_data[travels_data['Purpose Of Travel'].isna()] #This filter out enteries with Na values

Unnamed: 0,Employee,Destination,Travel Start Date,Travel End Date,Actual Total Expenses,Purpose Of Travel
126,Gaila Compton,"OCEAN CITY, MD",2019-09-25,2019-09-27,690.0,
274,Mark Sheelor,"LAS VEGAS, NV",2017-03-06,2017-03-08,1411.34,
385,Tamara Maldonado,"OCEAN CITY, MD",2017-10-19,2017-10-20,73.5,


In [132]:
travels_data['Purpose Of Travel'] = travels_data['Purpose Of Travel'].fillna('Not Provided') #Replacing Na with Not Provided

In [133]:
travels_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Employee               418 non-null    object        
 1   Destination            418 non-null    object        
 2   Travel Start Date      418 non-null    datetime64[ns]
 3   Travel End Date        418 non-null    datetime64[ns]
 4   Actual Total Expenses  418 non-null    float64       
 5   Purpose Of Travel      418 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(3)
memory usage: 19.7+ KB


In [134]:
# Creating funcs to split the destination column values into City and State.
# Added a func to capitalize the first letter of the City value.
def extract_state(arg):
    result = arg.split(',')[1]
    return result

def extract_city(arg):
    result = arg.split(',')[0]
    return result

def make_lower(arg):
    result = arg.title()
    return result

In [135]:
travels_data['State'] = travels_data['Destination'].apply(extract_state)

travels_data['City'] = travels_data['Destination'].apply(extract_city).apply(make_lower)

In [136]:
travels_data

Unnamed: 0,Employee,Destination,Travel Start Date,Travel End Date,Actual Total Expenses,Purpose Of Travel,State,City
0,Aaron Salter,"OCEAN CITY, MD",2017-05-31,2017-06-02,644.10,Conference,MD,Ocean City
1,Abigail Ratnofsky,"SAN DIEGO, CA",2016-01-24,2016-01-29,2962.70,Conference,CA,San Diego
2,Adam Kisthardt,"ORLANDO, FL",2018-10-04,2018-10-09,1891.19,Conference,FL,Orlando
3,Adam P. Jones,"OCEAN CITY, MD",2016-06-19,2016-06-21,324.36,Conference,MD,Ocean City
4,Alan Butsch,"SAN DIEGO, CA",2019-11-19,2019-11-22,2141.51,Conference,CA,San Diego
...,...,...,...,...,...,...,...,...
413,Warren Jensen!,"OCEAN CITY, MD",2017-11-06,2017-11-09,502.45,Conference,MD,Ocean City
414,Warp Jensen,"OCEAN CITY, MD",2019-03-25,2019-03-26,78.00,Conference,MD,Ocean City
415,Whitney Kujawa,"OCEAN CITY, MD",2017-11-06,2017-11-09,567.00,Conference,MD,Ocean City
416,William Kinna,"CHARLOTTE, NC",2017-05-21,2017-05-26,2163.67,Conference,NC,Charlotte


In [137]:
# Creating funcs to split the Employee column values into first name and last name.

def extract_firstname(arg):
    result = arg.split(' ')[0]
    return result

def extract_lastname(arg):
    result = arg.split(' ')[-1]
    return result

In [138]:

travels_data['firstname'] = travels_data['Employee'].apply(extract_firstname)

travels_data['lastname'] = travels_data['Employee'].apply(extract_lastname)

In [139]:
travels_data

Unnamed: 0,Employee,Destination,Travel Start Date,Travel End Date,Actual Total Expenses,Purpose Of Travel,State,City,firstname,lastname
0,Aaron Salter,"OCEAN CITY, MD",2017-05-31,2017-06-02,644.10,Conference,MD,Ocean City,Aaron,Salter
1,Abigail Ratnofsky,"SAN DIEGO, CA",2016-01-24,2016-01-29,2962.70,Conference,CA,San Diego,Abigail,Ratnofsky
2,Adam Kisthardt,"ORLANDO, FL",2018-10-04,2018-10-09,1891.19,Conference,FL,Orlando,Adam,Kisthardt
3,Adam P. Jones,"OCEAN CITY, MD",2016-06-19,2016-06-21,324.36,Conference,MD,Ocean City,Adam,Jones
4,Alan Butsch,"SAN DIEGO, CA",2019-11-19,2019-11-22,2141.51,Conference,CA,San Diego,Alan,Butsch
...,...,...,...,...,...,...,...,...,...,...
413,Warren Jensen!,"OCEAN CITY, MD",2017-11-06,2017-11-09,502.45,Conference,MD,Ocean City,Warren,Jensen!
414,Warp Jensen,"OCEAN CITY, MD",2019-03-25,2019-03-26,78.00,Conference,MD,Ocean City,Warp,Jensen
415,Whitney Kujawa,"OCEAN CITY, MD",2017-11-06,2017-11-09,567.00,Conference,MD,Ocean City,Whitney,Kujawa
416,William Kinna,"CHARLOTTE, NC",2017-05-21,2017-05-26,2163.67,Conference,NC,Charlotte,William,Kinna


In [140]:
travels_data = travels_data.drop(['Destination'])


KeyError: "['Destination'] not found in axis"

In [86]:
travels_data

<bound method Series.drop of 0      OCEAN CITY, MD
1       SAN DIEGO, CA
2         ORLANDO, FL
3      OCEAN CITY, MD
4       SAN DIEGO, CA
            ...      
413    OCEAN CITY, MD
414    OCEAN CITY, MD
415    OCEAN CITY, MD
416     CHARLOTTE, NC
417    OCEAN CITY, MD
Name: Destination, Length: 418, dtype: object>