# Data Types

## Converting data types

In [3]:
import pandas as pd

# Load tips.csv into a DataFrame
tips = pd.read_csv('tips.csv')

# Cast the sex and smoker columns to the 'category' dtype
for column in ('sex', 'smoker'):
    tips[column] = tips[column].astype('category')

# info() writes its report directly and returns None, so print() adds a
# trailing "None" line after the report
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 10.3+ KB
None


## Working with numeric data

In [4]:
# Convert 'total_bill' and 'tip' to numeric dtypes; with errors='coerce'
# any value that cannot be parsed becomes NaN instead of raising
for column in ('total_bill', 'tip'):
    tips[column] = pd.to_numeric(tips[column], errors='coerce')

# Print the info of tips
print(tips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
total_bill    244 non-null float64
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null object
time          244 non-null object
size          244 non-null int64
dtypes: category(2), float64(2), int64(1), object(2)
memory usage: 10.3+ KB
None


# Using regular expressions to clean strings

## String parsing with regular expressions


In [8]:
# Import the regular expression module
import re

# Compile the pattern: prog (match phone number 3-3-4 digits).
# The pattern is a raw string so the backslashes reach the regex engine
# untouched; in a plain literal '\d' is an invalid string escape that newer
# Python versions warn about.
prog = re.compile(r'\d{3}-\d{3}-\d{4}')

# .match() anchors at the start of the string: exactly three leading digits
# followed by '-' are required, so this number matches
result = prog.match('123-456-7890')
print(bool(result))

# Four leading digits: the fourth character is not '-', so there is no match
result2 = prog.match('1123-456-7890')
print(bool(result2))

True
False


## Extracting numerical values from strings


In [10]:
# Import the regular expression module
import re

# Find the numeric values: matches.
# r'\d+' is a raw string (a plain '\d' is an invalid string escape) and
# matches each run of one or more digits; findall() returns them as strings.
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)

['10', '1']


## Pattern matching

In [17]:
# All patterns are raw strings so that regex escapes such as \d, \$, \. and
# \w are not interpreted (and warned about) as string escapes.

# First pattern: three digits, three digits, four digits, separated by '-'
pattern1 = bool(re.match(pattern=r'\d{3}-\d{3}-\d{4}', string='123-456-7890'))
print(pattern1)

# Second pattern: a literal '$', one or more digits, a literal '.', two digits
pattern2 = bool(re.match(pattern=r'\$\d+\.\d{2}', string='$123.45'))
print(pattern2)

# Third pattern: one capital letter followed by any number of word characters
pattern3 = bool(re.match(pattern=r'[A-Z]\w*', string='Australia'))
print(pattern3)

True
True
True


# Using functions to clean data

## Custom functions to clean data

Define a function named recode_gender() that has one parameter: gender.
If gender equals 'Male', return 1.
Else, if gender equals 'Female', return 0.
If gender does not equal 'Male' or 'Female', return np.nan. NumPy has been pre-imported for you.
Apply your recode_gender() function over tips.sex using the .apply() method to create a new column: 'recode'. Note that when passing in a function inside the .apply() method, you don't need to specify the parentheses after the function name.
Hit 'Submit Answer' and take note of the new 'recode' column in the tips DataFrame!

In [27]:
# Rebind tips to the contents of 'tips_nan.csv', replacing the frame built
# by the earlier cells
tips = pd.read_csv('tips_nan.csv')

# Define recode_gender()
def recode_gender(gender):
    """Recode a single gender label as a number.

    Parameters
    ----------
    gender : object
        One value taken from the gender column.

    Returns
    -------
    int or float
        0 for 'Female', 1 for 'Male', and NaN for every other value
        (including a missing value).
    """
    # Return 0 if gender is 'Female'
    if gender == 'Female':
        return 0

    # Return 1 if gender is 'Male'
    if gender == 'Male':
        return 1

    # Anything else is unknown. float('nan') is the same float value as
    # np.nan but does not require NumPy to be imported before this cell runs.
    return float('nan')

# Run recode_gender on every value of the sex column and store the results
# in a new 'recode' column
tips['recode'] = tips['sex'].apply(recode_gender)

# Show the first five rows of tips
print(tips.head(5))

   total_bill   tip     sex smoker  day    time  size  recode
0       16.99  1.01  Female     No  Sun  Dinner   2.0     0.0
1       10.34  1.66     NaN     No  Sun  Dinner   3.0     NaN
2       21.01   NaN    Male     No  Sun  Dinner   3.0     1.0
3       23.68  3.31    Male     No  Sun  Dinner   2.0     1.0
4       24.59   NaN  Female     No  Sun  Dinner   4.0     0.0


## Lambda functions

- Use the .replace() method inside a lambda function to remove the dollar sign from the 'total_dollar' column of tips.
- You need to specify two arguments to the .replace() method: The string to be replaced ('$'), and the string to replace it by ('').
- Apply the lambda function over the 'total_dollar' column of tips.
- Use a regular expression to remove the dollar sign from the 'total_dollar' column of tips.
- The pattern has been provided for you: It is the first argument of the re.findall() function.
- Complete the rest of the lambda function and apply it over the 'total_dollar' column of tips. Notice that because re.findall() returns a list, you have to index it (for example with [0]) in order to access the actual value.
- Hit 'Submit Answer' to verify that you have removed the dollar sign from the column.

In [38]:
# Start again from the original tips file
tips = pd.read_csv('tips.csv')

# Build a text column holding each bill amount prefixed with a dollar sign
tips['total_dollar'] = tips['total_bill'].apply(lambda bill: '${}'.format(bill))

# Display the first rows as the cell's output
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_dollar
0,16.99,1.01,Female,No,Sun,Dinner,2,$16.99
1,10.34,1.66,Male,No,Sun,Dinner,3,$10.34
2,21.01,3.5,Male,No,Sun,Dinner,3,$21.01
3,23.68,3.31,Male,No,Sun,Dinner,2,$23.68
4,24.59,3.61,Female,No,Sun,Dinner,4,$24.59


In [40]:
# Remove the dollar sign with plain string replacement. The result is still
# a string, not a number.
tips['total_dollar_replace'] = tips.total_dollar.apply(lambda x: x.replace('$', ''))

# Extract the amount with a regular expression instead. The pattern is a raw
# string so that \d and \. are regex escapes rather than (invalid) string
# escapes. findall() returns a list, so [0] takes the first match; this
# assumes every value contains digits, a '.', and more digits -- a value
# without such a match would raise IndexError.
tips['total_dollar_re'] = tips.total_dollar.apply(lambda x: re.findall(r'\d+\.\d+', x)[0])

# Print the head of tips
print(tips.head())

tips.info()

   total_bill   tip     sex smoker  day    time  size total_dollar  \
0       16.99  1.01  Female     No  Sun  Dinner     2       $16.99   
1       10.34  1.66    Male     No  Sun  Dinner     3       $10.34   
2       21.01  3.50    Male     No  Sun  Dinner     3       $21.01   
3       23.68  3.31    Male     No  Sun  Dinner     2       $23.68   
4       24.59  3.61  Female     No  Sun  Dinner     4       $24.59   

  total_dollar_replace total_dollar_re  
0                16.99           16.99  
1                10.34           10.34  
2                21.01           21.01  
3                23.68           23.68  
4                24.59           24.59  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 10 columns):
total_bill              244 non-null float64
tip                     244 non-null float64
sex                     244 non-null object
smoker                  244 non-null object
day                     244 non-null object
time  

# Duplicate and missing data

## Dropping duplicate data

- Create a new DataFrame called tracks that contains the following columns from billboard: 'year', 'artist', 'track', and 'time'.
- Print the info of tracks. This has been done for you.
- Drop duplicate rows from tracks using the .drop_duplicates() method. Save the result to tracks_no_duplicates.
- Print the info of tracks_no_duplicates. This has been done for you, so hit 'Submit Answer' to see the results!

In [46]:
# Load billboard.csv into a DataFrame
billboard = pd.read_csv('billboard.csv')

# Keep only the year, artist, track and time columns: tracks
tracks = billboard.loc[:, ['year', 'artist', 'track', 'time']]

# Report the frame before de-duplication
print(tracks.info())

# Remove rows that exactly repeat an earlier row: tracks_no_duplicates
tracks_no_duplicates = tracks.drop_duplicates(keep='first')

# Report the frame after de-duplication
print(tracks_no_duplicates.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 317 entries, 0 to 316
Data columns (total 4 columns):
year      317 non-null int64
artist    317 non-null object
track     317 non-null object
time      317 non-null object
dtypes: int64(1), object(3)
memory usage: 10.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 317 entries, 0 to 316
Data columns (total 4 columns):
year      317 non-null int64
artist    317 non-null object
track     317 non-null object
time      317 non-null object
dtypes: int64(1), object(3)
memory usage: 12.4+ KB


## Filling missing data

- Calculate the mean of the Ozone column of airquality using the .mean() method on airquality.Ozone.
- Use the .fillna() method to replace all the missing values in the Ozone column of airquality with the mean, oz_mean.
- Hit 'Submit Answer' to see the result of filling in the missing values!

In [52]:
import numpy as np

# Load airquality.csv into a DataFrame
airquality = pd.read_csv('airquality.csv')

# Display the first rows as the cell's output
airquality.head()

Unnamed: 0,Ozone,Solar.R,Wind,Temp,Month,Day
0,41.0,190.0,7.4,67,5,1
1,36.0,118.0,8.0,72,5,2
2,12.0,149.0,12.6,74,5,3
3,18.0,313.0,11.5,62,5,4
4,,,14.3,56,5,5


In [59]:
# Mean of the Ozone column: oz_mean
oz_mean = airquality.Ozone.mean()

# Overwrite Ozone with a copy in which every missing value is replaced by
# that mean
airquality['Ozone'] = airquality['Ozone'].fillna(value=oz_mean)

# Print the info of airquality
print(airquality.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 153 entries, 0 to 152
Data columns (total 6 columns):
Ozone      153 non-null float64
Solar.R    146 non-null float64
Wind       153 non-null float64
Temp       153 non-null int64
Month      153 non-null int64
Day        153 non-null int64
dtypes: float64(3), int64(3)
memory usage: 7.2 KB
None


# Testing with asserts

## Testing your data with asserts

- Write an assert statement to confirm that there are no missing values in ebola.
    - Use the pd.notnull() function on ebola (or the .notnull() method of ebola) and chain two .all() methods (that is, .all().all()). The first .all() method will return a True or False for each column, while the second .all() method will return a single True or False.
- Write an assert statement to confirm that all values in ebola are greater than or equal to 0.
    - Chain two all() methods to the Boolean condition (ebola >= 0).

In [66]:
ebola = pd.read_csv('ebola.csv')

ebola.info()

# Assert that there are no missing values. Counting the missing entries per
# column first lets a failure name the offending columns instead of raising
# a bare "AssertionError:" with no explanation.
missing_counts = ebola.isnull().sum()
assert not missing_counts.any(), (
    'ebola has missing values in these columns:\n{}'.format(
        missing_counts[missing_counts > 0]
    )
)

# Assert that all values are >= 0. Only numeric columns are checked:
# comparing a text column such as 'Date' with an int raises TypeError in
# current pandas rather than producing a True/False answer.
numeric_columns = ebola.select_dtypes(include='number')
assert (numeric_columns >= 0).all().all(), (
    'ebola has negative (or missing) values in its numeric columns'
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            12 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.2+ KB


AssertionError: 