# Data Cleaning & Extraction of Recalls from 2011-10-21 to 2022-08-24

In [1]:
import pandas as pd
import re

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

recalls_p = pd.read_csv('recalls-p-2022-2021.csv')
recalls_p.drop(['Unnamed: 0', 'link'], axis=1, inplace=True)
recalls_p.reset_index(inplace=True)
recalls_p['date'] = pd.to_datetime(recalls_p.date, format='%Y-%m-%d')

# recalls_p.head()

In [2]:
# recalls_p.date.dt.year.value_counts()

### Replacing Values by Slice

In [3]:
# Passing correct audiences from category and category from audience columns by index ref
slice_audience = recalls_p['category'][~recalls_p['category'].str.contains(r'Food')]
slice_category = recalls_p['audiences'][recalls_p['audiences'].str.contains(r'Food')]

recalls_p['audiences'].loc[recalls_p['audiences'].str.contains(r'Food', regex=True)] = slice_audience
recalls_p['category'].loc[~recalls_p['category'].str.contains(r'Food', regex=True)] = slice_category

# Passing correct recall class from audience_2 column to recall_class column by index and category from audience scolumns by index ref
slice_class = recalls_p['audience_2'][recalls_p['audience_2'].str.contains(r'[Cc]lass', na=False)]
recalls_p['recall_class'].loc[~recalls_p['recall_class'].str.contains(r'[Cc]lass', na=False)] = slice_class

# Passing correct issue from company column to issue column by index
slice_issue = recalls_p['company'][recalls_p['company'].str.contains(r'^Food', na=False)]
recalls_p['issue'].loc[recalls_p['company'].str.contains(r'^Food', na=False)] = slice_issue

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
recalls_p.drop(['audience_2'], axis=1, inplace=True)

### Filling null replacing values

In [5]:
recalls_p['audiences'] = recalls_p.audiences.fillna('General public')

recalls_p['issue'] = recalls_p['issue'].str.replace('Food - ', '')
recalls_p['category'] = recalls_p['category'].str.replace('Food - ', '')

### Regex Function to Extract Hazard, Product and Hazard Type

In [6]:
def extract_hazard(s: str):
    regex_1 = re.search(r'- (.*)$', s)
    regex_2 = re.search(r', (.*)$,', s)
    if regex_1:
        return regex_1.group(1).capitalize()
    elif regex_2:
        return regex_2.group(1).capitalize()
    else:
        return s

In [7]:
def extract_product(s: str):
    regex_1 = re.search(r'brands? (.*) recalled', s)
    regex_2 = re.search(r'certain (.*) recalled', s)
    regex_3 = re.search(r'brands? (.*) may be', s)
    regex_11 = re.search(r'(.*) recalled due', s)
    regex_4 = re.search(r'due (.*)$', s)
    regex_5 = re.search(r'(.*?) brand', s)
    regex_6 = re.search(r'contains? (.*)$', s)
    regex_7 = re.search(r'certain (.*)$', s)
    regex_8 = re.search(r'in (.*)$', s)
    regex_9 = re.search(r'(.*) manufactured by', s)
    regex_10 = re.search(r'(.*) (recalled)? by', s)
    regex_11 = re.search(r'(.*) recalled due', s)
    if regex_1:
        return regex_1.group(1).capitalize()
    elif regex_2:
        return regex_2.group(1).capitalize()
    elif regex_3:
        return regex_3.group(1).capitalize()
    elif regex_4:
        return regex_4.group(1).capitalize()
    elif regex_5:
        return regex_5.group(1).capitalize()
    elif regex_6:
        return regex_6.group(1).capitalize()
    elif regex_7:
        return regex_7.group(1).capitalize()
    elif regex_8:
        return regex_8.group(1).capitalize()
    elif regex_9:
        return regex_9.group(1).capitalize()
    elif regex_10:
        return regex_10.group(1).capitalize()
    elif regex_11:
        return regex_11.group(1).capitalize()    
    else:
        return np.nan

In [8]:
def extract_hazard_type(s: str):
    regex_1 = re.search(r'(.*?) -', s)
    regex_2 = re.search(r'(.*?),', s)
    if regex_1:
        return regex_1.group(1).capitalize()
    elif regex_2:
        return regex_2.group(1).capitalize()
    else:
        return s

In [9]:
def extract_distribution(s: str):
    regex_1 = re.search(r'may have been (.*)', s)
    regex_2 = re.search(r'sold in (.*)', s)
    regex_3 = re.search(r'sold (.*)', s)
    if regex_1:
        return regex_1.group(1).capitalize()
    elif regex_2:
        return regex_2.group(1).capitalize()
    elif regex_3:
        return regex_3.group(1).capitalize()    
    else:
        return s

In [10]:
recalls_p.head()

Unnamed: 0,index,title,recall_info,date,company,issue,audiences,category,recall_class
0,0,Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg,Mrakovic Fine Foods brand Natural Chicken Burger recalled due to undeclared egg. The recalled product has been sold in Ontario.,2022-08-24,Mrakovic Fine Foods,Allergen - Egg,Retail,Meat and poultry - Frozen,Class 2
1,1,"Life120 brand ""Ricotta Di Bufala Campana Dop"" recalled due to generic E. coli","Life120 brand ""Ricotta Di Bufala Campana Dop"" recalled due to generic E. coli. The recalled product has been sold in Quebec.",2022-08-24,Life120,Microbial Contamination - E. Coli - non-pathogenic,Retail,Dairy,Class 2
2,2,"Food Recall Warning (Allergen) – Mastro San Daniele brand Charcuterie Trio – Prosciutto Cotto, Capocollo, Mortadella recalled due to undeclared milk","The affected product is being recalled from the marketplace because it contains milk which is not declared on the label. The recalled product has been sold in British Columbia, Alberta, Ontario, New Brunswick, Nova Scotia, and may have been…",2022-08-19,Mastro San Daniele,Allergen - Milk,General public,Meat and poultry - Processed,Class 1
3,3,Food Recall Warning (Allergen) – Cache Cuisine brand Pro-licious Protein Pancake & Waffle Mix recalled due to improperly undeclared milk,The affected product is being recalled from the marketplace because it contains milk which is improperly declared on the label. The recalled product has been sold nationally.,2022-08-19,Cache Cuisine,Allergen - Milk,General public,Grain products,Class 2
4,4,Delizia brand Vegetarian Ham recalled due to undeclared gluten and wheat,"Delizia brand Vegetarian Ham recalled due to undeclared gluten and wheat. The recalled product has been sold in Alberta, British Columbia, Manitoba and Ontario.",2022-08-19,Delizia,Allergen - Gluten,Retail,,


### Extract Hazard, Product, Brand, Distribution and Hazard Type

In [11]:
recalls_p['title_lower'] = recalls_p['title'].str.lower()

# Extracting distribution from recall_info column using regex matching function
recalls_p['distribution'] = recalls_p['recall_info'].apply(extract_distribution)
recalls_p['distribution'] = recalls_p['distribution'].str.replace('.', '').str.replace('and', '').str.replace('online', '').str.strip()

# Create product column with extracted values from the title column
recalls_p['product'] = recalls_p['title'].apply(extract_product)

# Create brand column with extracted values from the title column
recalls_p['brand'] = recalls_p['title'].str.extract(r'(.*?) brand')


# Extracting hazard type from issue column using regex function
recalls_p['hazard_type'] = recalls_p['issue'].apply(extract_hazard_type)

# Extracting hazard from issue column using regex function
recalls_p['hazard'] = recalls_p['issue'].apply(extract_hazard)

  recalls_p['distribution'] = recalls_p['distribution'].str.replace('.', '').str.replace('and', '').str.replace('online', '').str.strip()


### Replace Values by Slice

In [12]:
recalls_p['distribution'].loc[recalls_p['distribution'].str.contains(r'Distributed', na=False)] = 'Nationally'
recalls_p['hazard'].loc[recalls_p['hazard'].str.contains(r'non-pathogenic|other pathogenic', na=False)] = 'E. coli'
recalls_p['hazard'].loc[recalls_p['hazard'].str.contains(r'Listeria', na=False)] = 'Listeria monocytogenes'
recalls_p['hazard'].loc[recalls_p['hazard'].str.contains(r'[Qq]uality', na=False)] = 'Quality/Spoilage'
recalls_p['audiences'].loc[recalls_p['audiences'].str.contains(r'Hotel', na=False)] = 'Hotel/Restaurant/Institutional'
recalls_p['hazard'] = recalls_p['hazard'].str.replace('seeds', '')
recalls_p['distribution'] = recalls_p['distribution'].str.replace(r'  ', ',', regex=True).str.strip().str.title()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [13]:
recalls_p.isnull().sum()

index            0
title            0
recall_info      0
date             0
company          0
issue            0
audiences        0
category        15
recall_class    17
title_lower      0
distribution     0
product          0
brand           24
hazard_type      0
hazard           0
dtype: int64

---

In [14]:
# Select columns to keep
recalls_p.rename(columns={'audiences': 'audience'}, inplace=True)

recalls_p_clean = recalls_p[['index', 'title', 'date', 'hazard',
                   'hazard_type','recall_class','audience',
                   'distribution','brand','company','product']]

In [15]:
recalls_p_clean.shape

(193, 11)

In [16]:
# recalls_p_clean.to_csv('recalls-clean-2022-1.csv')

## Reference

In [17]:
# recalls_p.distribution.value_counts()

In [18]:
# print(recalls_p.audiences.value_counts())

In [19]:
# recalls_p.hazard.value_counts()

In [20]:
# print(recalls_p.category.value_counts())

In [21]:
# print(recalls_p.recall_class.value_counts())