# 数据清洗模板

## 收集

In [None]:
import pandas as pd
import zipfile

In [None]:
# Unpack every member of the downloaded archive into the current working directory.
archive_name = 'armenian-online-job-postings.zip'
with zipfile.ZipFile(archive_name, mode='r') as myzip:
    myzip.extractall()

In [None]:
# Load the comma-separated job-postings file into a DataFrame.
df = pd.read_csv('online-job-postings.csv')

## 评估

In [None]:
# Display the raw DataFrame for a visual assessment.
df

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values.
df.info()

- 缺失值 (NaN)
- 起始日期不一致 (ASAP)
- 修改非描述列标题 (ApplicationP、AboutC、RequiredQual 以及 JobRequirment)

## 清理
#### 定义
- 选择所有描述性不足或拼写错误的列标题 (ApplicationP、AboutC、RequiredQual、JobRequirment) 并替换为完整单词 (ApplicationProcedure、AboutCompany、RequiredQualifications、JobRequirements)
- 选择起始日期列中含有 "As soon as possible"、"Immediately" 等词的所有记录，并将这些内容替换为 "ASAP"

#### 代码

In [None]:
# Clean a copy so the originally loaded DataFrame `df` is left unmodified.
df_clean = df.copy()

- 选择所有描述性不足或拼写错误的列标题 (ApplicationP、AboutC、RequiredQual、JobRequirment) 并替换为完整单词 (ApplicationProcedure、AboutCompany、RequiredQualifications、JobRequirements)

In [None]:
# Map each truncated or misspelled header to its full, descriptive name.
header_fixes = {
    'ApplicationP': 'ApplicationProcedure',
    'AboutC': 'AboutCompany',
    'RequiredQual': 'RequiredQualifications',
    'JobRequirment': 'JobRequirements',
}
df_clean = df_clean.rename(columns=header_fixes)

- 选择起始日期列中含有 "As soon as possible"、"Immediately" 等词的所有记录，并将这些内容替换为 "ASAP"

In [None]:
# Free-text StartDate values (including misspellings and case variants) that
# are to be normalised to the single label 'ASAP'.
# NOTE(review): a few entries carry literal double-quote characters; confirm
# they match the values pandas actually parsed from the CSV, since the CSV
# reader normally strips field quoting.
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']

# Collapse every "start immediately" variant into the single label 'ASAP'.
# Series.replace accepts the whole list at once, so no per-phrase loop is
# needed. The result is assigned back to the column rather than calling
# replace(..., inplace=True) on df_clean.StartDate: that chained in-place
# call operates on a temporary Series, which emits a FutureWarning on
# pandas 2.x and leaves df_clean unchanged once Copy-on-Write is the
# default (pandas 3.0).
df_clean['StartDate'] = df_clean['StartDate'].replace(asap_list, 'ASAP')

#### 测试

In [None]:
# Print the column summary of the cleaned DataFrame to check the renamed headers.
df_clean.info()

In [None]:
# Tally the distinct StartDate values to inspect the result of the normalisation.
df_clean['StartDate'].value_counts()

In [None]:
# After cleaning, none of the original free-text variants may remain in the column.
leftover_mask = df_clean["StartDate"].isin(asap_list)
assert not leftover_mask.any()