# Data Wrangling Template

## Gather

In [1]:
import zipfile
import pandas as pd

In [2]:
# Extract all contents of the zip archive into the current working directory
archive_name = 'armenian-online-job-postings.zip'
with zipfile.ZipFile(archive_name, mode='r') as myzip:
    myzip.extractall()


## Assess

In [3]:
# Load the extracted job-postings CSV into a DataFrame
csv_path = 'online-job-postings.csv'
df = pd.read_csv(csv_path)

In [4]:
# Preview the first ten rows for a visual assessment of the columns and values
df.head(10)

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True
5,"Boutique ""Appollo""\r\nJOB TITLE: Saleswoman\r...","Jan 10, 2004",Saleswoman,"Boutique ""Appollo""",,,,,,,...,,"For further information, please contact Irina\...",,01 February 2004,,,,2004,1,False
6,OSI Assistance Foundation - Armenian Branch Of...,"Jan 11, 2004",Chief Accountant/ Finance Assistant,OSI Assistance Foundation - Armenian Branch Of...,,,,,,,...,,"For submission of applications/ CVs, please\r\...",,"16 January 2004, 6:00 pm.",,,,2004,1,False
7,International Research & Exchanges Board (IREX...,"Jan 13, 2004",Non-paid part or full time Programmatic Intern,International Research & Exchanges Board (IREX),,,,,,6 months,...,,"To apply, please download and submit the\r\nap...",,16 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
8,Yerevan Brandy Company \r\nJOB TITLE: Assista...,"Jan 13, 2004",Assistant to Managing Director,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"27 January 2004, 18:00",,,,2004,1,False
9,American Embassy Yerevan\r\nANNOUNCEMENT NUMBE...,"Jan 13, 2004","Program Assistant (INL), FSN-8; FP-6*",American Embassy Yerevan\r\nANNOUNCEMENT NUMBE...,,,,,,,...,,Interested candidates for this position should...,,26 January 2004 \r\nDrafted: GSargsyan\r\nC...,,,,2004,1,False


<h2>Issues Noted</h2>
<ul>
    <li>Missing values (NaN)</li>
    <li>Inconsistent entries for 'StartDate' [ASAP, As soon as possible, Immediately]</li>
    <li>Non-descriptive column headers</li>
</ul>


In [5]:
# Assess the data for quality issues: missing, invalid, inconsistent,
# or inaccurate values.
# Tidy-data checklist: each variable forms a column, each observation forms
# a row, and each observational unit forms a table.
# info() lists every column with its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost             19001 non-null object
date                19001 non-null object
Title               18973 non-null object
Company             18994 non-null object
AnnouncementCode    1208 non-null object
Term                7676 non-null object
Eligibility         4930 non-null object
Audience            640 non-null object
StartDate           9675 non-null object
Duration            10798 non-null object
Location            18969 non-null object
JobDescription      15109 non-null object
JobRequirment       16479 non-null object
RequiredQual        18517 non-null object
Salary              9622 non-null object
ApplicationP        18941 non-null object
OpeningDate         18295 non-null object
Deadline            18936 non-null object
Notes               2211 non-null object
AboutC              12470 non-null object
Attach              1559 non-null object
Year              

In [None]:
# Fix non-descriptive column header names [ApplicationP, AboutC, IT, AnnouncementCode]

In [27]:
# Count how often each distinct StartDate value occurs to expose inconsistent spellings
df['StartDate'].value_counts()

ASAP                              4754
Immediately                        773
As soon as possible                543
Upon hiring                        261
Immediate                          259
                                  ... 
Approximately 06 February 2012       1
15 February 2012                     1
14 October 2005, 13:00 p.m.          1
End of June                          1
11 October 2010                      1
Name: StartDate, Length: 1186, dtype: int64

## Clean

#### Define

<ul>
    <li>Replace inconsistent 'StartDate' values with 'ASAP'</li>
    <li>Fix non-descriptive column headers</li>
</ul>

#### Code

1. Fix non-descriptive column headers

In [14]:
# Mapping from each abbreviated or misspelled header to its descriptive name
col = dict(
    ApplicationP='ApplicationProcedure',
    AboutC='AboutCompany',
    RequiredQual='RequiredQualifications',
    JobRequirment='JobRequirement',
)

# Clean a copy so the raw frame `df` stays untouched, renaming in the same step
dfClean = df.copy().rename(columns=col)



#### Test

In [26]:
# Confirm that the renamed headers appear in the column listing
dfClean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost                   19001 non-null object
date                      19001 non-null object
Title                     18973 non-null object
Company                   18994 non-null object
AnnouncementCode          1208 non-null object
Term                      7676 non-null object
Eligibility               4930 non-null object
Audience                  640 non-null object
StartDate                 9675 non-null object
Duration                  10798 non-null object
Location                  18969 non-null object
JobDescription            15109 non-null object
JobRequirement            16479 non-null object
RequiredQualifications    18517 non-null object
Salary                    9622 non-null object
ApplicationProcedure      18941 non-null object
OpeningDate               18295 non-null object
Deadline                  18936 non-null object
Notes                     2211 non

#### Code
2. Replace inconsistent 'StartDate' values with 'ASAP'

In [27]:
# Free-text 'StartDate' entries (typos, casing and punctuation variants,
# "<date> or ASAP" forms) that are all treated as meaning "as soon as possible".
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']

# NOTE(review): three entries above embed literal double quotes, which looks
# like CSV quoting copied into the string -- TODO confirm against the data.
# Their unquoted forms are added as well so the plain text is matched too.
asap_list += [variant.strip('"') for variant in asap_list if variant.startswith('"')]

# Collapse every variant into 'ASAP' with one vectorized call and assign the
# result back to the column. Calling replace(..., inplace=True) on the
# attribute-accessed Series (dfClean.StartDate) is a chained in-place update:
# recent pandas releases warn about it, and under Copy-on-Write it does not
# modify dfClean at all.
dfClean['StartDate'] = dfClean['StartDate'].replace(to_replace=asap_list, value='ASAP')

#### Test

In [28]:
# Re-count StartDate values; the variants listed in asap_list should no longer appear
dfClean['StartDate'].value_counts()

ASAP                       6856
01 September 2012            31
March 2006                   27
November 2006                22
January 2010                 19
                           ... 
Beginning of March 2006       1
27 February 2012              1
08 October 2012               1
21 December 2004              1
23 July 2012                  1
Name: StartDate, Length: 1140, dtype: int64

In [33]:
# Verify that none of the raw variants in asap_list survived the replacement.
# `value in series` tests the Series *index labels* (an integer RangeIndex
# here), not its values, so that form passes no matter what the column holds.
# Membership is therefore checked against the values with isin().
startDates = dfClean['StartDate']
leftover = startDates[startDates.isin(asap_list)]
assert leftover.empty, (
    "StartDate variants not replaced: %r" % sorted(leftover.unique())
)
    