In [51]:
import requests
import numpy as np
import pandas as pd
import yaml
from kaggle import Kaggle
import zipfile
from io import BytesIO, StringIO
import re
import csv
from datetime import datetime
import collections

In [2]:
def load_settings():
    with open('config.yaml', 'r') as sf:
        settings = yaml.load(sf.read())
    return settings

In [3]:
# Download dataset process
DATA_URL = "https://www.kaggle.com/madhab/jobposts/downloads/data%20job%20posts.csv"
SETTINGS = load_settings()

In [5]:
kaggle = Kaggle(SETTINGS)
response = kaggle.get_data(DATA_URL)

Login to Kaggle
Getting dataset


In [6]:
archive_output = response.content
archive = zipfile.ZipFile(BytesIO(archive_output), 'r')

In [8]:
unicoded = unicode(archive.read('data job posts.csv'), 'utf-8')

In [9]:
sio = StringIO(unicoded, newline=None)
io_data = sio.readlines()

In [14]:
kaggle_data = kaggle.to_array(io_data)

Transform data from text to array


In [38]:
'''
Extract the following fields from the jobpost column:
1. Job Title
2. Position Duration
3. Position Location
4. Job Description
5. Job Responsibilities
6. Required Qualifications
7. Remuneration
8. Application Deadline
9. About Company
'''

Jobpost = []
list_extraction = ['job title', 'position duration', 'position location', 
                   'job description', 'job responsibilites', 'required qualifications',
                   'remuneration', 'application deadline', 'about company']
for i, item in enumerate(kaggle_data):
    if i == 0:
        continue
    jobpost = item[0].split('\n')
    extract_data = {}
    for data in jobpost:
        for name in list_extraction:
            try:
                splitted = name.upper().split(' ')
                regex = splitted[1]
                if re.search(regex, data):
                    extract_data[name.replace(' ', '_')] = data
            except:
                if re.search(name.upper(), data):
                    extract_data[name] = data
    Jobpost.append(extract_data)

In [39]:
# Extracted data from jobpost for 5 sample
print(Jobpost[:5])

[{'position_location': 'POSITION LOCATION: Yerevan, Armenia', 'application_deadline': 'APPLICATION DEADLINE:   26 January 2004', 'required_qualifications': 'REQUIRED QUALIFICATIONS:  To perform this job successfully, an', 'job_description': 'JOB DESCRIPTION:   AMERIA Investment Consulting Company is seeking a', 'job_title': 'JOB TITLE:  Chief Financial Officer'}, {'required_qualifications': 'REQUIRED QUALIFICATIONS:  ', 'position_duration': 'DURATION:  3 months', 'remuneration': 'REMUNERATION:  Commensurate with experience.', 'application_deadline': 'APPLICATION DEADLINE:   12 January 2004', 'position_location': 'LOCATION:  IREX Armenia Main Office; Yerevan, Armenia ', 'job_description': 'DESCRIPTION:   IREX currently seeks to fill the position of a paid', 'about_company': 'ABOUT COMPANY:   The International Research & Exchanges Board (IREX) is', 'job_title': 'TITLE:   Full-time Community Connections Intern (paid internship)'}, {'required_qualifications': 'REQUIRED QUALIFICATIONS:  ', 

In [70]:
index_year = kaggle_data[0].index('Year')
max_year = max([item[index_year] for i, item in enumerate(kaggle_data) if i > 0])
company_year = [(x[0].split('\n')[0], x[index_year]) for i, x in enumerate(kaggle_data) if i > 0]

# 5 Sample of company and year
print(company_year[:5])

# Maximum year recorded in the post
print('Maximum year: {}'.format(max_year))

[('AMERIA Investment Consulting Company', '2004'), ('International Research & Exchanges Board (IREX)', '2004'), ('Caucasus Environmental NGO Network (CENN)', '2004'), ('Manoff Group', '2004'), ('Yerevan Brandy Company', '2004')]
Maximum year: 2015


In [77]:
grouped = collections.defaultdict(list)
for company, year in company_year:
    if ((int(max_year) - int(year)) <= 2):
        grouped[company].append(1)
        
analyzed = [(company, sum(count)) for company, count in grouped.iteritems()]
company_most_ads = max(analyzed, key=lambda item:item[1])

In [78]:
# most company with ads in the last 2 years
print(company_most_ads)

('ArmenTel CJSC', 127)


In [80]:
index_month = kaggle_data[0].index('Month')
grouped_month = collections.defaultdict(list)
all_month = [item[index_month] for i, item in enumerate(kaggle_data) if i > 0]
for month in all_month:
    grouped_month[month].append(1)

analyzed_month = [(month, sum(count)) for month, count in grouped_month.iteritems()]
month_most_ads = max(analyzed_month, key=lambda item:item[1])

In [81]:
# Month with largest number of job ads
print(month_most_ads)

('3', 1702)
