In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import itertools
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords

%matplotlib inline

# Show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)

# Dataset and parsing/cleaning of data

In [None]:
# Load the master CSV (path is relative to the working directory) into `data`
data = pd.read_csv('mycareersfuture_master.csv')

In [None]:
# Parse the salary columns
salary_cols = ['SalaryLower', 'SalaryUpper']


def _extract_salary_type(raw_value):
    """Return the first run of non-digit characters in raw_value, or NaN if there is none."""
    match = re.search(r'\D+', raw_value)
    return match.group() if match else np.nan


# Remove dollar sign and comma.
# The loop variable is deliberately not called `text`: that name is also the
# sklearn module imported at the top of the notebook.
for col in salary_cols:
    data[col] = [re.sub(r'[$,]', '', raw_value) for raw_value in data[col]]

# Extract the non-numeric text of SalaryUpper into a new column 'SalaryType'
# (digit-only values have no such text and get NaN instead of raising)
data['SalaryType'] = data['SalaryUpper'].apply(_extract_salary_type)

# Keep only the leading digits of SalaryUpper ('' when the value does not start with a digit)
data['SalaryUpper'] = data['SalaryUpper'].apply(lambda raw_value: re.match(r'\d*', raw_value).group())

# Replace the text 'Salary undisclosed' with NaN
data[['SalaryLower', 'SalaryType']] = data[['SalaryLower', 'SalaryType']].replace('Salary undisclosed', np.nan)

# Replace '' in SalaryUpper with NaN
data['SalaryUpper'] = data['SalaryUpper'].replace('', np.nan)

# Set the salary columns to numeric
data[salary_cols] = data[salary_cols].astype('float32')

# Convert the annual salary data into monthly salary data, then relabel those rows.
# The mask is computed once, before SalaryType is overwritten.
is_annual = data['SalaryType'] == 'Annually'
data.loc[is_annual, salary_cols] = data.loc[is_annual, salary_cols] / 12
data.loc[is_annual, 'SalaryType'] = 'Monthly'

# Create SalaryMean which is the mean of SalaryLower and SalaryUpper
data['SalaryMean'] = (data['SalaryLower'] + data['SalaryUpper']) / 2.

In [None]:
# Number of rows per EmploymentType value, most frequent first
data['EmploymentType'].value_counts()

# Exclude all those job listings that are temporary, freelance, internship

In [None]:
# Employment Types to drop: everything from position 7 onwards in the
# frequency-ordered value_counts() index.
# NOTE(review): the cut-off 7 depends on the ordering of the counts in this dataset - confirm.
emp_type_list = data['EmploymentType'].value_counts().index[7:]

# Keep only the rows whose Employment Type is not in that list
data = data[~data['EmploymentType'].isin(emp_type_list)].copy()

In [None]:
# Combine RoleDesc and Requirements into one text column.
# A missing value on either side is treated as empty text; without this the
# whole JD would be NaN, which cannot be tokenised later.
data['JD'] = data['RoleDesc'].fillna('') + ' ' + data['Requirements'].fillna('')

In [None]:
# Parse dummy variables for columns, JobCategories, EmploymentType, Seniority
col_list = ['JobCategories', 'EmploymentType', 'Seniority']

for col in col_list:
    # Split every cell on commas into a set of stripped labels.
    # Missing cells and empty fragments contribute no label.
    label_sets = data[col].fillna('').apply(
        lambda cell: set(part.strip() for part in cell.split(',') if part.strip())
    )

    # Sorted unique labels of this column; sorting fixes the order in which
    # the dummy columns are appended to the frame.
    value_list = sorted(set(itertools.chain.from_iterable(label_sets)))

    # Create dummy variables for the column.
    # Membership is tested on whole labels. A substring/regex test such as
    # str.contains would also flag rows that only hold a longer label
    # containing this one.
    for value in value_list:
        if value not in data.columns:
            data[value] = 0.0
        has_value = label_sets.apply(lambda labels, wanted=value: wanted in labels).astype(bool)
        data.loc[has_value, value] = 1
    

## Question 1

In [None]:
# Distribution of SalaryMean: histogram (15 bins) on the left, box plot on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
data['SalaryMean'].hist(ax=ax1, bins=15)
data.boxplot(column='SalaryMean', ax=ax2)

In [None]:
# Independent copy of the rows that have a SalaryMean value
data1 = data[data['SalaryMean'].notnull()].copy()

In [None]:
# As the distribution of salary is right skewed with a long right tail,
# classify the salary into 2 groups - Above Median (1) and Median and Below (0)

# Compute the median once rather than once per row
salary_median = data1['SalaryMean'].median()
data1['SalaryLabel'] = (data1['SalaryMean'] > salary_median).astype(int)

In [None]:
# check correlation of dummy variables with target
# NOTE(review): the positional slices assume the dummy columns sit at 15-46
# (JobCategories), 47-49 (EmploymentType) and 50+ (Seniority) - confirm against data1.columns.
salary_label = data1['SalaryLabel']

sector_var = pd.concat([data1.iloc[:, 15:47], salary_label], axis=1)
emptype_var = pd.concat([data1.iloc[:, 47:50], salary_label], axis=1)

# The open-ended slice reaches the end of the frame, where SalaryLabel was
# appended, so drop it before re-attaching the target. Otherwise the frame
# would carry two SalaryLabel columns.
senior_features = data1.iloc[:, 50:].drop(columns='SalaryLabel', errors='ignore')
senior_var = pd.concat([senior_features, salary_label], axis=1)

In [None]:
# Annotated correlation heatmap of the columns in senior_var
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(senior_var.corr(), annot=True, ax=ax)

# the seniority features have little correlation with the target variable SalaryLabel

In [None]:
# Annotated correlation heatmap of the columns in emptype_var
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(emptype_var.corr(), annot=True, ax=ax)

# the EmploymentType features have little correlation with the target variable SalaryLabel

In [None]:
# Correlation matrix of the sector frame. Its last column is SalaryLabel
# (concatenated last when sector_var was built), so this prints every
# column's correlation with the target.
sector_corr = sector_var.corr()
print(sector_corr.iloc[:, -1])

# the sector features have little correlation with the target variable SalaryLabel

Hence, to predict the salary classification label, use only the job description  
and job requirements, and discard the other features.

### Set up predictor and target variables, then split the data into train and test sets

In [None]:
# Predictor x is a copy of the JD text column; target y is a copy of SalaryLabel
x, y = data1['JD'].copy(), data1['SalaryLabel'].copy()

In [None]:
# Shape of the target vector
y.shape

In [None]:
# baseline accuracy: share of rows labelled 0, i.e. the accuracy of always predicting 0
baseline_accuracy = 1. - y.mean()
print(baseline_accuracy)

In [None]:
# Hold out 30% of the rows as the test set; the seed is fixed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=29
)


## Try first model of LogisticRegression to predict if the salary of a job posting will be above median or below median  
Use Logistic Regression, as it is a simple yet powerful classification model

In [None]:
# First model of Logistic Regression

# Tokenise the x_train which is job description and job requirements
# (word n-grams of length 1 to 4)
cvt = CountVectorizer(ngram_range=(1,4))
train_counts = cvt.fit_transform(x_train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back otherwise.
if hasattr(cvt, 'get_feature_names_out'):
    feature_names = cvt.get_feature_names_out()
else:
    feature_names = cvt.get_feature_names()

# toarray() yields a plain ndarray (todense() yields the discouraged np.matrix)
tokens_train = pd.DataFrame(train_counts.toarray(), columns=feature_names)

# Tokenise the x_test using the same trained CountVectorizer
tokens_test = pd.DataFrame(cvt.transform(x_test).toarray(), columns=feature_names)

lr = LogisticRegression(random_state=29)
lr.fit(tokens_train, y_train)

In [None]:
# 5-fold cross-validation of the Logistic Regression model on the training tokens
scores = cross_val_score(lr, tokens_train, y_train, cv=5, n_jobs=-1, verbose=1)
mean_score = scores.mean()
print('Cross validation score: {}'.format(scores))
print('Mean score: {}'.format(mean_score))

# Score of the already-fitted model on the held-out test tokens
test_score = lr.score(tokens_test, y_test)
print('Score on test data: {}'.format(test_score))

In [None]:
# Grid search over C and the solver for the Logistic Regression model (3-fold CV)
params = dict(
    C=[0.2, 0.4, 0.6, 0.8, 1.0],
    solver=['newton-cg', 'lbfgs', 'sag'],
)

lr_gs = GridSearchCV(lr, param_grid=params, cv=3, n_jobs=-1, verbose=1)
lr_gs.fit(tokens_train, y_train)

In [None]:
# Second model: a RandomForest classifier fitted on the same training tokens
rfc = RandomForestClassifier(n_jobs=-1, random_state=29, verbose=1).fit(tokens_train, y_train)
rfc


In [None]:
# Cross-validate the RandomForest model.
# This cell previously scored `salary_pipe`, which no visible cell defines.
# It now scores the RandomForest fitted above on the tokenised features,
# mirroring the Logistic Regression evaluation.
scores = cross_val_score(rfc, tokens_train, y_train, cv=5, n_jobs=-1)
print('Cross validation score: {}'.format(scores))
print('Mean score: {}'.format(np.mean(scores)))

# Score on test data
print('Score on test data: {}'.format(rfc.score(tokens_test, y_test)))


In [None]:
# Tuning of RandomForest model for better results
params = {
    'n_estimators' : list(range(1, 11, 2)),
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [1, 2, 3, None]
}

rf_gs = GridSearchCV(RandomForestClassifier(random_state=29), param_grid=params, cv=3, n_jobs=-1)
# Fit on the token counts: x_train holds the raw JD text, which the forest cannot consume
rf_gs.fit(tokens_train, y_train)

## Question 2
Set up the target variable as IT sector versus non-IT sector jobs and build a classification model  
to predict it

In [None]:
# Set up y variable to the binary column "Information Technology"
# (taken from `data`, not from the salary-filtered `data1`)
y_2 = data['Information Technology'].copy()


In [None]:
# Preview the first rows of the working frame
data.head()

In [None]:
# Lower-case job-title keywords.
# NOTE(review): no visible cell uses these lists yet - presumably meant for matching against JobTitle.
job_title_list = [
    'analyst', 'manager', 'consultant', 'fellow', 'specialist', 'director',
    'designer', 'lead', 'avp', 'vp', 'svp', 'developer',
    'assistant', 'junior', 'engineer', 'scientist', 'software', 'head',
    'architect', 'chief', 'officer', 'administrator', 'executive', 'deputy',
]

# Spelling variants grouped per concept
snr_list = ['senior', 'snr', 'sr']
ass_list = ['assoc', 'associate']
research = ['research', 'researcher']

In [None]:
# Distinct JobTitle values
data['JobTitle'].unique()

In [None]:
# List of columns not required
# (a stray closing parenthesis after the list made this cell a SyntaxError)
col_drop_list = ['Company', 'SalaryLower', 'SalaryUpper', 'OfficeLocation', 'EmploymentType', 'Seniority',
                 'JobCategories', 'RoleDesc', 'Requirements', 'mcfURL', 'SalaryType', 'no information']


In [None]:
# Preview the first rows of the working frame
data.head()

In [None]:
# NOTE(review): job_cat_list is not defined in any visible cell, so this raises
# NameError on a fresh kernel - TODO define it above or remove this cell.
job_cat_list

In [None]:
# Distinct raw JobCategories strings
data['JobCategories'].unique()

In [None]:
# Preview the first rows of the working frame
data.head()

In [None]:
# Drop two columns that are no longer needed.
# errors='ignore' keeps the cell re-runnable: a second run would otherwise
# raise KeyError because the columns are already gone.
data = data.drop(columns=['mcfURL', 'SalaryType'], errors='ignore')

In [None]:
# Preview the frame after the column drop
data.head()

In [None]:
# Preview the first rows of the working frame (repeats the previous cell)
data.head()

### Hypothesis 1:  
Build a linear regression model to predict the mean salary value  

Attempt 1 : Used job title to predict salary -> Result was bad, model does not predict well at all  

### Hypothesis 2:  
Build a classification model to predict the label

In [None]:
# Hypothesis 1: predict salary
# NOTE(review): `df` is not defined in any visible cell - confirm where it is loaded.

# merge job description and job requirement together; the space separator keeps
# the last word of the description from fusing with the first word of the requirement
x_data = df['job_description'] + ' ' + df['job_requirement']

# tokenise job title separately
title_data = df['job_title'].copy()

# set up target variable (this rebinds `y`, replacing the classification target from Question 1)
y = df['salary_mean'].copy()


In [None]:
# NLTK's English stop-word list, passed to the CountVectorizer in the next code cell
stop_words = stopwords.words('english')


In [None]:
# tokenise job details (word n-grams of length 1 to 4, English stop words removed)
cvt = CountVectorizer(ngram_range=(1,4), stop_words=stop_words)
token_counts = cvt.fit_transform(x_data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back otherwise.
if hasattr(cvt, 'get_feature_names_out'):
    feature_names = cvt.get_feature_names_out()
else:
    feature_names = cvt.get_feature_names()

# toarray() yields a plain ndarray (todense() yields the discouraged np.matrix)
tokens = pd.DataFrame(token_counts.toarray(), columns=feature_names)


In [None]:
# (number of documents, number of distinct n-grams)
tokens.shape

In [None]:
# Total occurrences of every word/n-gram (column sums), most frequent first;
# display the 10 most frequent
token_count = tokens.sum().sort_values(ascending=False)
token_count.head(10)

In [None]:
# Split train and test data (10% held out).
# The seed is fixed, as in the earlier split, so the result is reproducible.
x_train, x_test, y_train, y_test = train_test_split(tokens, y, test_size=0.1, random_state=29)

In [None]:
# Fit a linear regression on the training tokens (this rebinds `lr`)
lr = LinearRegression()
lr.fit(x_train, y_train)

# Mean score over 3-fold cross-validation on the training data
fold_scores = cross_val_score(lr, x_train, y_train, cv=3, n_jobs=-1)
print('Cross val score: {}'.format(fold_scores.mean()))

In [None]:
# Score of the fitted regression on the held-out test data
lr.score(x_test, y_test)

In [None]:
# Tokenise job description and job requirement, keeping only the n-grams that
# appear in at least 10% and at most 90% of the documents
cvt = CountVectorizer(ngram_range=(1,4), min_df=0.1, max_df=0.9)
x_counts = cvt.fit_transform(x_data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version has it and fall back otherwise.
if hasattr(cvt, 'get_feature_names_out'):
    feature_names = cvt.get_feature_names_out()
else:
    feature_names = cvt.get_feature_names()

# toarray() yields a plain ndarray (todense() yields the discouraged np.matrix)
x_tokens = pd.DataFrame(x_counts.toarray(), columns=feature_names)

In [None]:
# Total occurrences of every n-gram (column sums), most frequent first
phrase_count = x_tokens.sum().sort_values(ascending=False)

In [None]:
# NOTE(review): `df` is not defined in any visible cell - confirm where it is loaded.
df.shape

# Philly's dataset

In [None]:
# Load the second dataset (path is relative to the working directory)
df_philly = pd.read_csv('jobs_900.csv')

In [None]:
# Preview the first rows of the second dataset
df_philly.head()

In [None]:
# Draw a random sample of 650 rows; the seed is fixed so the same rows are picked on every run
df_ph = df_philly.sample(n=650, axis=0, random_state=29)

In [None]:
# Shape of the sampled frame
df_ph.shape

In [None]:
# Value in the first column of the row at position 145 of the sample
df_ph.iloc[145,0]