In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import PowerTransformer
import re
import textstat
import stopwords
import seaborn as sns
from textatistic import Textatistic
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag
from textblob import TextBlob

In [2]:
test_df = pd.read_csv("../Data/test_data.csv")
train_df = pd.read_csv("../Data/train_data.csv")

In [3]:
combined_df = pd.concat([train_df, test_df], axis=0)

In [4]:
columns_to_keep = ['job_id', 'required_experience', 'required_education', 'requirements', 'fraudulent']

# Drop columns that are not in the list
processed_df = combined_df.copy()[columns_to_keep]

### Categorical Features (included with Null vs Not Null)

In [5]:
values_to_map = [
    "Master's Degree",
    'Some College Coursework Completed',
    'Vocational',
    'Certification',
    'Associate Degree',
    'Professional',
    'Doctorate',
    'Some High School Coursework',
    'Vocational - Degree',
    'Vocational - HS Diploma'
]

# replace the values with "Others"
processed_df['required_education'] = processed_df['required_education'].replace(values_to_map, 'Others')

In [6]:
values_to_map = [
    'Internship',
    'Executive',
    'Director'
]

# Replace the listed values with "Others"
processed_df['required_experience'] = processed_df['required_experience'].replace(values_to_map, 'Others')

In [7]:
one_hot_encoded_exp = pd.get_dummies(processed_df['required_experience'], prefix='Experience', dummy_na=True)
one_hot_encoded_edu = pd.get_dummies(processed_df['required_education'], prefix='Education', dummy_na=True)

one_hot_encoded_exp = one_hot_encoded_exp.astype(int)
one_hot_encoded_edu = one_hot_encoded_edu.astype(int)

processed_df = pd.concat([processed_df, one_hot_encoded_exp, one_hot_encoded_edu], axis=1)

### Sentiment Scores (Normalised range from -1 to 1)

In [8]:
processed_df['requirements'] = processed_df['requirements'].astype(str)

In [9]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
sentiment_results = processed_df['requirements'].apply(sentiments.polarity_scores)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\seewe\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [10]:
processed_df["req_sentiment"] = list(map(lambda x: x['compound'], sentiment_results))

In [11]:
processed_df.head(2)

Unnamed: 0,job_id,required_experience,required_education,requirements,fraudulent,Experience_Associate,Experience_Entry level,Experience_Mid-Senior level,Experience_Not Applicable,Experience_Others,Experience_nan,Education_Bachelor's Degree,Education_High School or equivalent,Education_Others,Education_Unspecified,Education_nan,req_sentiment
0,1500,,,requirements experience with builder1440 sales...,0,0,0,0,0,0,1,0,0,0,0,1,0.0
1,4043,Entry level,Bachelor's Degree,requirements high diploma or degree holder pre...,0,0,1,0,0,0,0,1,0,0,0,0,0.9559


### Split combined DF to train + test to prevent data leakage

In [12]:
train_df_encoded = processed_df[:len(train_df)]
test_df_encoded = processed_df[len(train_df):]

### Text Length (Log Transformed + Normalised range with Mean = 0 & SD = 1)

In [13]:
train_df_encoded["requirements_length"] = train_df_encoded['requirements'].apply(len)

# log transformation + normalisation to "requirements_length"
train_df_encoded['req_text_length_log'] = np.log1p(train_df_encoded['requirements_length'])

scaler = StandardScaler()
train_df_encoded['req_text_length_log_scaled'] = scaler.fit_transform(train_df_encoded[['req_text_length_log']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_encoded["requirements_length"] = train_df_encoded['requirements'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_encoded['req_text_length_log'] = np.log1p(train_df_encoded['requirements_length'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_encoded['req_text_

In [14]:
test_df_encoded["requirements_length"] = test_df_encoded['requirements'].apply(len)

# log transformation + normalisation to "requirements_length"
test_df_encoded['req_text_length_log'] = np.log1p(test_df_encoded['requirements_length'])

test_df_encoded['req_text_length_log_scaled'] = scaler.transform(test_df_encoded[['req_text_length_log']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_encoded["requirements_length"] = test_df_encoded['requirements'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_encoded['req_text_length_log'] = np.log1p(test_df_encoded['requirements_length'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_encoded['req_text_lengt

In [15]:
test_df_encoded.drop(["req_text_length_log", "requirements_length", "fraudulent", "requirements", "required_education", "required_experience", "job_id"], axis=1, inplace=True)
train_df_encoded.drop(["req_text_length_log", "requirements_length", "fraudulent", "requirements", "required_education", "required_experience", "job_id"], axis=1, inplace=True)

# Gerald's part is ok
# Use test_df_encoded and train_df_encoded

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_encoded.drop(["req_text_length_log", "requirements_length", "fraudulent", "requirements", "required_education", "required_experience", "job_id"], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_encoded.drop(["req_text_length_log", "requirements_length", "fraudulent", "requirements", "required_education", "required_experience", "job_id"], axis=1, inplace=True)


## Jethro's Columns

In [16]:
train = train_df.copy(deep=True)
test = test_df.copy(deep=True)
combined = pd.concat([train, test], axis=0)

columns_to_keep = ['location', 'description']
processed = combined.copy()[columns_to_keep]
# print(processed.tail())

processed = processed.reset_index(drop=True)
# print(processed.tail())


country_state_city = list(processed['location'].str.split(', ').values)
for idx, row in enumerate(country_state_city):
    if row is np.nan:
        country_state_city[idx] = [''] * 3
    elif len(row) > 3:
        country_state_city[idx] = row[:2] + ['/ '.join(row[2:])] #join everything after State as one entry delimited by /
        
    elif len(row) < 3:
        country_state_city[idx] += [''] * 2
loc_data = pd.DataFrame(country_state_city, columns=['Country', 'State', 'City'])

In [17]:
# one_hot_encoded_country = pd.get_dummies(loc_data['Country'], prefix='Country', dummy_na=True)
# one_hot_encoded_state = pd.get_dummies(loc_data['State'], prefix='State', dummy_na=True)
# one_hot_encoded_city = pd.get_dummies(loc_data['City'], prefix='City', dummy_na=True)

# # one_hot_encoded_country = one_hot_encoded_country.reset_index(drop=True)
# # one_hot_encoded_state = one_hot_encoded_state.reset_index(drop=True)
# # one_hot_encoded_city = one_hot_encoded_city.reset_index(drop=True)

# one_hot_encoded_country = one_hot_encoded_country.astype(int)
# one_hot_encoded_state = one_hot_encoded_state.astype(int)
# one_hot_encoded_city = one_hot_encoded_city.astype(int)


# processed = pd.concat([processed, one_hot_encoded_country, one_hot_encoded_state, one_hot_encoded_city], axis=1)

# processed.drop(columns = ["location"], inplace = True)


In [18]:
from category_encoders import CountEncoder

count_enc = CountEncoder(normalize=True)
count_encoded = count_enc.fit_transform(loc_data[['Country','State','City']])
count_encoded = count_encoded.add_suffix('_counts')
processed = pd.concat([processed,count_encoded,loc_data], axis=1)

processed.drop(columns = ["location"], inplace = True)

In [19]:
processed.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   description     17870 non-null  object 
 1   Country_counts  17880 non-null  float64
 2   State_counts    17880 non-null  float64
 3   City_counts     17880 non-null  float64
 4   Country         17880 non-null  object 
 5   State           17880 non-null  object 
 6   City            17880 non-null  object 
dtypes: float64(3), object(4)
memory usage: 977.9+ KB


In [20]:
processed.head()

Unnamed: 0,description,Country_counts,State_counts,City_counts,Country,State,City
0,job description job title business analystdura...,0.595973,0.05453,0.001566,US,TX,Fort Worth
1,our client is a leading exhibition and events ...,0.004306,0.144295,0.115604,HK,,
2,everything we build needs to work flawlessly f...,0.006376,0.144295,0.115604,IE,,
3,we are looking for passionate individuals who ...,0.133333,0.144295,0.115604,GB,,
4,you will interface with both customers and our...,0.595973,0.144295,0.115604,US,,


In [21]:
train_encoded = processed[:len(train)]
test_encoded = processed[len(train):]

In [22]:
# train_encoded.head()

In [23]:
# print(one_hot_encoded_country.shape)
# print(one_hot_encoded_city.shape)
# print(one_hot_encoded_state.shape)

### Generate Description sentence length column

In [24]:
train_encoded['description'].fillna('',inplace=True)
test_encoded['description'].fillna('',inplace=True)
train_encoded['sentence_length'] = train_encoded['description'].apply(lambda x: len(x.split()))
test_encoded['sentence_length'] = test_encoded['description'].apply(lambda x: len(x.split()))

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_encoded['sentence_length'] = train_encoded['sentence_length'].values.reshape(-1, 1)
test_encoded['sentence_length'] = test_encoded['sentence_length'].values.reshape(-1, 1)
scaler.fit(train_encoded[['sentence_length']])
train_encoded['sentence_length_scaled'] = scaler.transform(train_encoded[['sentence_length']])
test_encoded['sentence_length_scaled'] = scaler.transform(test_encoded[['sentence_length']])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded['description'].fillna('',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded['description'].fillna('',inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded['sentence_length'] = train_encoded['description'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [25]:
train_encoded.drop(columns = ["description"], inplace = True)
test_encoded.drop(columns = ["description"], inplace = True)

# Use train_encoded and test_encoded

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded.drop(columns = ["description"], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded.drop(columns = ["description"], inplace = True)


In [26]:
train_encoded.head()

Unnamed: 0,Country_counts,State_counts,City_counts,Country,State,City,sentence_length,sentence_length_scaled
0,0.595973,0.05453,0.001566,US,TX,Fort Worth,64,0.051864
1,0.004306,0.144295,0.115604,HK,,,162,0.13128
2,0.006376,0.144295,0.115604,IE,,,92,0.074554
3,0.133333,0.144295,0.115604,GB,,,173,0.140194
4,0.595973,0.144295,0.115604,US,,,63,0.051053


TJ's Columns

In [3]:
train_tj = train_df.copy(deep=True)
test_tj = test_df.copy(deep=True)
combined_tj = pd.concat([train_tj, test_tj], axis=0)

columns_to_keep = ["function", "industry", "benefits", "salary_range"]
processed_tj = combined_tj.copy()[columns_to_keep]
# print(processed.tail())

processed_tj = processed_tj.reset_index(drop=True)

In [8]:
processed_tj["function"].fillna("None", inplace = True)
processed_tj["industry"].fillna("None", inplace = True)

# Recategorise the industries to Others if the count is less than a 100 in the dataset
processed_tj["industry_no_na"] = processed_tj["industry"]
industry_value_counts = processed_tj["industry"].value_counts()
low_count_categories_industry = industry_value_counts[industry_value_counts < 100].index.tolist()
processed_tj["Industry"] = processed_tj["industry_no_na"].apply(lambda x: 'Others' if x in low_count_categories_industry else x)

# Recategorise the functions to Others if the count is less than a 100 in the dataset
function_value_counts = processed_tj["function"].value_counts()
low_count_categories_function = function_value_counts[function_value_counts < 100].index.tolist()
processed_tj["function_no_na"] = processed_tj["function"]
processed_tj["Function"] = processed_tj["function_no_na"].apply(lambda x: 'Others' if x in low_count_categories_function else x)

processed_tj.drop(columns = ["function_no_na", "industry_no_na", "function", "industry"], inplace = True)

In [9]:
processed_tj.head()

Unnamed: 0,benefits,salary_range,Industry,Function
0,,,,
1,,,Others,
2,,,Others,Information Technology
3,potential to earn an executive level income fu...,,,
4,base salary best in industrybenefits full,,Others,


In [10]:
one_hot_encoded_function = pd.get_dummies(processed_tj['Function'], prefix='Function', dummy_na=True)
one_hot_encoded_industry = pd.get_dummies(processed_tj['Industry'], prefix='Industry', dummy_na=True)


In [11]:
one_hot_encoded_function = one_hot_encoded_function.astype(int)
one_hot_encoded_industry = one_hot_encoded_industry.astype(int)

In [12]:
processed_tj = pd.concat([processed_tj, one_hot_encoded_function, one_hot_encoded_industry], axis=1)

In [13]:
processed_tj.drop(columns = ["Industry", "Function"])

Unnamed: 0,benefits,salary_range,Function_Accounting/Auditing,Function_Administrative,Function_Art/Creative,Function_Business Development,Function_Consulting,Function_Customer Service,Function_Design,Function_Education,...,Industry_Marketing and Advertising,Industry_None,Industry_Oil & Energy,Industry_Online Media,Industry_Others,Industry_Real Estate,Industry_Retail,Industry_Staffing and Recruiting,Industry_Telecommunications,Industry_nan
0,,,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,,,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,,,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,potential to earn an executive level income fu...,,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,base salary best in industrybenefits full,,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,,110000-120000,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
17876,see job description,,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
17877,,,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
17878,competitive base salarystock optionsfull benef...,,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
train_encoded_tj = processed_tj[:len(train_tj)]
test_encoded_tj = processed_tj[len(train_tj):]

In [16]:
# Now for salary range for train_tj

train_encoded_tj['salary_range'] = train_encoded_tj['salary_range'].astype(str)
pattern = r'^\d+-\d+$'
#print(train_dataset.info())
mask = train_encoded_tj['salary_range'].str.match(pattern)
train_encoded_tj.loc[~mask, 'salary_range'] = None
#print(train_dataset.info())
train_encoded_tj[['lower', 'upper']] = train_encoded_tj['salary_range'].str.split('-', expand=True)
#print(train_dataset.info())
train_encoded_tj[['lower', 'upper']] = train_encoded_tj[['lower', 'upper']].apply(pd.to_numeric, errors='coerce')

train_encoded_tj['lower'] = train_encoded_tj['lower'].fillna(train_encoded_tj['lower'].mean())
train_encoded_tj['upper'] = train_encoded_tj['upper'].fillna(train_encoded_tj['upper'].mean())

train_encoded_tj["avg_salary"] = train_encoded_tj[["lower", "upper"]].mean(axis = 1)
#print(train_dataset.info())
train_encoded_tj["salary_given"] = train_encoded_tj["salary_range"].apply(lambda x: False if x == None else True)
train_encoded_tj["salary_given"] = train_encoded_tj["salary_given"].astype(int)
train_encoded_tj["salary_range_calculated"] = train_encoded_tj["upper"] - train_encoded_tj["lower"]

train_encoded_tj["avg_salary"].fillna(-1, inplace = True)
train_encoded_tj["avg_salary"].replace(-1, float('nan'), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj['salary_range'] = train_encoded_tj['salary_range'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj[['lower', 'upper']] = train_encoded_tj['salary_range'].str.split('-', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj[['lower'

In [17]:
# Now for salary range for test_tj

test_encoded_tj['salary_range'] = test_encoded_tj['salary_range'].astype(str)
pattern = r'^\d+-\d+$'
#print(train_dataset.info())
mask = test_encoded_tj['salary_range'].str.match(pattern)
test_encoded_tj.loc[~mask, 'salary_range'] = None
#print(train_dataset.info())
test_encoded_tj[['lower', 'upper']] = test_encoded_tj['salary_range'].str.split('-', expand=True)
#print(train_dataset.info())
test_encoded_tj[['lower', 'upper']] = test_encoded_tj[['lower', 'upper']].apply(pd.to_numeric, errors='coerce')

test_encoded_tj['lower'] = test_encoded_tj['lower'].fillna(test_encoded_tj['lower'].mean())
test_encoded_tj['upper'] = test_encoded_tj['upper'].fillna(test_encoded_tj['upper'].mean())

test_encoded_tj["avg_salary"] = test_encoded_tj[["lower", "upper"]].mean(axis = 1)
#print(train_dataset.info())
test_encoded_tj["salary_given"] = test_encoded_tj["salary_range"].apply(lambda x: False if x == None else True)
test_encoded_tj["salary_given"] = test_encoded_tj["salary_given"].astype(int)
test_encoded_tj["salary_range_calculated"] = test_encoded_tj["upper"] - test_encoded_tj["lower"]

test_encoded_tj["avg_salary"].fillna(-1, inplace = True)
test_encoded_tj["avg_salary"].replace(-1, float('nan'), inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj['salary_range'] = test_encoded_tj['salary_range'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj[['lower', 'upper']] = test_encoded_tj['salary_range'].str.split('-', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj[['lower', 'up

In [18]:
test_encoded_tj.head()

Unnamed: 0,benefits,salary_range,Industry,Function,Function_Accounting/Auditing,Function_Administrative,Function_Art/Creative,Function_Business Development,Function_Consulting,Function_Customer Service,...,Industry_Real Estate,Industry_Retail,Industry_Staffing and Recruiting,Industry_Telecommunications,Industry_nan,lower,upper,avg_salary,salary_given,salary_range_calculated
12516,pay is 15 hr add tips and you make even more s...,,Others,,0,0,0,0,0,0,...,0,0,0,0,0,637392.94386,1037737.0,837565.053216,0,400344.218713
12517,our core values drive our culture this is what...,,Marketing and Advertising,Engineering,0,0,0,0,0,0,...,0,0,0,0,0,637392.94386,1037737.0,837565.053216,0,400344.218713
12518,,65000-80000,Accounting,Accounting/Auditing,1,0,0,0,0,0,...,0,0,0,0,0,65000.0,80000.0,72500.0,1,15000.0
12519,,,,Customer Service,0,0,0,0,0,1,...,0,0,0,0,0,637392.94386,1037737.0,837565.053216,0,400344.218713
12520,at fivesky our employees are our greatest asse...,,,,0,0,0,0,0,0,...,0,0,0,0,0,637392.94386,1037737.0,837565.053216,0,400344.218713


In [19]:
# Log normalisation for salary_range columns in train_tj

log = PowerTransformer()

for column in ["lower", "upper", "avg_salary", "salary_range_calculated"]:
    log.fit(train_encoded_tj[[column]])
    train_encoded_tj['log_' + column] = log.transform(train_encoded_tj[[column]])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj['log_' + column] = log.transform(train_encoded_tj[[column]])
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_arra

In [20]:
# Log normalisation for salary_range columns in test_tj
log = PowerTransformer()
for column in ["lower", "upper", "avg_salary", "salary_range_calculated"]:
    log.fit(train_encoded_tj[[column]])
    test_encoded_tj['log_' + column] = log.transform(test_encoded_tj[[column]])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj['log_' + column] = log.transform(test_encoded_tj[[column]])
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_

In [21]:
# Now let's do benefits for train_tj

train_encoded_tj["benefits"] = train_encoded_tj["benefits"].fillna("")
def transform_text(text):
    return re.sub(r'(phone|email|url)_', r'\1 ', text)

# Apply the transformation to the 'text_column'
train_encoded_tj['benefits'] = train_encoded_tj['benefits'].apply(lambda x: transform_text(x) if pd.notnull(x) else x)

train_encoded_tj['num_char_benefits'] = train_encoded_tj['benefits'].apply(len)

# number of words
def word_count(string): 
    words = string.split()
    return len(words)

train_encoded_tj['num_words_benefits'] = train_encoded_tj['benefits'].apply(word_count)

# average word length
train_encoded_tj['avg_word_length_benefits'] = train_encoded_tj['num_char_benefits']/train_encoded_tj['num_words_benefits']
train_encoded_tj['avg_word_length_benefits'].replace([np.inf, -np.inf], 0, inplace=True)
train_encoded_tj['avg_word_length_benefits'].fillna(0, inplace = True)

train_encoded_tj["benefits_given"] = train_encoded_tj["benefits"].apply(lambda x: False if x == "" else True)
train_encoded_tj["benefits_given"] = train_encoded_tj["benefits_given"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj["benefits"] = train_encoded_tj["benefits"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj['benefits'] = train_encoded_tj['benefits'].apply(lambda x: transform_text(x) if pd.notnull(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_t

In [22]:
# Now let's do benefits for test_tj

test_encoded_tj["benefits"] = test_encoded_tj["benefits"].fillna("")
def transform_text(text):
    return re.sub(r'(phone|email|url)_', r'\1 ', text)

# Apply the transformation to the 'text_column'
test_encoded_tj['benefits'] = test_encoded_tj['benefits'].apply(lambda x: transform_text(x) if pd.notnull(x) else x)

test_encoded_tj['num_char_benefits'] = test_encoded_tj['benefits'].apply(len)

# number of words
def word_count(string): 
    words = string.split()
    return len(words)

test_encoded_tj['num_words_benefits'] = test_encoded_tj['benefits'].apply(word_count)

# average word length
test_encoded_tj['avg_word_length_benefits'] = test_encoded_tj['num_char_benefits']/test_encoded_tj['num_words_benefits']
test_encoded_tj['avg_word_length_benefits'].replace([np.inf, -np.inf], 0, inplace=True)
test_encoded_tj['avg_word_length_benefits'].fillna(0, inplace = True)

test_encoded_tj["benefits_given"] = test_encoded_tj["benefits"].apply(lambda x: False if x == "" else True)
test_encoded_tj["benefits_given"] = test_encoded_tj["benefits_given"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj["benefits"] = test_encoded_tj["benefits"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj['benefits'] = test_encoded_tj['benefits'].apply(lambda x: transform_text(x) if pd.notnull(x) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj['nu

In [23]:
train_encoded_tj.drop(columns = ["Industry", "Function", "benefits", "salary_range", "lower", "upper", "avg_salary", "salary_range_calculated"], inplace = True)
test_encoded_tj.drop(columns = ["Industry", "Function", "benefits", "salary_range", "lower", "upper", "avg_salary", "salary_range_calculated"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded_tj.drop(columns = ["Industry", "Function", "benefits", "salary_range", "lower", "upper", "avg_salary", "salary_range_calculated"], inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_encoded_tj.drop(columns = ["Industry", "Function", "benefits", "salary_range", "lower", "upper", "avg_salary", "salary_range_calculated"], inplace = True)


In [24]:
test_encoded_tj.head()


# use train_encoded_tj and test_encoded_tj

Unnamed: 0,Function_Accounting/Auditing,Function_Administrative,Function_Art/Creative,Function_Business Development,Function_Consulting,Function_Customer Service,Function_Design,Function_Education,Function_Engineering,Function_Finance,...,Industry_nan,salary_given,log_lower,log_upper,log_avg_salary,log_salary_range_calculated,num_char_benefits,num_words_benefits,avg_word_length_benefits,benefits_given
12516,0,0,0,0,0,0,0,0,0,0,...,0,0,0.811298,0.884935,0.859106,0.976827,205,44,4.659091,1
12517,0,0,0,0,0,0,0,0,1,0,...,0,0,0.811298,0.884935,0.859106,0.976827,1575,274,5.748175,1
12518,1,0,0,0,0,0,0,0,0,0,...,0,1,-1.519926,-1.667199,-1.619265,-1.856494,0,0,0.0,0
12519,0,0,0,0,0,1,0,0,0,0,...,0,0,0.811298,0.884935,0.859106,0.976827,0,0,0.0,0
12520,0,0,0,0,0,0,0,0,0,0,...,0,0,0.811298,0.884935,0.859106,0.976827,700,110,6.363636,1


Weiyang's columns

In [44]:
train_wy = train_df.copy()
test_wy = test_df.copy()

In [45]:
train_wy['company_profile'].fillna("", inplace=True)
test_wy['company_profile'].fillna("", inplace=True)

In [46]:
# Create a new column 'has_company_profile' with initial values of 0
train_wy['has_company_profile'] = 0
test_wy['has_company_profile'] = 0

# Use apply to check if 'company_profile' is NaN or an empty string
train_wy.loc[~train_wy['company_profile'].apply(lambda x: pd.isna(x) or x == ""), 'has_company_profile'] = 1
test_wy.loc[~test_wy['company_profile'].apply(lambda x: pd.isna(x) or x == ""), 'has_company_profile'] = 1

In [47]:
train_wy['profile_gunning_fog_score'] = train_wy['company_profile'].apply(lambda x: textstat.gunning_fog(x))
test_wy['profile_gunning_fog_score'] = test_wy['company_profile'].apply(lambda x: textstat.gunning_fog(x))

transformer = PowerTransformer()

train_wy["profile_gunning_fog_score"] = transformer.fit_transform(train_wy[["profile_gunning_fog_score"]])
test_wy["profile_gunning_fog_score"] = transformer.transform(test_wy[["profile_gunning_fog_score"]])

In [48]:
train_wy['company_profile_length'] = train_wy['tokenized_company_profile'].apply(lambda x: len(x))
test_wy['company_profile_length'] = test_wy['tokenized_company_profile'].apply(lambda x: len(x))

scaler = MinMaxScaler()

train_wy["company_profile_length"] = scaler.fit_transform(train_wy[["company_profile_length"]])
test_wy["company_profile_length"] = scaler.transform(test_wy[["company_profile_length"]])

In [49]:
def get_sentiment_score(text):
    analysis = TextBlob(text)
    # You can use the sentiment polarity or subjectivity as needed
    return analysis.sentiment.polarity

In [50]:
train_wy["company_profile_sentiment"] = train_wy['company_profile'].apply(get_sentiment_score)
test_wy["company_profile_sentiment"] = test_wy['company_profile'].apply(get_sentiment_score)

In [51]:
final_train_wy = train_wy[["has_company_logo", "has_company_profile", "profile_gunning_fog_score", "company_profile_length", "company_profile_sentiment"]]
final_test_wy = test_wy[["has_company_logo", "has_company_profile", "profile_gunning_fog_score", "company_profile_length", "company_profile_sentiment"]]

In [52]:
# ben_train = pd.read_csv("./glove_embeddings/ben_train_embeddings_glove.csv")
# desc_train = pd.read_csv("./glove_embeddings/desc_train_embeddings_glove.csv")
prof_train = pd.read_csv("./glove_embeddings/prof_train_embeddings_glove.csv")
# req_train = pd.read_csv("./glove_embeddings/req_train_embeddings_glove.csv")

In [53]:
# nlp_train = pd.concat([ben_train, desc_train, prof_train, req_train], axis = 1)

In [54]:
# ben_test = pd.read_csv("./glove_embeddings/ben_test_embeddings_glove.csv")
# desc_test = pd.read_csv("./glove_embeddings/desc_test_embeddings_glove.csv")
prof_test = pd.read_csv("./glove_embeddings/prof_test_embeddings_glove.csv")
# req_test = pd.read_csv("./glove_embeddings/req_test_embeddings_glove.csv")

In [55]:
# nlp_test = pd.concat([ben_test, desc_test, prof_test, req_test], axis = 1)

In [56]:
test_encoded = test_encoded.reset_index(drop=True)
test_encoded_tj = test_encoded_tj.reset_index(drop=True)

In [57]:
final_train = pd.concat([final_train_wy, prof_train, train_df_encoded, train_encoded, train_encoded_tj], axis = 1)
final_test = pd.concat([final_test_wy, prof_test, test_df_encoded, test_encoded, test_encoded_tj], axis = 1)

In [58]:
final_train = pd.concat([final_train, train_df['fraudulent']], axis = 1)
final_test = pd.concat([final_test, test_df['fraudulent']], axis = 1)

final_train = final_train.drop(columns = ["Country", "City", "State", "sentence_length"])

In [59]:
# print(train_encoded_tj.columns)
print(train_df_encoded.columns)

Index(['Experience_Associate', 'Experience_Entry level',
       'Experience_Mid-Senior level', 'Experience_Not Applicable',
       'Experience_Others', 'Experience_nan', 'Education_Bachelor's Degree',
       'Education_High School or equivalent', 'Education_Others',
       'Education_Unspecified', 'Education_nan', 'req_sentiment',
       'req_text_length_log_scaled'],
      dtype='object')


In [60]:
final_train.to_csv("../Data/ML_train_no_sample.csv", index = False)
final_test.to_csv("../Data/ML_test_no_sample.csv", index = False)

In [70]:
# 1. RUN DA UNDERSAMPLING CODE 
desired_distribution = 0.25

from imblearn.under_sampling import RandomUnderSampler
under_sample = RandomUnderSampler(random_state = 5, sampling_strategy=0.25) #20/80 fraudulent to non-fraudulent
X_train_under, y_train_under = under_sample.fit_resample(final_train.drop(['fraudulent'], axis = 1), final_train['fraudulent'])

final_train_sample = pd.concat([X_train_under, y_train_under], axis = 1)
final_train_sample.to_csv("../Data/ML_train_sample_random.csv", index = False)

# SMOTETomek (Oversample + Undersample)
from imblearn.combine import SMOTETomek
smt_tomek = SMOTETomek(n_jobs=-1, sampling_strategy=desired_distribution)
X_train_smt_tomek, y_train_smt_tomek = smt_tomek.fit_resample(final_train.drop(['fraudulent'], axis = 1), final_train['fraudulent'])

final_train_smt_tomek = pd.concat([X_train_smt_tomek, y_train_smt_tomek], axis = 1)
final_train_smt_tomek.to_csv("../Data/ML_train_sample_smt_tomek.csv", index = False)

#adasyn
from imblearn.over_sampling import ADASYN
ada = ADASYN(random_state=45, n_neighbors=5, sampling_strategy=desired_distribution)
X_train_ada, y_train_ada = ada.fit_resample(final_train.drop(['fraudulent'], axis = 1), final_train['fraudulent'])

final_train_ada = pd.concat([X_train_ada, y_train_ada], axis = 1)
final_train_ada.to_csv("../Data/ML_train_sample_ada.csv", index = False)

#borderlinesmote
from imblearn.over_sampling import BorderlineSMOTE
smt = BorderlineSMOTE(random_state=45, k_neighbors=6, sampling_strategy=desired_distribution)
X_train_smt, y_train_smt = smt.fit_resample(final_train.drop(['fraudulent'], axis = 1), final_train['fraudulent'])

final_train_smt = pd.concat([X_train_smt, y_train_smt], axis = 1)
final_train_smt.to_csv("../Data/ML_train_sample_smt.csv", index = False)

#RENN
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
renn = RepeatedEditedNearestNeighbours()
from imblearn.under_sampling import OneSidedSelection #onesidedselection
from imblearn.under_sampling import AllKNN
 # Assuming 0 is the majority class and 1 is the minority class

allknn = AllKNN()
X_train_allknn, y_train_allknn = allknn.fit_resample(final_train.drop(['fraudulent'], axis = 1), final_train['fraudulent'])

final_train_allknn = pd.concat([X_train_allknn, y_train_allknn], axis = 1)
final_train_allknn.to_csv("../Data/ML_train_sample_allknn.csv", index = False)
