In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import warnings
warnings.filterwarnings('ignore')
from scipy.sparse import hstack, csr_matrix

In [2]:
train = pd.read_csv('Final_Train_Dataset.csv')
test = pd.read_csv('Final_Test_Dataset.csv')

In [3]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,experience,job_description,job_desig,job_type,key_skills,location,salary,company_name_encoded
0,0,5-7 yrs,Exp: Minimum 5 years;Good understanding of IOC...,Senior Exploit and Vulnerability Researcher,,"team skills, communication skills, analytical ...",Delhi NCR(Vikas Puri),6to10,3687
1,1,10-17 yrs,He should have handled a team of atleast 5-6 d...,Head SCM,,"ppc, logistics, inventory management, supply c...",Sonepat,10to15,458


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19802 entries, 0 to 19801
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            19802 non-null  int64 
 1   experience            19802 non-null  object
 2   job_description       15384 non-null  object
 3   job_desig             19802 non-null  object
 4   job_type              4797 non-null   object
 5   key_skills            19801 non-null  object
 6   location              19802 non-null  object
 7   salary                19802 non-null  object
 8   company_name_encoded  19802 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 1.4+ MB


In [5]:
train = train.dropna(subset=['key_skills'])

df_train = train[['key_skills', 'job_desig', 'job_description', 'location', 'job_type', 'experience','salary']]
df_test = test[['key_skills', 'job_desig', 'job_description', 'job_type', 'experience', 'location']]

In [6]:
def clean_skills(skl):
    skills = str(skl).lower()
    skills = re.sub(r'\...','', skills)
    skills = re.sub(r',','', skills)
    skills = re.sub(r'\s+', ' ', skills)
    return skills

In [7]:
#samp=clean_skills('ppc, logistics, inventory management')
df_train['skills_cleaned'] = df_train['key_skills'].apply(clean_skills)
df_test['skills_cleaned'] = df_test['key_skills'].apply(clean_skills)

In [8]:
df_train.head()

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...
2,"HR Analytics, Employee Engagement, Training, S...",Deputy Manager - Talent Management & Leadershi...,Must be an effective communicator (written & s...,Delhi NCR,Analytics,5-9 yrs,15to25,hr analytics employee engagement training succ...
3,"SQL, Javascript, Automation, Python, Ruby, Ana...",Associate Manager Data Engineering,7 - 10 years of overall experience in data e...,Bengaluru,Analytics,7-10 yrs,10to15,sql javascript automation python ruby analytic...
4,"accounting, finance, cash flow, financial plan...",TS- GSA- Senior Analyst,Chartered Accountancy degree or MBA in Finance...,Gurgaon,,1-3 yrs,3to6,accounting finance cash flow financial plannin...


In [9]:
def clean_job_desig(desig):
    job_desig = desig.lower()
    job_desig = re.sub(r'[^a-z]', ' ', job_desig)
    job_desig = re.sub(r'\s+', ' ', job_desig)
    return job_desig

In [10]:
#samp=clean_job_desig('Senior Exploit and Vulnerability Researcher')
df_train['desig_cleaned'] = df_train['job_desig'].apply(clean_job_desig)
df_test['desig_cleaned'] = df_test['job_desig'].apply(clean_job_desig)

In [11]:
df_train.head(2)

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned,desig_cleaned
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...,senior exploit and vulnerability researcher
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...,head scm


In [12]:
train['job_description'].fillna('missing', inplace=True)
test['job_description'].fillna('missing', inplace=True)

In [13]:
def clean_job_desc(job):
    job_desc = str(job).lower()
    job_desc = re.sub(r'[^a-z]', ' ', job_desc)
    job_desc = re.sub(r'\s+', ' ', job_desc)
    return job_desc

In [14]:
df_train['job_desc_cleaned'] = df_train['job_description'].apply(clean_job_desc)
df_test['job_desc_cleaned'] = df_test['job_description'].apply(clean_job_desc)

In [15]:
df_train.head(2)

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned,desig_cleaned,job_desc_cleaned
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...,senior exploit and vulnerability researcher,exp minimum years good understanding of ioc ru...
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...,head scm,he should have handled a team of atleast direc...


In [16]:
def clean_location(loc):
    location = loc.lower()
    location = re.sub(r'[^a-z]', ' ', location)
    location = re.sub(r'\s+', ' ', location)
    return location

df_train['loc_cleaned'] = df_train['location'].apply(clean_location)
df_test['loc_cleaned'] = df_test['location'].apply(clean_location)

In [17]:
train['job_type'].fillna('missingjobtype', inplace=True)
train['job_type'].replace('Analytics', 'analytics', inplace=True)
train['job_type'].replace('Analytic', 'analytics', inplace=True)
train['job_type'].replace('ANALYTICS', 'analytics', inplace=True)
train['job_type'].replace('analytic', 'analytics', inplace=True)

test['job_type'].fillna('missingjobtype', inplace=True)
test['job_type'].replace('Analytics', 'analytics', inplace=True)
test['job_type'].replace('Analytic', 'analytics', inplace=True)
test['job_type'].replace('ANALYTICS', 'analytics', inplace=True)
test['job_type'].replace('analytic', 'analytics', inplace=True)

df_train['job_type_cleaned'] = train['job_type'] 
df_test['job_type_cleaned'] = test['job_type']

In [18]:
df_train.head(2)

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned,desig_cleaned,job_desc_cleaned,loc_cleaned,job_type_cleaned
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...,senior exploit and vulnerability researcher,exp minimum years good understanding of ioc ru...,delhi ncr vikas puri,missingjobtype
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...,head scm,he should have handled a team of atleast direc...,sonepat,missingjobtype


In [19]:
def min_exp(exp):
    val = re.sub(r'\-',' ', exp)
    val = val.split(' ')
    val = int(val[0])
    return val

def max_exp(exp):
    val = re.sub(r'\-',' ', exp)
    val = val.split(' ')
    val = int(val[1])
    return val

In [20]:
min_exp('10-17 yrs')

10

In [21]:
df_train['min_exp'] = df_train['experience'].apply(min_exp)
df_train['max_exp'] = df_train['experience'].apply(max_exp)

df_test['min_exp'] = df_test['experience'].apply(min_exp)
df_test['max_exp'] = df_test['experience'].apply(max_exp)

In [22]:
df_train.head(2)

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned,desig_cleaned,job_desc_cleaned,loc_cleaned,job_type_cleaned,min_exp,max_exp
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...,senior exploit and vulnerability researcher,exp minimum years good understanding of ioc ru...,delhi ncr vikas puri,missingjobtype,5,7
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...,head scm,he should have handled a team of atleast direc...,sonepat,missingjobtype,10,17


In [23]:
df_train['merged'] = (df_train['desig_cleaned'] + ' ' + df_train['job_desc_cleaned'] + ' ' + df_train['skills_cleaned']
                      + ' ' + df_train['job_type_cleaned'])

df_test['merged'] = (df_test['desig_cleaned'] + ' ' + df_test['job_desc_cleaned'] + ' ' + df_test['skills_cleaned']
                     + ' ' + df_test['job_type_cleaned'])

In [24]:
df_train.head(2)

Unnamed: 0,key_skills,job_desig,job_description,location,job_type,experience,salary,skills_cleaned,desig_cleaned,job_desc_cleaned,loc_cleaned,job_type_cleaned,min_exp,max_exp,merged
0,"team skills, communication skills, analytical ...",Senior Exploit and Vulnerability Researcher,Exp: Minimum 5 years;Good understanding of IOC...,Delhi NCR(Vikas Puri),,5-7 yrs,6to10,team skills communication skills analytical sk...,senior exploit and vulnerability researcher,exp minimum years good understanding of ioc ru...,delhi ncr vikas puri,missingjobtype,5,7,senior exploit and vulnerability researcher ex...
1,"ppc, logistics, inventory management, supply c...",Head SCM,He should have handled a team of atleast 5-6 d...,Sonepat,,10-17 yrs,10to15,ppc logistics inventory management supply chai...,head scm,he should have handled a team of atleast direc...,sonepat,missingjobtype,10,17,head scm he should have handled a team of atle...


In [25]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_train['salary'] = le.fit_transform(df_train['salary'])

In [26]:
le.classes_

array(['0to3', '10to15', '15to25', '25to50', '3to6', '6to10'],
      dtype=object)

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(
    df_train[['merged', 'loc_cleaned', 'min_exp', 'max_exp']], 
    df_train['salary'], test_size=0.20, 
    stratify=df_train['salary'], random_state=75)

In [28]:
print('No. of sample texts X_train: ', len(X_train))
print('No. of sample texts X_cv   : ', len(X_cv))

No. of sample texts X_train:  15840
No. of sample texts X_cv   :  3961


In [29]:
X_train_merged = X_train['merged']
X_train_loc = X_train['loc_cleaned']

X_cv_merged = X_cv['merged']
X_cv_loc = X_cv['loc_cleaned']

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [31]:
tf1 = TfidfVectorizer(min_df=3, token_pattern=r'\w{3,}', ngram_range=(1,3), max_df=0.9)
tf2 = TfidfVectorizer(min_df=2, token_pattern=r'\w{3,}')

X_train_merged = tf1.fit_transform(X_train_merged)
X_train_loc = tf2.fit_transform(X_train_loc)

X_cv_merged = tf1.transform(X_cv_merged)
X_cv_loc = tf2.transform(X_cv_loc)

In [32]:
from scipy import sparse
from sklearn.preprocessing import StandardScaler

In [33]:
sc1 = StandardScaler()

In [34]:
X_train_MinExp = sc1.fit_transform(np.array(X_train['min_exp']).reshape(-1,1))

In [35]:
X_cv_MinExp = sc1.transform(np.array(X_cv['min_exp']).reshape(-1,1))
X_train_MinExp = sparse.csr_matrix(X_train_MinExp)
X_cv_MinExp = sparse.csr_matrix(X_cv_MinExp)

In [36]:
sc2 = StandardScaler()
X_train_MaxExp = sc2.fit_transform(np.array(X_train['max_exp']).reshape(-1,1))
X_cv_MaxExp = sc2.transform(np.array(X_cv['max_exp']).reshape(-1,1))
X_train_MaxExp = sparse.csr_matrix(X_train_MaxExp)
X_cv_MaxExp = sparse.csr_matrix(X_cv_MaxExp)

In [37]:
merged_train = hstack((X_train_merged, X_train_loc, X_train_MinExp, X_train_MaxExp))
merged_cv  = hstack((X_cv_merged, X_cv_loc, X_cv_MinExp, X_cv_MaxExp))

In [38]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [39]:
import lightgbm as lgb
train_data = lgb.Dataset(merged_train, label=y_train)
test_data = lgb.Dataset(merged_cv, label=y_cv)

In [40]:
param = {'objective': 'multiclass',
         'num_iterations': 80,
         'learning_rate': 0.04,  
         'num_leaves': 23,
         'max_depth': 7, 
         'min_data_in_leaf': 28, 
         'max_bin': 10, 
         'min_data_in_bin': 3,   
         'num_class': 6,
         'metric': 'multi_logloss'
         }

In [41]:
lgbm = lgb.train(params=param,
                 train_set=train_data,
                 num_boost_round=120,
                 valid_sets=[test_data])

y_pred_class = lgbm.predict(merged_cv)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 29632
[LightGBM] [Info] Number of data points in the train set: 15840, number of used features: 3005
[LightGBM] [Info] Start training from score -1.808567
[LightGBM] [Info] Start training from score -1.481605
[LightGBM] [Info] Start training from score -1.568616
[LightGBM] [Info] Start training from score -2.531427
[LightGBM] [Info] Start training from score -1.947616
[LightGBM] [Info] Start training from score -1.724030
[1]	valid_0's multi_logloss: 1.70572
[2]	valid_0's multi_logloss: 1.67071
[3]	valid_0's multi_logloss: 1.63985
[4]	valid_0's multi_logloss: 1.61233
[5]	valid_0's multi_logloss: 1.58766
[6]	valid_0's multi_logloss: 1.56483
[7]	valid_0's multi_logloss: 1.54427
[8]	valid_0's multi_logloss: 1.52547
[9]	valid_0's multi_logloss: 1.50792
[10]	valid_0's multi_logloss: 1.49141
[11]	valid_0's multi_logloss: 1.47653
[12]	valid_0's 

In [42]:
predictions = []
for x in y_pred_class:
    predictions.append(np.argmax(x))

print('accuracy:', accuracy_score(y_cv, predictions))

accuracy: 0.48119161827821255


In [43]:
#test data

X_train_merged = df_train['merged']
X_train_loc = df_train['loc_cleaned']

X_test_merged = df_test['merged']
X_test_loc = df_test['loc_cleaned']

y_train = df_train['salary']

In [44]:
tf1 = TfidfVectorizer(min_df=3, token_pattern=r'\w{3,}', ngram_range=(1,3))
tf2 = TfidfVectorizer(min_df=2, token_pattern=r'\w{3,}')

X_train_merged = tf1.fit_transform(X_train_merged)
X_train_loc = tf2.fit_transform(X_train_loc)

X_test_merged = tf1.transform(X_test_merged)
X_test_loc = tf2.transform(X_test_loc)

In [45]:
from scipy import sparse
from sklearn.preprocessing import StandardScaler

sc1 = StandardScaler()
X_train_MinExp = sc1.fit_transform(np.array(df_train['min_exp']).reshape(-1,1))
X_test_MinExp = sc1.transform(np.array(df_test['min_exp']).reshape(-1,1))
X_train_MinExp = sparse.csr_matrix(X_train_MinExp)
X_test_MinExp = sparse.csr_matrix(X_test_MinExp)

sc2 = StandardScaler()
X_train_MaxExp = sc2.fit_transform(np.array(df_train['max_exp']).reshape(-1,1))
X_test_MaxExp = sc2.transform(np.array(df_test['max_exp']).reshape(-1,1))
X_train_MaxExp = sparse.csr_matrix(X_train_MaxExp)
X_test_MaxExp = sparse.csr_matrix(X_test_MaxExp)

In [46]:
merged_train = hstack((X_train_merged, X_train_loc, X_train_MinExp, X_train_MaxExp))
merged_test  = hstack((X_test_merged, X_test_loc, X_test_MinExp, X_test_MaxExp))

In [47]:
import lightgbm as lgb
train_data = lgb.Dataset(merged_train, label=y_train)

param = {'objective': 'multiclass',
         'num_iterations': 80,
         'learning_rate': 0.04, 
         'num_leaves': 23,
         'max_depth': 7, 
         'min_data_in_leaf': 28, 
         'max_bin': 10, 
         'min_data_in_bin': 3,   
         'num_class': 6,
         'metric': 'multi_logloss'
         }

lgbm = lgb.train(params=param, 
                 train_set=train_data)

predictions = lgbm.predict(merged_test)

y_pred_class = []
for x in predictions:
    y_pred_class.append(np.argmax(x))

y_pred_class = le.inverse_transform(y_pred_class)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37834
[LightGBM] [Info] Number of data points in the train set: 19801, number of used features: 3848
[LightGBM] [Info] Start training from score -1.808617
[LightGBM] [Info] Start training from score -1.481655
[LightGBM] [Info] Start training from score -1.568666
[LightGBM] [Info] Start training from score -2.531477
[LightGBM] [Info] Start training from score -1.947578
[LightGBM] [Info] Start training from score -1.723868


In [48]:
df_sub = pd.DataFrame(data=y_pred_class, columns=['salary'])

In [49]:
df_sub

Unnamed: 0,salary
0,10to15
1,0to3
2,6to10
3,0to3
4,0to3
...,...
6596,10to15
6597,6to10
6598,15to25
6599,15to25


In [50]:
writer = pd.ExcelWriter('output.xlsx', engine='xlsxwriter')
df_sub.to_excel(writer,sheet_name='Sheet1', index=False)
writer.save()