In [39]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv('Company_Dataset/dataset_ml_labeled.csv')
data.head(3)

Unnamed: 0,Company,job_name,job_link,ML Labeled Function,city,country,Function,employment_type,remote,seniority level,Job Status,Date Reviewed,data analyst,company_Link,job_location,job_details,job_id,posting_error,description
0,Spectrum,"Outside Sales Representative | $5,000 Sign On ...",https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Opelika,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"$5,000 Sign On Bonus* + $2,500 training pay + ...",d74c82fb-27f8-435c-b54b-14ebccc7e9cd,,https://www.smartrecruiters.com/Humanity/74399...,,
1,Spectrum,Advertising Account Executive- New Business,https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Bay City,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,Tenacious go-getter. Inquisitive problem solve...,11a21cd7-86c2-48a4-96b9-83fc59e428c1,,https://www.smartrecruiters.com/Humanity/74399...,,
2,Spectrum,"Editor, Media Ingest - Spectrum News Raleigh",https://sjobs.brassring.com/TGnewUI/Search/hom...,Writing/Editing,Raleigh,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"Spectrum Networks is looking for enthusiastic,...",8e3143f9-e449-49fa-b2b7-a8fd30234c82,,https://www.smartrecruiters.com/Humanity/74399...,,


In [40]:
data['ML Labeled Function'].unique()

array(['Sales', 'Writing/Editing', 'Engineering',
       'Information Technology', 'Analyst', 'Human Resources',
       'Customer Service', 'Project Management', 'Accounting/Auditing',
       'Management', 'Production', 'Training', 'Public Relations',
       'Product Management', 'Design', 'Education', 'Finance',
       'General Business', 'Business Development', 'Marketing',
       'Administrative', 'Art/Creative', 'Legal', 'Quality Assurance',
       'Advertising', 'Purchasing', 'Science', 'Health Care Provider',
       'Manufacturing', 'Strategy/Planning', 'Supply Chain', 'Research',
       'Distribution'], dtype=object)

In [41]:
data_class = data[['company_Link','ML Labeled Function']].copy()
data_class.head(3)

Unnamed: 0,company_Link,ML Labeled Function
0,"$5,000 Sign On Bonus* + $2,500 training pay + ...",Sales
1,Tenacious go-getter. Inquisitive problem solve...,Sales
2,"Spectrum Networks is looking for enthusiastic,...",Writing/Editing


In [42]:
data_class.isnull().sum()

company_Link           166
ML Labeled Function      0
dtype: int64

In [43]:
data_class = data_class.dropna()
data_class.isnull().sum()

company_Link           0
ML Labeled Function    0
dtype: int64

## Preprocessing the text

In [44]:
import re
from nltk.corpus import stopwords

special_character_remover = re.compile('[/(){}\[\]\|@,;]')
extra_symbol_remover = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = special_character_remover.sub(' ', text)
    text = extra_symbol_remover.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
    
data_class['company_Link'] = data_class['company_Link'].apply(clean_text)
data_class.head(3)

Unnamed: 0,company_Link,ML Labeled Function
0,5 000 sign bonus + 2 500 training pay + unlimi...,Sales
1,tenacious gogetter inquisitive problem solver ...,Sales
2,spectrum networks looking enthusiastic talente...,Writing/Editing


## Splitting the Dataset for Training and Testing

In [45]:
from sklearn.model_selection import train_test_split
X = data_class['company_Link']
y = data_class['ML Labeled Function']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((2217,), (951,), (2217,), (951,))

In [46]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

## Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train,y_train)
y_predlr = lr.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_predlr,y_test)}")

Accuracy is : 0.7917981072555205


## Stochastic Gradient Descent 

In [10]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

SGD = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', SGDClassifier()),
              ])

SGD.fit(X_train,y_train)
y_predsgd =SGD.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_predsgd,y_test)}")

Accuracy is : 0.8496319663512093


## Support Vector

In [11]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

SVC = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', SVC()),
              ])

SVC.fit(X_train,y_train)
y_predsvc = SVC.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_predsvc,y_test)}")

Accuracy is : 0.8180862250262881


## K Nearest Neighbour

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

KN = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', KNeighborsClassifier()),
              ])

KN.fit(X_train,y_train)
y_predkn = KN.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_predkn,y_test)}")

Accuracy is : 0.7707676130389064


## Decision Tree

In [13]:
 from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

DecT = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', DecisionTreeClassifier()),
              ])

DecT.fit(X_train,y_train)
y_preddc = DecT.predict(X_test)

print(f"Accuracy is : {accuracy_score(y_preddc,y_test)}")

Accuracy is : 0.7791798107255521


## Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

RandomForest = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', RandomForestClassifier()),
              ])

RandomForest.fit(X_train,y_train)
y_predRF = RandomForest.predict(X_test)

print(f"Accuracy of Random_Forest : {accuracy_score(y_predRF,y_test)}")

Accuracy of Random_Forest : 0.8138801261829653


## Gradient Boosting

In [16]:
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

GradientBoosting = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', GradientBoostingClassifier()),
              ])

GradientBoosting.fit(X_train,y_train)
y_predGB = GradientBoosting.predict(X_test)

print(f"Accuracy of Gradient_Boosting : {accuracy_score(y_predGB,y_test)}")

Accuracy of Gradient_Boosting : 0.8296529968454258


## Naive Bayes Classifier

In [22]:
from sklearn.naive_bayes import MultinomialNB


naivebayes = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

print(f'accuracy {accuracy_score(y_pred,y_test)}')

accuracy 0.6340694006309149


## Xgboost Classifier

In [24]:
from xgboost import XGBClassifier

xgboost = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', XGBClassifier()),
              ])
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)

print(f'accuracy {accuracy_score(y_pred,y_test)}')



accuracy 0.8496319663512093


## LightGBM Classifier

In [31]:
from lightgbm import LGBMClassifier

lgb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LGBMClassifier()),
              ])
lgb.fit(X_train, y_train)

y_predlgb = lgb.predict(X_test)

print(f'accuracy {accuracy_score(y_predlgb,y_test)}')

accuracy 0.8548895899053628


## CatBoost Classifier

In [38]:
from catboost import CatBoostClassifier

cb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', CatBoostClassifier()),
              ])
cb.fit(X_train, y_train)

y_predcb = cb.predict(X_test)

print(f'accuracy {accuracy_score(y_predcb,y_test)}')

Learning rate set to 0.082406
0:	learn: 2.8164407	total: 6.57s	remaining: 1h 49m 26s
1:	learn: 2.5797973	total: 13.9s	remaining: 1h 55m 53s
2:	learn: 2.3793181	total: 20.2s	remaining: 1h 52m 8s
3:	learn: 2.2328781	total: 27s	remaining: 1h 52m 3s
4:	learn: 2.1176470	total: 34.4s	remaining: 1h 54m 7s
5:	learn: 2.0209568	total: 41.6s	remaining: 1h 54m 46s
6:	learn: 1.9263210	total: 48.4s	remaining: 1h 54m 26s
7:	learn: 1.8477285	total: 56.1s	remaining: 1h 55m 59s
8:	learn: 1.7816191	total: 1m 2s	remaining: 1h 55m 28s
9:	learn: 1.7209184	total: 1m 10s	remaining: 1h 55m 57s
10:	learn: 1.6602339	total: 1m 17s	remaining: 1h 56m 15s
11:	learn: 1.6095349	total: 1m 24s	remaining: 1h 56m 6s
12:	learn: 1.5680347	total: 1m 32s	remaining: 1h 56m 56s
13:	learn: 1.5260029	total: 1m 39s	remaining: 1h 56m 39s
14:	learn: 1.4870268	total: 1m 46s	remaining: 1h 56m 43s
15:	learn: 1.4517120	total: 1m 54s	remaining: 1h 56m 59s
16:	learn: 1.4175150	total: 2m 1s	remaining: 1h 56m 48s
17:	learn: 1.3874709	total:

143:	learn: 0.6501070	total: 18m 34s	remaining: 1h 50m 27s
144:	learn: 0.6489780	total: 18m 43s	remaining: 1h 50m 24s
145:	learn: 0.6478821	total: 18m 52s	remaining: 1h 50m 23s
146:	learn: 0.6470678	total: 19m	remaining: 1h 50m 20s
147:	learn: 0.6443738	total: 19m 9s	remaining: 1h 50m 17s
148:	learn: 0.6431466	total: 19m 18s	remaining: 1h 50m 14s
149:	learn: 0.6415409	total: 19m 26s	remaining: 1h 50m 11s
150:	learn: 0.6384187	total: 19m 35s	remaining: 1h 50m 8s
151:	learn: 0.6374898	total: 19m 43s	remaining: 1h 50m
152:	learn: 0.6358716	total: 19m 51s	remaining: 1h 49m 57s
153:	learn: 0.6349972	total: 19m 59s	remaining: 1h 49m 49s
154:	learn: 0.6344116	total: 20m 7s	remaining: 1h 49m 41s
155:	learn: 0.6337497	total: 20m 15s	remaining: 1h 49m 33s
156:	learn: 0.6332361	total: 20m 23s	remaining: 1h 49m 28s
157:	learn: 0.6311411	total: 20m 31s	remaining: 1h 49m 20s
158:	learn: 0.6305975	total: 20m 38s	remaining: 1h 49m 12s
159:	learn: 0.6289812	total: 20m 46s	remaining: 1h 49m 4s
160:	lear

283:	learn: 0.5127408	total: 37m 9s	remaining: 1h 33m 39s
284:	learn: 0.5123528	total: 37m 16s	remaining: 1h 33m 31s
285:	learn: 0.5115046	total: 37m 24s	remaining: 1h 33m 23s
286:	learn: 0.5113161	total: 37m 32s	remaining: 1h 33m 15s
287:	learn: 0.5106745	total: 37m 40s	remaining: 1h 33m 7s
288:	learn: 0.5102563	total: 37m 47s	remaining: 1h 32m 59s
289:	learn: 0.5098857	total: 37m 55s	remaining: 1h 32m 51s
290:	learn: 0.5096494	total: 38m 3s	remaining: 1h 32m 44s
291:	learn: 0.5093773	total: 38m 11s	remaining: 1h 32m 36s
292:	learn: 0.5081529	total: 38m 19s	remaining: 1h 32m 28s
293:	learn: 0.5074313	total: 38m 27s	remaining: 1h 32m 21s
294:	learn: 0.5072281	total: 38m 35s	remaining: 1h 32m 13s
295:	learn: 0.5069218	total: 38m 43s	remaining: 1h 32m 5s
296:	learn: 0.5066682	total: 38m 50s	remaining: 1h 31m 57s
297:	learn: 0.5056991	total: 38m 58s	remaining: 1h 31m 49s
298:	learn: 0.5055356	total: 39m 7s	remaining: 1h 31m 42s
299:	learn: 0.5053113	total: 39m 14s	remaining: 1h 31m 34s
30

423:	learn: 0.4420086	total: 55m 32s	remaining: 1h 15m 26s
424:	learn: 0.4418004	total: 55m 40s	remaining: 1h 15m 19s
425:	learn: 0.4416350	total: 55m 48s	remaining: 1h 15m 11s
426:	learn: 0.4414241	total: 55m 55s	remaining: 1h 15m 3s
427:	learn: 0.4402201	total: 56m 3s	remaining: 1h 14m 55s
428:	learn: 0.4399288	total: 56m 11s	remaining: 1h 14m 47s
429:	learn: 0.4396531	total: 56m 18s	remaining: 1h 14m 39s
430:	learn: 0.4393258	total: 56m 26s	remaining: 1h 14m 31s
431:	learn: 0.4391034	total: 56m 34s	remaining: 1h 14m 23s
432:	learn: 0.4381539	total: 56m 42s	remaining: 1h 14m 15s
433:	learn: 0.4374359	total: 56m 50s	remaining: 1h 14m 7s
434:	learn: 0.4372904	total: 56m 57s	remaining: 1h 13m 59s
435:	learn: 0.4371781	total: 57m 5s	remaining: 1h 13m 51s
436:	learn: 0.4367945	total: 57m 13s	remaining: 1h 13m 43s
437:	learn: 0.4356395	total: 57m 21s	remaining: 1h 13m 36s
438:	learn: 0.4354282	total: 57m 29s	remaining: 1h 13m 28s
439:	learn: 0.4351982	total: 57m 37s	remaining: 1h 13m 20s
4

563:	learn: 0.3978552	total: 1h 13m 56s	remaining: 57m 9s
564:	learn: 0.3976716	total: 1h 14m 4s	remaining: 57m 2s
565:	learn: 0.3974805	total: 1h 14m 12s	remaining: 56m 54s
566:	learn: 0.3973150	total: 1h 14m 20s	remaining: 56m 46s
567:	learn: 0.3970703	total: 1h 14m 28s	remaining: 56m 38s
568:	learn: 0.3969107	total: 1h 14m 35s	remaining: 56m 30s
569:	learn: 0.3962872	total: 1h 14m 43s	remaining: 56m 22s
570:	learn: 0.3961792	total: 1h 14m 51s	remaining: 56m 14s
571:	learn: 0.3959031	total: 1h 15m	remaining: 56m 7s
572:	learn: 0.3957161	total: 1h 15m 7s	remaining: 55m 59s
573:	learn: 0.3955141	total: 1h 15m 15s	remaining: 55m 51s
574:	learn: 0.3954178	total: 1h 15m 23s	remaining: 55m 43s
575:	learn: 0.3952892	total: 1h 15m 31s	remaining: 55m 35s
576:	learn: 0.3951403	total: 1h 15m 39s	remaining: 55m 28s
577:	learn: 0.3950571	total: 1h 15m 47s	remaining: 55m 20s
578:	learn: 0.3948475	total: 1h 15m 55s	remaining: 55m 12s
579:	learn: 0.3946078	total: 1h 16m 2s	remaining: 55m 4s
580:	lea

703:	learn: 0.3625923	total: 1h 32m 17s	remaining: 38m 48s
704:	learn: 0.3624959	total: 1h 32m 25s	remaining: 38m 40s
705:	learn: 0.3622949	total: 1h 32m 32s	remaining: 38m 32s
706:	learn: 0.3622107	total: 1h 32m 40s	remaining: 38m 24s
707:	learn: 0.3620843	total: 1h 32m 48s	remaining: 38m 16s
708:	learn: 0.3618988	total: 1h 32m 55s	remaining: 38m 8s
709:	learn: 0.3614814	total: 1h 33m 3s	remaining: 38m
710:	learn: 0.3613691	total: 1h 33m 11s	remaining: 37m 52s
711:	learn: 0.3612395	total: 1h 33m 19s	remaining: 37m 44s
712:	learn: 0.3611156	total: 1h 33m 27s	remaining: 37m 37s
713:	learn: 0.3610097	total: 1h 33m 35s	remaining: 37m 29s
714:	learn: 0.3608984	total: 1h 33m 43s	remaining: 37m 21s
715:	learn: 0.3607163	total: 1h 33m 51s	remaining: 37m 13s
716:	learn: 0.3606267	total: 1h 33m 58s	remaining: 37m 5s
717:	learn: 0.3603655	total: 1h 34m 6s	remaining: 36m 57s
718:	learn: 0.3591883	total: 1h 34m 14s	remaining: 36m 49s
719:	learn: 0.3590699	total: 1h 34m 22s	remaining: 36m 42s
720:	

843:	learn: 0.3329805	total: 1h 50m 36s	remaining: 20m 26s
844:	learn: 0.3328876	total: 1h 50m 44s	remaining: 20m 18s
845:	learn: 0.3327411	total: 1h 50m 52s	remaining: 20m 10s
846:	learn: 0.3326661	total: 1h 50m 59s	remaining: 20m 2s
847:	learn: 0.3325853	total: 1h 51m 7s	remaining: 19m 55s
848:	learn: 0.3325050	total: 1h 51m 15s	remaining: 19m 47s
849:	learn: 0.3324034	total: 1h 51m 23s	remaining: 19m 39s
850:	learn: 0.3322262	total: 1h 51m 31s	remaining: 19m 31s
851:	learn: 0.3321332	total: 1h 51m 38s	remaining: 19m 23s
852:	learn: 0.3320574	total: 1h 51m 46s	remaining: 19m 15s
853:	learn: 0.3316574	total: 1h 51m 54s	remaining: 19m 7s
854:	learn: 0.3314883	total: 1h 52m 2s	remaining: 19m
855:	learn: 0.3313115	total: 1h 52m 10s	remaining: 18m 52s
856:	learn: 0.3310542	total: 1h 52m 18s	remaining: 18m 44s
857:	learn: 0.3309700	total: 1h 52m 25s	remaining: 18m 36s
858:	learn: 0.3306936	total: 1h 52m 33s	remaining: 18m 28s
859:	learn: 0.3305797	total: 1h 52m 41s	remaining: 18m 20s
860:	

986:	learn: 0.3090801	total: 2h 9m 19s	remaining: 1m 42s
987:	learn: 0.3089986	total: 2h 9m 26s	remaining: 1m 34s
988:	learn: 0.3089002	total: 2h 9m 34s	remaining: 1m 26s
989:	learn: 0.3088154	total: 2h 9m 42s	remaining: 1m 18s
990:	learn: 0.3086492	total: 2h 9m 50s	remaining: 1m 10s
991:	learn: 0.3085428	total: 2h 9m 58s	remaining: 1m 2s
992:	learn: 0.3084183	total: 2h 10m 6s	remaining: 55s
993:	learn: 0.3083172	total: 2h 10m 15s	remaining: 47.2s
994:	learn: 0.3082109	total: 2h 10m 23s	remaining: 39.3s
995:	learn: 0.3077587	total: 2h 10m 31s	remaining: 31.4s
996:	learn: 0.3071269	total: 2h 10m 39s	remaining: 23.6s
997:	learn: 0.3069903	total: 2h 10m 46s	remaining: 15.7s
998:	learn: 0.3068586	total: 2h 10m 55s	remaining: 7.86s
999:	learn: 0.3067263	total: 2h 11m 2s	remaining: 0us
accuracy 0.8275499474237644


## Accuracies by models

| Model | Accuracy |
| --- | --- |
| Logistic Regression | 79.17% |
| Stochastic Gradient Descent | 84.96% |
| Support Vector | 81.80% |
| K Nearest Neighbour | 77.07% |
| Decision Tree | 77.91% |
| Random Forest | 81.38% |
| Gradient Boosting | 82.96% |
| Naive Bayes | 63.40% |
| Xgboost Classifier | 84.96% |
| LightGBM Classifier | 85.48% |
| CatBoost Classifier | 82.75% |