In [1]:
import pandas as pd
import numpy as np

# Load the labeled dataset; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Company_Dataset/dataset_ml_labeled.csv')
# Preview the first three rows to sanity-check the columns.
data.head(n=3)

Unnamed: 0,Company,job_name,job_link,ML Labeled Function,city,country,Function,employment_type,remote,seniority level,Job Status,Date Reviewed,data analyst,company_Link,job_location,job_details,job_id,posting_error,description
0,Spectrum,"Outside Sales Representative | $5,000 Sign On ...",https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Opelika,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"$5,000 Sign On Bonus* + $2,500 training pay + ...",d74c82fb-27f8-435c-b54b-14ebccc7e9cd,,https://www.smartrecruiters.com/Humanity/74399...,,
1,Spectrum,Advertising Account Executive- New Business,https://sjobs.brassring.com/TGnewUI/Search/hom...,Sales,Bay City,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,Tenacious go-getter. Inquisitive problem solve...,11a21cd7-86c2-48a4-96b9-83fc59e428c1,,https://www.smartrecruiters.com/Humanity/74399...,,
2,Spectrum,"Editor, Media Ingest - Spectrum News Raleigh",https://sjobs.brassring.com/TGnewUI/Search/hom...,Writing/Editing,Raleigh,United States,Full-time,No,Mid-Senior Level,to be reviewed,,,,"Spectrum Networks is looking for enthusiastic,...",8e3143f9-e449-49fa-b2b7-a8fd30234c82,,https://www.smartrecruiters.com/Humanity/74399...,,


In [2]:
# Distinct class labels present in the target column.
data.loc[:, 'ML Labeled Function'].unique()

array(['Sales', 'Writing/Editing', 'Engineering',
       'Information Technology', 'Analyst', 'Human Resources',
       'Customer Service', 'Project Management', 'Accounting/Auditing',
       'Management', 'Production', 'Training', 'Public Relations',
       'Product Management', 'Design', 'Education', 'Finance',
       'General Business', 'Business Development', 'Marketing',
       'Administrative', 'Art/Creative', 'Legal', 'Quality Assurance',
       'Advertising', 'Purchasing', 'Science', 'Health Care Provider',
       'Manufacturing', 'Strategy/Planning', 'Supply Chain', 'Research',
       'Distribution'], dtype=object)

In [3]:
# Keep only the text feature and the target label; .copy() detaches the result from `data`.
# NOTE(review): judging by the preview, 'company_Link' holds free-text descriptions rather
# than a URL, so the CSV header may be misaligned with the values -- confirm against the file.
data_class = data.loc[:, ['company_Link', 'ML Labeled Function']].copy()
data_class.head(n=3)

Unnamed: 0,company_Link,ML Labeled Function
0,"$5,000 Sign On Bonus* + $2,500 training pay + ...",Sales
1,Tenacious go-getter. Inquisitive problem solve...,Sales
2,"Spectrum Networks is looking for enthusiastic,...",Writing/Editing


In [4]:
# Count missing values per column.
data_class.isnull().sum(axis=0)

company_Link           166
ML Labeled Function      0
dtype: int64

In [6]:
# Discard every row with a missing value in either column, then re-check the null counts.
data_class = data_class.dropna(axis=0, how='any')
data_class.isnull().sum(axis=0)

company_Link           0
ML Labeled Function    0
dtype: int64

## Preprocessing the text

In [7]:
import re
from nltk.corpus import stopwords

# Punctuation treated as a token separator: clean_text replaces each match with a space.
# Raw string so that \[ \] \| reach the regex engine verbatim instead of being parsed as
# (invalid) string escape sequences, which newer Python versions warn about.
special_character_remover = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than digits, lowercase ASCII letters, space, '#', '+' or '_':
# clean_text deletes each match outright.
extra_symbol_remover = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, held in a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalize one text entry for bag-of-words vectorization.

    Steps, in order: lowercase the text, replace separator punctuation with
    spaces, delete every remaining character outside ``0-9a-z #+_``, then
    drop whitespace-delimited tokens found in ``STOPWORDS``.

    Parameters
    ----------
    text : str
        Raw text; must support ``.lower()``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    separated = special_character_remover.sub(' ', lowered)
    symbols_removed = extra_symbol_remover.sub('', separated)
    # NOTE(review): apostrophes are stripped before this filter, so any stop word that
    # contains one (if the NLTK list has such entries) can no longer match -- confirm.
    kept_tokens = [token for token in symbols_removed.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)
    
# Run the cleaner over every entry of the text column, overwriting the column in place.
data_class['company_Link'] = data_class['company_Link'].apply(func=clean_text)
data_class.head(n=3)

Unnamed: 0,company_Link,ML Labeled Function
0,5 000 sign bonus + 2 500 training pay + unlimi...,Sales
1,tenacious gogetter inquisitive problem solver ...,Sales
2,spectrum networks looking enthusiastic talente...,Writing/Editing


## Splitting the Dataset for Training and Testing

In [8]:
from sklearn.model_selection import train_test_split
# Feature text and target labels.
X, y = data_class['company_Link'], data_class['ML Labeled Function']
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((2217,), (951,), (2217,), (951,))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

## Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weighting -> logistic regression, chained as one estimator.
lr = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

lr.fit(X_train, y_train)
y_pred1 = lr.predict(X_test)

# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric, but passing
# the ground truth first keeps the call correct if it is ever swapped for an asymmetric
# metric such as precision or recall.
print(f"Accuracy is : {accuracy_score(y_test, y_pred1)}")

Accuracy is : 0.7917981072555205


## Naive Bayes Classifier

In [12]:
from sklearn.naive_bayes import MultinomialNB


# Token counts -> TF-IDF weighting -> multinomial naive Bayes, chained as one estimator.
naivebayes = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

# accuracy_score is declared as (y_true, y_pred). Accuracy itself is symmetric, but passing
# the ground truth first keeps the call correct if it is ever swapped for an asymmetric
# metric such as precision or recall.
print(f'accuracy {accuracy_score(y_test, y_pred)}')

accuracy 0.6340694006309149


## XGBoost Classifier

In [15]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Recent XGBoost releases only accept integer class ids 0..n_classes-1 as targets and
# raise on string labels, so encode the training labels here and decode the predictions
# below. The encoder is fitted on y_train only; y_test is never transformed.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Token counts -> TF-IDF weighting -> gradient-boosted trees, chained as one estimator.
# The pipeline keeps its original variable name `xgboost`.
xgboost = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
])
xgboost.fit(X_train, y_train_encoded)

# Map predicted class ids back to the original string labels so y_pred is directly
# comparable with y_test.
y_pred = label_encoder.inverse_transform(xgboost.predict(X_test))

# accuracy_score is declared as (y_true, y_pred): ground truth goes first.
print(f'accuracy {accuracy_score(y_test, y_pred)}')



accuracy 0.8496319663512093


## Accuracy by model

| Model | Accuracy |
| --- | --- |
| Logistic Regression | 79.2% |
| Naive Bayes | 63.4% |
| XGBoost Classifier | 85.0% |