# Librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
from collections import Counter
import spacy
from spacy import displacy
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize 
from nltk.tokenize import word_tokenize,MWETokenizer 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# df=pd.read_csv('data/emails.csv')
# # df_class.drop('Unnamed: 0',inplace=True, axis=1)

In [3]:
df=pd.read_csv('data/job_emails1.csv')
df.head()

Unnamed: 0,index,Content,source,label1,label2
0,0,"Ginko The company\n\nAt Ginko, we are Digital ...",indeed,job_opp,data_engineer
1,1,Nexity Company Description\n\n1st platform of ...,indeed,job_opp,data_engineer
2,2,"42c Who are we ?\nIn agile mode, we provide an...",indeed,job_opp,data_engineer
3,3,"RS2i Who are we ?\nAt RS2i, we are committed t...",indeed,job_opp,data_engineer
4,4,McKinsey & Company Who You'll Work With\nBased...,indeed,job_opp,data_engineer


# Data Cleaning

In [4]:
df[df['source']=='assan'].isnull().sum()

index       0
Content     0
source      0
label1      0
label2     89
dtype: int64

In [5]:
df[df['source']=='indeed'].isnull().sum()

index      0
Content    0
source     0
label1     0
label2     0
dtype: int64

In [6]:
df[df['source']=='kaggle'].isnull().sum()

index        0
Content      0
source       0
label1     150
label2     150
dtype: int64

In [7]:
df.isnull().sum()

index        0
Content      0
source       0
label1     150
label2     239
dtype: int64

In [8]:
na_values = df[df.isnull().any(axis=1)]
na_values

Unnamed: 0,index,Content,source,label1,label2
131,131,Message-ID: <18782981.1075855378110.JavaMail.e...,kaggle,,
132,132,Message-ID: <15464986.1075855378456.JavaMail.e...,kaggle,,
133,133,Message-ID: <24216240.1075855687451.JavaMail.e...,kaggle,,
134,134,Message-ID: <13505866.1075863688222.JavaMail.e...,kaggle,,
135,135,Message-ID: <30922949.1075863688243.JavaMail.e...,kaggle,,
...,...,...,...,...,...
414,414,Subject: Eurofins USA BioPharma Services is hi...,assan,alert,
415,415,Subject: Your application for Lead Data Scient...,assan,applied,
417,417,"8/24/22, 7:21 PM\n\nGmail - Last 100 Seats Rem...",assan,other,
418,418,Subject: SENIOR DATA SCIENTIST opening at SHOP...,assan,alert,


In [9]:
print(na_values['label1'].isnull().sum(),na_values['label2'].isnull().sum())

150 239


In [10]:
df['label1'].unique()

array(['job_opp', nan, 'alert', 'other', 'missing', 'link', 'applied'],
      dtype=object)

In [11]:
def process_emails_type(label):
#     label = label.lower()
    if label =='job_opp':
        label = 'job_opp'
    elif label == 'alert':
        label = 'alert'
    else:
        label = 'other'
    return (label)

In [12]:
df['label'] = df['label1'].apply(process_emails_type)

In [13]:
df['label'].value_counts()

other      193
job_opp    181
alert       47
Name: label, dtype: int64

In [14]:
df['source'].value_counts()

kaggle    150
assan     140
indeed    131
Name: source, dtype: int64

# Remove punctuation and stop words
Stopwords include: I, he, she, and, but, was were, being, have, etc, which do not add meaning to the data. So these words must be removed which helps to reduce the features from our data. These are removed after tokenizing the text.

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
# text_tok = word_tokenize(df.Body_doc[0])

In [18]:
def remove_punct(text):
    """
    Remove punctuation from text
    input: text
    output: text
    """
    text_tok = word_tokenize(text)
    l=[]
    for word in text_tok:
        if not word in (string.punctuation):
            l.append(word)
    return l

In [20]:
df.Body_doc = df.Content.apply(remove_punct)

  df.Body_doc = df.Content.apply(remove_punct)


In [21]:
df.Body_doc

0      [Ginko, The, company, At, Ginko, we, are, Digi...
1      [Nexity, Company, Description, 1st, platform, ...
2      [42c, Who, are, we, In, agile, mode, we, provi...
3      [RS2i, Who, are, we, At, RS2i, we, are, commit...
4      [McKinsey, Company, Who, You, 'll, Work, With,...
                             ...                        
416    [Subject, Full, stack, Java, Developer, in, At...
417    [8/24/22, 7:21, PM, Gmail, Last, 100, Seats, R...
418    [Subject, SENIOR, DATA, SCIENTIST, opening, at...
419    [Subject, Handshake, Following, Up, Assan, SAN...
420    [Subject, Fulltime, Data, Engineer, ||, Dallas...
Name: Content, Length: 421, dtype: object

In [22]:
def remove_stopword(text):
    stop=set(stopwords.words('english'))
#     text_tok = word_tokenize(text)
    l=[]
    for word in text:
        if not word in stop:
            l.append(word)
    resultat=" ".join(l) 
    return resultat

In [23]:
df.Body_doc = df.Body_doc.apply(remove_stopword)

In [24]:
df.Body_doc

0      Ginko The company At Ginko Digital specialists...
1      Nexity Company Description 1st platform real e...
2      42c Who In agile mode provide answers expectat...
3      RS2i Who At RS2i committed achieving digital t...
4      McKinsey Company Who You 'll Work With Based P...
                             ...                        
416    Subject Full stack Java Developer Atlanta GA| ...
417    8/24/22 7:21 PM Gmail Last 100 Seats Remaining...
418    Subject SENIOR DATA SCIENTIST opening SHOPIFY ...
419    Subject Handshake Following Up Assan SANOGO Fr...
420    Subject Fulltime Data Engineer || Dallas TX Re...
Name: Content, Length: 421, dtype: object

# Tokenization with  spacy

In [25]:
nlp = spacy.load('en_core_web_md')
# tokens = nlp(words)

In [26]:
def nlp_spacy(text):
    tokens=[]
    entities=[]
    for sentence in range(len(text)):
        resultat=nlp(text[sentence])
        entity = resultat.ents
        tokens.append(resultat) 
        entities.append(entity) 
    
    return tokens,entities

In [27]:
tokens,entitie = nlp_spacy(df.Body_doc)

In [36]:
job_description = df[df['label1']== 'job_opp']
job_description.shape

(181, 6)

In [40]:
# X_Data = job_description["Content"]
# Y_Data = job_description["job_title"]
X_Data = job_description["Content"]
Y_Data = job_description["label2"]

cv = CountVectorizer()
X_Data = cv.fit_transform(X_Data)
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X_Data, Y_Data,test_size=0.3,random_state=101)
model = tree.DecisionTreeClassifier()
model.fit(X_Train,Y_Train)
predicted = model.predict(X_Test)
print(classification_report(Y_Test, predicted))

                   precision    recall  f1-score   support

Bigdata Developer       0.00      0.00      0.00         1
    data_engineer       0.96      1.00      0.98        43
   data_scientist       1.00      0.14      0.25         7
             link       0.00      0.00      0.00         0
            other       0.38      0.75      0.50         4

         accuracy                           0.85        55
        macro avg       0.47      0.38      0.35        55
     weighted avg       0.90      0.85      0.83        55



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# tree.plot_tree(model)