<h1 align="center">
(Multi-text Text Classification Task)
</h1>



<h3>
By: Youssef Ahmed Hassan
</h3>

## Imports

In [42]:
from flask import Flask, jsonify, request, render_template, redirect, jsonify
from flask_cors import CORS, cross_origin
from sklearn.model_selection import KFold
from sklearn import naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import re

In [2]:
# Fix NumPy's global random state so the shuffle below is repeatable between runs
np.random.seed(seed=1)

## Read, Shuffle and Visualize some of the Training data

In [3]:
# Load the labelled job titles, shuffle every row (frac=1 keeps all of them)
# and renumber the index from 0 so positional and label access coincide.
_data = (
    pd.read_csv('Job titles and industries.csv')
    .sample(frac=1)
    .reset_index(drop=True)
)
_data.head()

Unnamed: 0,job title,industry
0,it support,IT
1,junior .net developer,IT
2,graduate software engineer,IT
3,sports coach in coventry,Education
4,1:1 sen ta / asd ta / lsa needed in ealing - a...,Education


## Pre-Processing of the Data
### Tokenization and Text Normalization

In [4]:
# Tokenization function

# we should remove any salary "number" value since it does not have a majority for certain class
# we should replace - and , with white space 
# we should make sure that all the words are in lowercase letters
# we don't need to remove all special symbols like # because they may be related to the class, e.g. c# -> IT
# we will use regular expressions 

def pre_processing(text):
    """Normalise a raw job-title string before vectorisation.

    Steps, applied in order:
      1. delete the characters ``? $ ! & * . £ ( ) |``
      2. replace the separators ``, / \\ -`` with a single space each
      3. delete the ASCII digits 0-9
      4. lower-case the result

    Other symbols (for example ``#``) are intentionally left untouched.

    Args:
        text: the job title as a ``str``.

    Returns:
        The normalised string. Runs of spaces are not collapsed.
    """
    # Inside [...] no alternation bar is needed; every listed character is a
    # literal. '|' is kept in the set on purpose because earlier versions of
    # this function removed it as well.
    new_text = re.sub(r'[?$!&*.£()|]', '', text)
    # '\\\\' in the pattern is one literal backslash; the previous pattern
    # '\|' was an escaped pipe, so backslashes were never replaced.
    new_text = re.sub(r'[,/\\-]', ' ', new_text)
    new_text = re.sub(r'[0-9]', '', new_text)  # remove numbers (e.g. salaries)
    return new_text.lower()

In [5]:

# Normalise every job title in one pass and assign the whole column back.
# Writing through chained indexing (_data['job title'][i] = ...) raises
# SettingWithCopyWarning and does not modify the frame at all when pandas
# copy-on-write is active, so the column is replaced as a unit instead.
_data['job title'] = _data['job title'].map(pre_processing)
    

## Split Data into Train, and Validate (Test) Data

In [6]:
# K-fold cross validation was drafted but is not used: a single hold-out
# split was judged sufficient for a data set of this size. The draft is kept
# below as an inert string literal.

'''
X = _data['job title']
Y = _data['industry']
kf = KFold(n_splits=10) # Define the split into 10 folds 
no_of_folds = kf.get_n_splits()
for train_index, test_index in kf.split(_data):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    
print(no_of_folds)

'''

# Hold out the last 20% of the rows for validation; the final model is later
# retrained on the full data. shuffle=False keeps the current row order.
Train_X, Test_X, Train_y, Test_y = train_test_split(
    _data['job title'],
    _data['industry'],
    test_size=0.2,
    shuffle=False,
)


In [7]:
# Group the training titles (input) by the industry (output) they are labelled with
it_df = Train_X.loc[Train_y.eq("IT")]
marketing_df = Train_X.loc[Train_y.eq("Marketing")]
education_df = Train_X.loc[Train_y.eq("Education")]
accountancy_df = Train_X.loc[Train_y.eq("Accountancy")]

In [8]:
# Peek at the first five IT titles
it_df.head(n=5)

0                                           it support
1                                 junior net developer
2                           graduate software engineer
6    full stack php developer   immense variety of ...
7                                           head of it
Name: job title, dtype: object

## The following cells are just for more insight into the data

In [9]:
# Vocabulary of the whole training split: every distinct whitespace-separated token
all_vocab = {token for title in Train_X for token in title.split()}
        

In [10]:
# Show the vocabulary followed by its size, one per line
print(all_vocab, len(all_vocab), sep="\n")

{'psychology', 'harlow', 'erp', 'immense', 'global', 'transition', 'aws', 'alexandria', 'nasr', 'help', 'coupar', 'ssrs', 'medical', 'residential', 'qube', 'chain', 'autistic', 'invoicing', 'digital', 'adwords', 'unique', 'moga', 'jnr', 'snr', 'all', 'cardiff', 'c#', 'implementer', 'communcations', 'build', 'internship', 'games', 'trainer', 'blog', 'azure', 'script', 'city', 'amazon', 'wm', 'northfield', 'care', 'manual', 'kharar', 'staffordshire', 'ilford', 'into', 'materials', 'api', 'edge', 'psychologist', 'hove', 'commodities', 'producer', 'bonus', 'robotics', 'newcastle', 'cissp', 'remote', 'ranorex', 'migration', 'franchise', 'lab', 'tdd', 'aspiring', 'ebd', 'efl', 'electrical', 'motorsport', 'italian', 'vuejs', 'entry', 'poole', 'coordination', 'ecommerce', 'platforms', 'less', 'jc', 'midday', 'intelligence', 'vip', 'de', 'arabia', 'sw', 'geography', 'ci', 'immigration', 'vollzeit', 'middleweight', 'juniper', 'pen', 'conversion', 'evaluation', 'first', 'onboarding', 'hours', 'ab

In [11]:
# Vocabulary of the IT titles: every distinct whitespace-separated token
it_vocab = {token for title in it_df for token in title.split()}

In [12]:
# Show the IT vocabulary followed by its size, one per line
print(it_vocab, len(it_vocab), sep="\n")

{'connectivity', 'erp', 'immense', 'pmo', 'global', 'transition', 'design', 'consultants', 'canteen', 'order', 'workplace', 'aws', 'danish', 'retail', 'associate', 'turkish', 'usability', 'term', 'help', 'management', 'unix', 'sc', 'test', 'ssrs', 'controls', 'eligible', 'access', 'platform', 'medical', 'spectroscopy', 'vendor', 'qube', 'revenue', 'seo', 'reliability', 'digital', 'unique', 'jnr', 'snr', 'all', 'soa', 'cardiff', 'coordinator', 'citrix', 'sci', 'c#', 'technical', 'build', 'market', 'internship', 'games', 'travel', 'datacentre', 'trainer', 'azure', 'planning', 'clinical', 'powershell', 'script', 'city', 'sccm', 'wm', 'camden', 'manual', 'simulation', 'culture', 'servicenow', 'york', 'good', 'into', 'multi', 'api', 'level', 'elearning', 'edge', 'educational', 'manufacturing', 'sector', 'ccna', 'db', 'eu', 'studio', 'bonus', 'technician', 'robotics', 'newcastle', 'architect', 'js', 'duty', 'engineer', 'analyst:', 'cissp', 'remote', 't', 'cisa', 'ranorex', 'migration', 'fren

In [13]:
# Vocabulary of the Marketing titles: every distinct whitespace-separated token
marketing_vocab = {token for title in marketing_df for token in title.split()}

In [14]:
# Show the Marketing vocabulary followed by its size, one per line
print(marketing_vocab, len(marketing_vocab), sep="\n")

{'law', 'pubic', 'global', 'editor', 'restaurants', 'design', 'consultants', 'restructuring', 'region', 'retail', 'associate', 'alexandria', 'manufacturer', 'ceo', 'term', 'help', 'management', 'graphic', 'bramfeld', 'relations', 'medical', 'eco', 'residential', 'person', 'agents', 'kassel', 'chain', 'seo', 'scale', 'digital', 'governmental', 'adwords', 'deep', 'buying', 'accounts', 'ophthalmology', 'jnr', 'all', 'coordinator', 'implementer', 'technical', 'communcations', 'market', 'internship', 'travel', 'bd', 'blog', 'hamburg', 'planning', 'city', 'fitness', 'amazon', 'camden', 'care', 'manual', 'fmcg', 'protection', 'kantar', 'sunninghill', 'into', 'level', 'technologist', 'commodities', 'sector', 'onsite', 'producer', 'eu', 'studio', 'teilzeit', 'bonus', 'paddy', 'telesales', 'scientific', 'this', 'irmonerstraße', 'engineer', 'coordnator', 'ppc', 'portfolio', 't', 'french', 'call', 'franchise', 'speaker', 'electrical', 'opportunity', 'to', 'activities', 'direct', 'telemarketing', '

In [15]:
# Vocabulary of the Education titles: every distinct whitespace-separated token
education_vocab = {token for title in education_df for token in title.split()}

In [16]:
# Show the Education vocabulary followed by its size, one per line
print(education_vocab, len(education_vocab), sep="\n")

{'primaire', 'psychology', 'harlow', 'daily', 'editor', 'global', 'design', 'additional', 'liverpool', 'childhood', 'station', 'mental', 'associate', 'nasr', 'alexandria', 'ceo', 'term', 'personnel', 'esl', 'help', 'coupar', 'educator', 'headteacher', 'graphic', 'australian', 'resistant', 'controls', 'wrexham', 'warwickshire', 'workers', 'residential', 'dubai', "supervisor's", 'autistic', 'digital', 'teachi', 'moga', 'kg', 'all', 'coordinator', 'invigilator', 'alperton', 'technical', 'gants', 'internship', 'bd', 'travel', 'we’re', 'rc', 'trainer', 'elementary', 'city', 'northfield', 'camden', 'care', 'manual', 'kharar', 'welding', 'staffordshire', 'ilford', 'education', 'materials', 'level', 'swinton', 'turkey', 'educational', 'facilitator', 'psychologist', 'hove', 'tweedbank', 'haslemere', 'cheltenham', 'baxenden', 'coach', 'technician', 'robotics', 'telesales', 'physical', 'hill', 'engineer', 'officer:', 'behavioral', 'mat', 'phase', 'tourism', 'french', 'call', 'lab', 'speaker', 'as

In [17]:
# Vocabulary of the Accountancy titles: every distinct whitespace-separated token
accountancy_vocab = {token for title in accountancy_df for token in title.split()}

In [18]:
# Show the Accountancy vocabulary followed by its size, one per line
print(accountancy_vocab, len(accountancy_vocab), sep="\n")

{'office', 'graduate', 'supervisor', 'with', 'accounting', 'asap', 'global', 'qualified', 'restructuring', 'liverpool', 'administration', 'track', 'full', 'bulgarian', 'allocator', 'order', 'would', 'aca', 'suit', 'retail', 'term', 'adminstrator', 'management', 'speaking', 'keeper', 'pricing', 'data', 'trainee', 'controls', 'exp', 'scheme', 'platform', 'lead', 'deputy', 'senior', 'position', 'australia', 'start', 'international', 'night', 'chesterfield', 'invoicing', 'representative', 'portuguese', 'audit', 'south', 'get', 'personal', 'london', 'sales', 'accounts', 'role', 'finance', 'purchase', 'dutch', 'acca', 'junior', 'coordinator', 'expatriate', 'collection', 'four', 'x', 'pay', 's', 'internship', 'assistant', 'training', 'ledger', 'immediate', 'contractors', 'harrogate', 'city', 'head', 'royalties', 'required', 'for', 'interim', 'commisions', 'practice', 'payroll', 'up', 'accountancy', 'protection', 'sub', 'accountant', 'purchasing', 'into', 'summer', 'multi', 'level', 'part', 's

## Try M.L Algorithms 
### Naive Bayes and SVM

In [19]:
# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to the validation labels. Calling fit_transform on Test_y
# would re-fit the encoder, so the integers could stand for different classes
# than in Train_Y whenever the two splits do not contain the same label set.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_y)
Test_Y = Encoder.transform(Test_y)

#### 0-> Accountancy
#### 1-> Education 
#### 2-> IT
#### 3-> Marketing

In [20]:
# Map each integer class code back to its industry name (codes start at 0)
encoded_to_label = dict(enumerate(["Accountancy", "Education", "IT", "Marketing"]))

In [21]:
# Bag-of-words features. The vocabulary is learned from the training titles
# only, so the validation titles cannot influence it and the measured
# performance is not inflated by leakage.
count_vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=1820,
).fit(Train_X)

# Turn both splits into count matrices using that training vocabulary
x_train_count, x_test_count = (
    count_vect.transform(titles) for titles in (Train_X, Test_X)
)

### Naive Bayes

In [22]:
# Train a multinomial Naive Bayes classifier on the training counts
Naive = naive_bayes.MultinomialNB().fit(x_train_count, Train_Y)

# Classify the held-out validation titles
predictions_NB = Naive.predict(x_test_count)

# Report the share of correct predictions as a percentage
print(
    "Naive Bayes Accuracy Score -> ",
    accuracy_score(y_true=Test_Y, y_pred=predictions_NB)*100,
)

Naive Bayes Accuracy Score ->  92.20023282887078


### SVM

In [23]:
# Train a linear-kernel support vector classifier on the training counts
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(x_train_count, Train_Y)

# Classify the held-out validation titles
predictions_SVM = SVM.predict(x_test_count)

# Report the share of correct predictions as a percentage
print(
    "SVM Accuracy Score -> ",
    accuracy_score(y_true=Test_Y, y_pred=predictions_SVM)*100,
)

SVM Accuracy Score ->  93.0151338766007


### >> SVM gives us slightly better accuracy, but we will use the Naive Bayes model since the difference is very small and Naive Bayes has the advantage of using prior probabilities, which are very important: for example, if a given example has no words in our vocabulary, the model will classify it based on the priors only.

## Final Model (Naive)

In [24]:
# Rebuild the vectorizer for the final model; its vocabulary now comes from
# every job title, not just the training split.
count_vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=1820,
).fit(_data['job title'])

# Count matrix for the complete data set
X = count_vect.transform(_data['job title'])

# Integer-encode the industry labels of the complete data set
Encoder = LabelEncoder()
Y = Encoder.fit_transform(_data['industry'])


### Calculate Prior probabilities

In [25]:
# Job titles of each industry, selected from the full data set
it = _data.loc[_data['industry'].eq("IT"), 'job title']
marketing = _data.loc[_data['industry'].eq("Marketing"), 'job title']
education = _data.loc[_data['industry'].eq("Education"), 'job title']
accountancy = _data.loc[_data['industry'].eq("Accountancy"), 'job title']

# Number of examples per industry
n_it, n_marketing, n_education, n_accountancy = (
    len(titles) for titles in (it, marketing, education, accountancy)
)

# Total over the four industries
n_totall = sum((n_it, n_marketing, n_education, n_accountancy))

print(n_it, n_marketing, n_education, n_accountancy, sep="\n")

4746
2031
1435
374


In [26]:
# Prior probability of each industry = its share of all examples
p_accountancy, p_education, p_it, p_marketing = (
    count / n_totall
    for count in (n_accountancy, n_education, n_it, n_marketing)
)

#### 0-> Accountancy
#### 1-> Education 
#### 2-> IT
#### 3-> Marketing

# Listed in the class-code order given above
priors = [p_accountancy, p_education, p_it, p_marketing]

In [27]:
# Final classifier: multinomial Naive Bayes with the explicit class priors,
# trained on every example (the SVM alternative is not used here).
Naive = naive_bayes.MultinomialNB(class_prior=priors)
Naive.fit(X, Y)


MultinomialNB(alpha=1.0,
       class_prior=[0.043559282552993246, 0.1671325413463778, 0.5527603074772887, 0.23654786862334032],
       fit_prior=True)

## Predict on sample examples

In [28]:
# Example 1: a title made of words the model has seen
test_string = ["teaching assistant"]
Test_X_counts = count_vect.transform(raw_documents=test_string)
test_sample = Naive.predict(X=Test_X_counts)

# Translate the predicted class code into its industry name
job_industry = encoded_to_label[test_sample[0]]
print(job_industry)

Education


In [29]:
# Example 2: a random string; the expected answer is the class with the
# highest probability
test_string = ["xxxzzzz"]
Test_X_counts = count_vect.transform(raw_documents=test_string)
test_sample = Naive.predict(X=Test_X_counts)

# Translate the predicted class code into its industry name
job_industry = encoded_to_label[test_sample[0]]
print(job_industry)

IT


## And yes! IT is the class with the highest prior, so now we have a good model to go

## Using Flask to create a RESTful API

In [48]:
# Create the Flask application and enable CORS on it
app = Flask(import_name=__name__)
CORS(app)

@app.route('/',methods=['GET'])
def get_job_title():
    """Predict the industry for the job title passed in the query string.

    Reads the ``job_title`` query parameter, normalises it with
    ``pre_processing``, vectorises it with ``count_vect`` and classifies it
    with the ``Naive`` model.

    Returns:
        A JSON response ``{"job_industry": <label>}`` on success, or a JSON
        ``{"error": ...}`` body with HTTP status 400 when ``job_title`` is
        not supplied.
    """
    job_title_text = request.args.get('job_title')

    # args.get() yields None for an absent parameter; passing None on to
    # pre_processing would raise TypeError inside re.sub and surface as a 500.
    if job_title_text is None:
        return jsonify({"error": "missing required query parameter 'job_title'"}), 400

    # Apply the same normalisation that was used on the training titles
    job_title_text = pre_processing(job_title_text)

    # Predict with the Naive Bayes model
    test_string   = [job_title_text]
    Test_X_sample = count_vect.transform(test_string)
    test_sample   = Naive.predict(Test_X_sample)

    # Map the predicted class code to its label
    job_industry = encoded_to_label[test_sample[0]]

    print('Job title:' + job_title_text)
    print('Job industry:' + job_industry)

    return jsonify({"job_industry": job_industry})
    
# Start Flask's built-in server on port 8080 when run as the main module
if __name__ == '__main__':
    app.run(port=8080)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8080/ (Press CTRL+C to quit)
127.0.0.1 - - [22/Jun/2019 22:21:20] "GET /?job_title=teacher%20assistant HTTP/1.1" 200 -


Job title:teacher assistant
Job industry:Education
