# VN_TEAS: Public procurement criteria assessment: Text analytics and machine learning approach

## Mapping all pdf files within tender folders and extract text content and put them into a dataframe

In [41]:
#install package (tika) required for pdf text parsing
conda install -c conda-forge tika

In [5]:
#import required packages
import os, glob
from tika import parser 
from pandas import DataFrame

In [44]:
# What file extension to find, and where to look from
ext = "*.pdf"
PATH = "/Users/aharash/Downloads/VN_TEAS_BIGDATA"

# Find all the files with that extension
files = []
for dirpath, dirnames, filenames in os.walk(PATH):
    files += glob.glob(os.path.join(dirpath, ext))

# Create a Pandas Dataframe to hold the filenames and the text
df = DataFrame(columns=("filename","text"))

# Process each file in turn, parsing with Tika and storing in the dataframe
for idx, filename in enumerate(files):
   data = parser.from_file(filename)
   text = data["content"]
   df.loc[idx] = [filename, text]

# For debugging, print what we found
df

Unnamed: 0,filename,text
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
...,...,...
7514,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
7515,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
7516,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...
7517,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...


In [45]:
#priliminary text cleaning
df['space_remove'] = df['text'].str.replace('\n',' ')
#convert to lower case
df['cleaned_text_lower'] = df['space_remove'].str.lower()
#change type to str
df['cleaned_text_lower']=df['cleaned_text_lower'].apply(str)
df

Unnamed: 0,filename,text,space_remove,cleaned_text_lower
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...
...,...,...,...,...
7514,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...
7515,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...
7516,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...
7517,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Microso...,microso...


## Text processing

In [46]:
#import required packages
import nltk
from nltk import word_tokenize, sent_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
nltk.download
import re

In [47]:
# Criteria finder
SME_Criteria = r'jaettu|tarjota\syhteen|tarjota\suseampaan|mikroyritys|pieni\stai\skeskisuuri\syritys|aliurakoitsjia|dynaaminen\stoimittajarekisteri|kevennetty\skilpailutus|minikilpailutus|tilata\ssuoraan|kevennetty\skilpailutus|kertahankinta|hankkia\suseammalta\stuottajalta'
df["SME_Criteria"] = df["cleaned_text_lower"].apply(lambda x: re.findall(SME_Criteria, x, flags=re.IGNORECASE))

Employment_Criteria= r'työllistämisvelvoite|työttömien|työllistymiseen|työllistämisehto|työllistäminen|hankinnoista\sduunia|heikossa\styömarkkina-asemassa|työllistymistä|palkkatukityöllistämisen\skuvaus|tarjoamaan\styöpaikan|työttömiksi\styönhakijoiksi|heikossa\styömarkkina-asemassa|oikeutettuja\spalkkatukeen|oppisopimuskoulutettavat|oppisopimuskoulutukseen|oppisopimuskoulutus|työllistämään|työttömiä|palkkatuki|työllistämisen|työttömiksi\styönhakijoiksi|palkkatukeen|heikossa\styömarkkina-asemassa|työllistäminen|tarjoamaan\styöpaikan|työttömiksi\styönhakijoiksi|oikeutettuja\spalkkatukeen|tarjota_työpaikkaa|heikossa\styömarkkina-asemassa|oikeutettu\spalkkatukeen|työllistämisvelvoite|työllistämistoimet|työttömäksi\styönhakijaksi|palkkatuki|työllistämisen|palkkatuki'
df["Employment_Criteria"] = df["cleaned_text_lower"].apply(lambda x: re.findall(Employment_Criteria, x, flags=re.IGNORECASE))

In [12]:
#df.SME_Criteria.value_counts()

In [48]:
df

Unnamed: 0,filename,text,space_remove,cleaned_text_lower,SME_Criteria,Employment_Criteria
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[jaettu],[]
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,"[jaettu, jaettu]",[]
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
...,...,...,...,...,...,...
7514,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7515,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7516,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7517,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Microso...,microso...,[],[]


In [49]:
df.to_pickle("Final.pkl")

In [None]:
# Lableing

In [25]:
import pandas as pd
df = pd.read_pickle("Final.pkl")
df

Unnamed: 0,filename,text,space_remove,cleaned_text_lower,SME_Criteria,Employment_Criteria
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[jaettu],[]
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,"[jaettu, jaettu]",[]
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"[mikroyritys, mikroyritys, pieni tai keskisuur...",[]
...,...,...,...,...,...,...
7514,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7515,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7516,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,[],[]
7517,/Users/aharash/Downloads/VN_TEAS_BIGDATA/25031...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Microso...,microso...,[],[]


In [26]:
df['SME_Criteria'] = df['SME_Criteria'].apply(lambda x: str(x).replace('[', ''))
df['SME_Criteria'] = df['SME_Criteria'].apply(lambda x: str(x).replace(']', ''))
df.loc[df.SME_Criteria == '', 'SME_Criteria'] = 'nan'

df['SME_Label']=df['SME_Criteria'].str.contains('nan')

bool_cols = df.columns[df.dtypes == 'bool']

df[bool_cols] = df[bool_cols].replace({True: 'Not_SME', False: 'Yes_SME'})
df.head(20)

Unnamed: 0,filename,text,space_remove,cleaned_text_lower,SME_Criteria,Employment_Criteria,SME_Label
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",[],Yes_SME
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,'jaettu',[],Yes_SME
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,"'jaettu', 'jaettu'",[],Yes_SME
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",[],Yes_SME
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",[],Yes_SME
5,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,[],Not_SME
6,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,[],Not_SME
7,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Mic...,mic...,,[],Not_SME
8,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,[],Not_SME
9,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Micr...,micr...,,[],Not_SME


In [27]:
df['Employment_Criteria'] = df['Employment_Criteria'].apply(lambda x: str(x).replace('[', ''))
df['Employment_Criteria'] = df['Employment_Criteria'].apply(lambda x: str(x).replace(']', ''))
df.loc[df.Employment_Criteria == '', 'Employment_Criteria'] = 'nan'

df['Employment_Label']=df['Employment_Criteria'].str.contains('nan')

bool_cols = df.columns[df.dtypes == 'bool']
df[bool_cols] = df[bool_cols].replace({True: 'Not_Employment', False: 'Yes_Employment'})

df.head(20)

Unnamed: 0,filename,text,space_remove,cleaned_text_lower,SME_Criteria,Employment_Criteria,SME_Label,Employment_Label
0,/Users/aharash/Downloads/VN_TEAS_BIGDATA/16335...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",,Yes_SME,Not_Employment
1,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,'jaettu',,Yes_SME,Not_Employment
2,/Users/aharash/Downloads/VN_TEAS_BIGDATA/28281...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,"'jaettu', 'jaettu'",,Yes_SME,Not_Employment
3,/Users/aharash/Downloads/VN_TEAS_BIGDATA/30391...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",,Yes_SME,Not_Employment
4,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Tarjou...,tarjou...,"'mikroyritys', 'mikroyritys', 'pieni tai keski...",,Yes_SME,Not_Employment
5,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,,Not_SME,Not_Employment
6,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,,Not_SME,Not_Employment
7,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Mic...,mic...,,,Not_SME,Not_Employment
8,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,...,...,,,Not_SME,Not_Employment
9,/Users/aharash/Downloads/VN_TEAS_BIGDATA/20404...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,Micr...,micr...,,,Not_SME,Not_Employment


In [28]:
df.to_pickle("Final_Label.pkl")

# Text transformation_SME

In [2]:
import pandas as pd
df = pd.read_pickle("Final_Label.pkl")

In [3]:
df = df.dropna()
df.shape

(6825, 8)

In [4]:
import pandas as pd
import re
import string
import nltk
#pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('finnish')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
#text cleaning function
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [42]:
# increase jupyter notebook memory resources if needed from command line
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [7]:
# For featurizing all the text
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['cleaned_text_lower'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
#transform bag of word to TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

#SME criteria words
words = ['jaettu', 'tarjota yhteen', 'tarjota useampaan', 'mikroyritys', 'pieni tai keskisuuri yritys', 'aliurakoitsjia', 'dynaaminen toimittajarekisteri', 'kevennetty kilpailutus', 'minikilpailutus', 'tilata suoraan', 'kertahankinta', 'hankkia useammalta tuottajalta']
TfidfVectorizer(vocabulary=words, stop_words = 'finnish', ngram_range=(1,4))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='finnish', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True,
                vocabulary=['jaettu', 'tarjota yhteen', 'tarjota useampaan',
                            'mikroyritys', 'pieni tai keskisuuri yritys',
                            'aliurakoitsjia', 'dynaaminen toimittajarekisteri',
                            'kevennetty kilpailutus', 'minikilpailutus',
                            'tilata suoraan', 'kertahankinta',
                            'hankkia useammalta tuottajalta'])

In [9]:
mod_tfidf = TfidfVectorizer(vocabulary=words, ngram_range=(1,4))
X_tfidf = mod_tfidf.fit_transform(words)

In [10]:
X2_tfidf = mod_tfidf.transform(df['cleaned_text_lower'])

In [12]:
#transform TFIDF to a dataframe
X_tfidf_df = pd.DataFrame(X2_tfidf.toarray())
X_tfidf_df.columns = mod_tfidf.get_feature_names()
X_tfidf_df
#X_tfidf_df.shape

Unnamed: 0,jaettu,tarjota yhteen,tarjota useampaan,mikroyritys,pieni tai keskisuuri yritys,aliurakoitsjia,dynaaminen toimittajarekisteri,kevennetty kilpailutus,minikilpailutus,tilata suoraan,kertahankinta,hankkia useammalta tuottajalta
0,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.707107,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6820,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6821,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6822,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6823,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Machine Learning_SME

In [13]:
#import packages
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_df, df['SME_Label'], test_size=0.2, random_state=0, stratify=df['SME_Label'])

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [17]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:20]

[(0.4137933374378314, 'jaettu'),
 (0.17250936782162302, 'mikroyritys'),
 (0.16934548557824033, 'kevennetty kilpailutus'),
 (0.13550360892524743, 'pieni tai keskisuuri yritys'),
 (0.06460049941146224, 'minikilpailutus'),
 (0.02187261227885441, 'kertahankinta'),
 (0.013980260078353278, 'tilata suoraan'),
 (0.005520087683610208, 'dynaaminen toimittajarekisteri'),
 (0.002874740784777652, 'tarjota yhteen'),
 (0.0, 'tarjota useampaan'),
 (0.0, 'hankkia useammalta tuottajalta'),
 (0.0, 'aliurakoitsjia')]

In [19]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='Yes_SME', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 0.994 / Recall: 0.941 / Accuracy: 0.991


In [21]:
# save the model to disk
import joblib
filename = 'finalized_MLmodel_SME.sav'
joblib.dump(rf_model, filename)

['finalized_MLmodel_SME.sav']

In [23]:
# load the model from disk
loaded_model = joblib.load('finalized_MLmodel_SME.sav')
result = loaded_model.score(X_test, y_test)
print(result)

0.9912087912087912


# Text transformation_Employment

In [24]:
import pandas as pd
df = pd.read_pickle("Final_Label.pkl")

In [25]:
df = df.dropna()
df.shape

(6825, 8)

In [26]:
import pandas as pd
import re
import string
import nltk
#pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('finnish')
ps = nltk.PorterStemmer()

from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
#text cleaning function
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [28]:
#transform bag of word to TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Employment criteria words
words = ['palkkatuki', 'palkkatukityöllistämisen kuvaus', 'työllistämisen', 'työllistäminen', 'heikossa työmarkkina-asemassa', 'työllistämistoimet', 'tarjoamaan työpaikan', 'oppisopimuskoulutukseen', 'työllistämään', 'oikeutettuja palkkatukeen', 'työttömiä', 'työttömiksi työnhakijoiksi', 'oikeutettu palkkatukeen', 'työttömäksi työnhakijaksi', 'hankinnoista duunia', 'työllistymistä', 'oppisopimuskoulutus', 'työllistämisehto', 'työllistämisvelvoite', 'oppisopimuskoulutettavat', 'palkkatukeen']
TfidfVectorizer(vocabulary=words, stop_words = 'finnish', ngram_range=(1,4))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 4), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='finnish', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=Non...
                            'heikossa työmarkkina-asemassa',
                            'työllistämistoimet', 'tarjoamaan työpaikan',
                            'oppisopimuskoulutukseen', 'työllistämään',
                            'oikeutettuja palkkatukeen', 'työttömiä',
                            'työttömiksi työnhakijoiksi',
                            'oikeutettu palkkatukeen',
                            'työttömäksi työnhakijaksi', 'hankinnoista duunia',
                            'työllistymistä', 'oppisopimusko

In [31]:
mod_tfidf = TfidfVectorizer(vocabulary=words, ngram_range=(1,4))
X_tfidf = mod_tfidf.fit_transform(words)

In [32]:
X2_tfidf = mod_tfidf.transform(df['cleaned_text_lower'])

In [33]:
#transform TFIDF to a dataframe
X_tfidf_df = pd.DataFrame(X2_tfidf.toarray())
X_tfidf_df.columns = mod_tfidf.get_feature_names()
X_tfidf_df
#X_tfidf_df.shape

Unnamed: 0,palkkatuki,palkkatukityöllistämisen kuvaus,työllistämisen,työllistäminen,heikossa työmarkkina-asemassa,työllistämistoimet,tarjoamaan työpaikan,oppisopimuskoulutukseen,työllistämään,oikeutettuja palkkatukeen,...,työttömiksi työnhakijoiksi,oikeutettu palkkatukeen,työttömäksi työnhakijaksi,hankinnoista duunia,työllistymistä,oppisopimuskoulutus,työllistämisehto,työllistämisvelvoite,oppisopimuskoulutettavat,palkkatukeen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Machine Learning_Employment

In [34]:
#import packages
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_df, df['Employment_Label'], test_size=0.2, random_state=0, stratify=df['Employment_Label'])

In [36]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=50, max_depth=20, n_jobs=-1)
rf_model = rf.fit(X_train, y_train)

In [37]:
sorted(zip(rf_model.feature_importances_, X_train.columns), reverse=True)[0:20]

[(0.24395392890683862, 'palkkatukityöllistämisen kuvaus'),
 (0.1382457025183335, 'työllistäminen'),
 (0.12864928783435403, 'työllistämisen'),
 (0.10646820795849735, 'palkkatuki'),
 (0.0833690843969528, 'palkkatukeen'),
 (0.054620162107257876, 'oppisopimuskoulutukseen'),
 (0.04024903227811066, 'työllistämään'),
 (0.037156467949457865, 'tarjoamaan työpaikan'),
 (0.03435626527849528, 'oikeutettuja palkkatukeen'),
 (0.032724995076233186, 'työttömiä'),
 (0.021394864874641855, 'työllistämistoimet'),
 (0.019504997880491166, 'oppisopimuskoulutettavat'),
 (0.01432288383980707, 'työllistymistä'),
 (0.013992405614012583, 'oikeutettu palkkatukeen'),
 (0.013893573997007913, 'työttömiksi työnhakijoiksi'),
 (0.005486339869623317, 'työttömäksi työnhakijaksi'),
 (0.00538089720219547, 'työllistämisehto'),
 (0.0024308309730067044, 'oppisopimuskoulutus'),
 (0.002411475328354593, 'hankinnoista duunia'),
 (0.0013885961163282996, 'työllistämisvelvoite')]

In [38]:
y_pred = rf_model.predict(X_test)
precision, recall, fscore, support = score(y_test, y_pred, pos_label='Yes_Employment', average='binary')
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                        round(recall, 3),
                                                        round((y_pred==y_test).sum() / len(y_pred),3)))

Precision: 1.0 / Recall: 0.956 / Accuracy: 0.999


In [39]:
# save the model to disk
import joblib
filename = 'finalized_MLmodel_Employment.sav'
joblib.dump(rf_model, filename)

['finalized_MLmodel_Employment.sav']

In [40]:
# load the model from disk
loaded_model = joblib.load('finalized_MLmodel_Employment.sav')
result = loaded_model.score(X_test, y_test)
print(result)

0.9985347985347985
