# Importing libraries

In [1]:
import pdftotext
import pandas as pd
import numpy as np
import string
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Extracting text from PDFs

In [2]:
documents = []

filenames = []

for filename in os.listdir('training'):
    
    if 'pdf' in filename:
        
        filenames.append(int(filename[:-4]))
        
        with open('training/' + filename, 'rb') as file:

            text = ''
            
            for page in pdftotext.PDF(file):
                
                text += page
                
    documents.append(text)

# Preparing a dataframe

In [3]:
data = pd.DataFrame()

data['id'] = filenames

data['text'] = documents

In [4]:
data.head()

Unnamed: 0,id,text
0,160,Australia’s #1 job site\n\n\nIT A...
1,174,Australia’s #1 job site\n\n\n\n\nM...
2,148,Australia’s #1 job site\n\n\nTe...
3,49,Australia’s #1 job site\n\n\nMarke...
4,61,Australia’s #1 job site\n\n\n\n\n...


# Cleaning text

In [5]:
# A function to clean the text
def clean_text(text):
    
    # remove punctutations
    text = re.sub("[^a-zA-Z]"," ",text)
    
    # remove numbers
    text = re.sub(r'\d+', '', text)
    
    #convert to lower
    text = text.lower()
    
    #remove whitespaces
    text = ' '.join(text.split())
    
    return text

In [6]:
data['text'] = data['text'].apply(lambda x: clean_text(x))

In [7]:
data.head()

Unnamed: 0,id,text
0,160,australia s job site it application developer ...
1,174,australia s job site microsoft modern workplac...
2,148,australia s job site test analyst professional...
3,49,australia s job site marketing and communicati...
4,61,australia s job site business analyst murdoch ...


# Removing stopwords

In [8]:
stop = set(stopwords.words('english'))

# A function to remove stopwords and short length words (<2)
def remove_stopwords(text): 
    
    new = []
    for word in text.split():
        if word not in stop and len(word) >1:
            new.append(word)
            
    return ' '.join(new)

In [9]:
data['text'] = data['text'].apply(lambda x: remove_stopwords(x))

In [10]:
data.sort_values(by = 'id', inplace=True)

data.reset_index(drop = True, inplace=True)

In [11]:
data.head()

Unnamed: 0,id,text
0,1,australia job site senior data scientist seek ...
1,2,australia job site senior data engineer langua...
2,3,australia job site data scientist victoria uni...
3,4,australia job site innovation data scientist e...
4,5,australia job site data scientist ausnet servi...


# Reading pre-labeled target classes

In [12]:
training_labels = pd.read_csv('training.csv')

In [13]:
training_labels.head()

Unnamed: 0,id,target1,target2,target3
0,1,1,1,1
1,2,0,0,0
2,3,0,0,1
3,4,0,0,0
4,5,1,0,1


In [14]:
data = pd.merge(data, training_labels)

In [15]:
data.head()

Unnamed: 0,id,text,target1,target2,target3
0,1,australia job site senior data scientist seek ...,1,1,1
1,2,australia job site senior data engineer langua...,0,0,0
2,3,australia job site data scientist victoria uni...,0,0,1
3,4,australia job site innovation data scientist e...,0,0,0
4,5,australia job site data scientist ausnet servi...,1,0,1


# Splitting data and Implementing bag of words

In [16]:
# `BAG OF WORDS`
countvector= CountVectorizer(ngram_range=(2,2))

In [17]:
labels = ['target1', 'target2', 'target3']

In [18]:
text_features = countvector.fit_transform(data['text'])    
text_features = text_features.toarray()    
target_labels = np.array(data[labels])
Xtrain, Xtest, Ytrain, Ytest = train_test_split(text_features, target_labels, test_size=0.2, random_state = 200) 

# Creating and evaluating a machine learning model 

In [19]:
models = []

for i in range(3):
    
    rf = RandomForestClassifier(n_estimators=200)
    models.append(rf.fit(Xtrain, Ytrain[:,i]))

In [20]:
predictions = []
for model in models:
    predictions.append(model.predict(Xtest))

In [21]:
for i in range(3):
    print('\t Classification report for', labels[i], '\n')
    print(metrics.classification_report(Ytest[:,i], predictions[i],  digits=5))
#     plot_confusion_matrix(Ytest[i], predictions[i])
#     plot_roc_curve(Ytest[i], predictions[i])

	 Classification report for target1 

              precision    recall  f1-score   support

           0    0.55263   0.95455   0.70000        22
           1    0.50000   0.05556   0.10000        18

    accuracy                        0.55000        40
   macro avg    0.52632   0.50505   0.40000        40
weighted avg    0.52895   0.55000   0.43000        40

	 Classification report for target2 

              precision    recall  f1-score   support

           0    0.64103   1.00000   0.78125        25
           1    1.00000   0.06667   0.12500        15

    accuracy                        0.65000        40
   macro avg    0.82051   0.53333   0.45313        40
weighted avg    0.77564   0.65000   0.53516        40

	 Classification report for target3 

              precision    recall  f1-score   support

           0    0.45714   0.94118   0.61538        17
           1    0.80000   0.17391   0.28571        23

    accuracy                        0.50000        40
   macro avg  

In [22]:
def count_values(x):
    zeros = 0
    ones = 0
    for each in x:
        if each == 0:
            zeros += 1
        else:
            ones += 1
            
    print('0 :', zeros)
    print('1 :', ones)