# This file will be used to get data and out Category predictions

### Install required modules from requirements.txt

if requirements.txt is not present, please request it.

In [1]:
!pip3 install -r requirements.txt
!{sys.executable} -m spacy download nl_core_news_sm

You should consider upgrading via the 'python -m pip install --upgrade pip' command.

You should consider upgrading via the 'python -m pip install --upgrade pip' command.

✔ Download and installation successful
You can now load the model via spacy.load('nl_core_news_sm')


In [1]:
import pandas as pd
from collections import Counter
import numpy as np
import spacy
import nl_core_news_sm
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
import re
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
#from sklearn.metrics import roc_auc_score
import xgboost as xgb
import xlrd
#from imblearn.over_sampling import SMOTE
from sklearn import svm, datasets
import joblib
import pickle
import easygui
import time
import openpyxl

### Token Lemmatizer method

This method recieves a Series of incedent report texts

This method returns a Series of the Lemmatized texts

In [8]:
def custom_lemmatizer_(data):
    return [(token.lemma_) for token in nlp(data) if not token.is_digit and not token.is_punct and not token.is_space]

### Data Preprocessing method

This method takes one argument:
- Dataframe containing Incident Reports

User will need to enter the column name that contains the incident report text

This method returns preprocessed a Series of Incident Report texts

In [9]:
def preprocessing(data):
    print("\n******************** Data Preprocessing ******************\n")

    column_name = ""
    column_exists = False

    while(column_name == "" or column_exists == False):
        column_name = input("Please indicate what column contains the Incident Reports: ")
        print("Column Name: ", column_name)
        for col in data.columns:
            if(str(column_name) == str(col)):
                column_exists = True
                break

        if(column_exists == False):
            print("The column name you provided does not exist!")

    nltk.download('stopwords')

    stopword_list = set(nltk.corpus.stopwords.words('dutch'))

    #add words that aren't in the NLTK stopwords list
    added_stopwords = ["b", "u", "e", "ste", "kv", "aa", "a9", "mtr", '+', '1e', '2e', '<', '>', 'a', 'a1', 'a12', 'a2', 'a9',"één", "gis", "app", "h", 'o.a.', 'oa', 'en/of', "uur"
    "km", "mm", "v", "kg", "cm", "mm²", "mm2", "0ngever", "e", "de", "m"]
    updated_stopword_list = stopword_list.union(added_stopwords)\

    # Dutch spacy tokens 
    # nlp = nl_core_news_sm.load()

    def custom_lemmatizer(data):
        return [(token.lemma_) for token in nlp(data) if not token.is_digit and not token.is_punct and not token.is_space]

    print("Loading Vectorizer...")
    
    # tfidf_vectorizer = TfidfVectorizer(max_features=12000,  stop_words=updated_stopword_list, max_df= 0.5, tokenizer=custom_lemmatizer)
    # tfidf_vectorizer = joblib.load("tfidf_vectorizer.sav")
    tfidf_vectorizer = pickle.load(open("tfidf_vectorizer.pickle", "rb"))

    print("Data Preprocessing...")
    text = data[column_name].astype(str)
    text = text.to_numpy()
    # text_tfidf = tfidf_vectorizer.fit(text)
    tfidf_text_val = tfidf_vectorizer.transform(text)

    return tfidf_text_val, column_name



### Prediction Method

This method takes two arguments: 
- A model
- A Series of incident texts

This method then saves a file with the predicted results

Predict file columns:
- Incident Report
- Model Classification
- Probability Scores

In [4]:
def get_predictions(model, text, data, column):
    print("\n******************** Getting Predictions ******************\n")
    prob = model.predict(text) #Get predictions
    prob_score = model.predict_proba(text) #Get prediction probsblity scores
    prob_results = {"Incident_ID": [],"Incident_Report":[],"Model_Classification":[], "Probability_Scores": []}
    output_filename = ""

    print("*Output will automatically be an Excel file (.xlsx)")
    
    while(output_filename == ""):
        output_filename = input("Please enter output file name: ")
    
    for i in range (0,prob.size):
        scores = []
        prob_results["Incident_ID"].append(data["Nr."])
        prob_results["Incident_Report"].append(data[column][i])
        prob_results["Model_Classification"].append(prob_results[i])
        prob_results["Probability_Score"].append(prob_score[i])

        p = ((i + 1) / prob.size) * 100
        print ("Creating Predict File: {}%".format(round(p,2)), end="\r")

    Prob_Results = pd.DataFrame(prob_results)

    Prob_Results.to_excel(output_filename, index=False)
    print("\nResults were saved in file: ", output_filename)

### Main (index) Method

In [10]:
def main():
    print("***************************************************************")
    print("Welcome to the Indicent Report Classification System!")
    print("***************************************************************\n")

    try:
        print("Please select file (.xlsx or .csv): ")
        filename = easygui.fileopenbox()
    except ValueError:
        print("\nPlease enter a valid file!")
        print("Please try again!")
    except:
        print("Something went wrong, please ty again or conact... Somebody...")

    model_name = "Category Ongeval(pototype).sav"

    # Dutch spacy tokens 
    nlp = nl_core_news_sm.load()

    data = pd.read_excel(filename)

    model = joblib.load(model_name)

    text, column_name = preprocessing(data)

    get_predictions(model, text, data, column_name)


main()

***************************************************************
Welcome to the Indicent Report Classification System!
***************************************************************

Please select file (.xlsx or .csv): 

******************** Data Preprocessing ******************

Column Name:  Beschrijving incident
Loading Vectorizer...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\micks\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


AttributeError: Can't get attribute 'custom_lemmatizer' on <module '__main__'>

In [5]:
filename = easygui.fileopenbox()

C:\Users\micks\Projects\Candi Saftey Data Classification\Heijmans data final.xlsx


In [18]:
output_filename = ""
print(len(output_filename))

print("*Output will automatically be an Excel file (.xlsx)")
    
while(output_filename == ""):
    output_filename = input("Please enter output file name: ")

output_filename += ".xlsx"

print(output_filename)

0
*Output will automatically be an Excel file (.xlsx)
Bread.xlsx
