In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.classify import apply_features
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob


In [4]:
def extract_features(text):
    tokens = word_tokenize(text.lower())
    features = {word: (word in tokens) for word in words}
    return features

stopwords = nltk.corpus.stopwords.words('english')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stopwords]
    filtered_token = [x for x in filtered_tokens if x not in [',','.','\'']]
    # print(f"Filtered Token: {filtered_tokens}")
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_token]
    processed_text = ' '.join(lemmatized_tokens)
    # print(f"Lemmatized text: {lemmatized_tokens}")
    return processed_text

In [5]:
from dummy_data import shift_log_data, workers_data, training_data

In [6]:
process_training_data = [preprocess_text(text) for text, _ in training_data]

In [7]:
process_training_data

['issue',
 'work completed',
 'routine inspection completed without issue',
 'minor delay shovel operation',
 'overburden removal delay due weather',
 'delay material supply',
 'work delayed due unforeseen circumstance',
 'drill malfunction',
 'blasting equipment issue ; delay obd removal',
 'drill performance issue',
 'unexpected machinery breakdown',
 'faulty conveyor belt',
 'routine maintenance',
 'maintenance scheduling conflict',
 'power outage affecting operation',
 'overtime work due shift handover delay',
 'unexpected increase workload',
 'supervisor late 12 hour',
 'insufficient raw material',
 'delayed delivery spare part',
 'inadequate stock essential supply',
 'shortage critical resource',
 'operator error causing delay',
 'incorrectly set machine parameter',
 'employee absence causing delay',
 'mistake task execution',
 'safety protocol breach',
 'emergency evacuation drill conducted',
 'safety gear available',
 'safety hazard identified workplace']

In [8]:
preprocess_text(text='Blasting equipment issue, issue in the machine chains and the engine of the machine')

'blasting equipment issue issue machine chain engine machine'

In [9]:
words = set(word for sentence in process_training_data for word in word_tokenize(sentence.lower()))

In [10]:
extract_features(text='Malfunction of the Shovel machine in the OBD removal')

{'conducted': False,
 'critical': False,
 'overburden': False,
 'supply': False,
 'conflict': False,
 'causing': False,
 'material': False,
 'due': False,
 'unforeseen': False,
 'inspection': False,
 'shift': False,
 'without': False,
 'incorrectly': False,
 'operator': False,
 'equipment': False,
 'faulty': False,
 'workplace': False,
 'malfunction': True,
 'machinery': False,
 'inadequate': False,
 'hazard': False,
 'handover': False,
 'absence': False,
 'stock': False,
 'removal': True,
 'late': False,
 'error': False,
 'conveyor': False,
 'completed': False,
 'delivery': False,
 'routine': False,
 'employee': False,
 'essential': False,
 'mistake': False,
 'shortage': False,
 'delayed': False,
 'workload': False,
 'resource': False,
 'machine': True,
 'power': False,
 'performance': False,
 'available': False,
 'gear': False,
 'overtime': False,
 'emergency': False,
 'drill': False,
 'unexpected': False,
 'parameter': False,
 'increase': False,
 ';': False,
 'spare': False,
 'task'

In [46]:
sentiment_analyzer = SentimentIntensityAnalyzer()
def get_sentiment(text):
    scores = sentiment_analyzer.polarity_scores(text)
    sentiment = scores['pos']
    return sentiment

In [11]:
def identify_issue(issue):
    features = extract_features(issue)
    return classifier.classify(features)

In [12]:
training_features = [(extract_features(text), label) for text, label in training_data]
classifier = NaiveBayesClassifier.train(training_features)

In [13]:
new_issues = [data[7] for worker_id, data in shift_log_data.items()]

In [15]:
for issue in new_issues:
    classification = identify_issue(issue)
    print(f"Issue: '{issue}' classified as: {classification}.")

Issue: 'No issues' classified as: Equipment Issue.
Issue: 'Minor delay in shovel operation' classified as: Delay.
Issue: 'Drill malfunction' classified as: Equipment Issue.
Issue: 'Blasting equipment issue; Delay in OBD removal' classified as: Equipment Issue.
Issue: 'No issues' classified as: Equipment Issue.
Issue: 'All tasks completed' classified as: No Issue.
