### 1. Bag of words implementation

#### 1.1. Import Packages

In [3]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import collections
from nltk.stem import PorterStemmer,LancasterStemmer

#### 1.2. Prepare data

In [5]:
# Load the raw operation log from the Excel export.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
log = pd.read_excel("D:\\next_word_prediction\\TPU_Operation_Log.xlsx")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14417 entries, 0 to 14416
Data columns (total 9 columns):
Description    14415 non-null object
Log Time       14362 non-null object
Type           14362 non-null object
Priority       14362 non-null object
Status         14362 non-null object
Area           14038 non-null object
Site           14214 non-null object
Unit           5722 non-null object
TCV_LP_SP      0 non-null float64
dtypes: float64(1), object(8)
memory usage: 1013.8+ KB
None


In [6]:
# Preview the first ten rows of the loaded log.
log.head(10)

Unnamed: 0,Description,Log Time,Type,Priority,Status,Area,Site,Unit,TCV_LP_SP
0,xcxcxcxvxcvcx,"October 20, 2017 5:00 PM",Operations,Low,Open,,,,
1,adfsdsafdsafdsaf,"September 27, 2017 4:35 PM",Operations,Low,Open,,,,
2,this was changed][][] by dawkldmawmkoal;m|||,"September 08, 2017 12:00 AM",Operations,Low,Open,,,,
3,test,"September 08, 2017 11:32 AM",Operations,Low,Open,,,,
4,Critical,"August 22, 2017 4:28 PM",Oil Analysis Ferrorgraphy,Low,Open,CFB,Cogen,CFB1,
5,Tes Log,"July 05, 2017 11:41 AM",Operations,Low,Open,BUB,Cogen,BUB1,
6,Oil analysis results are Normal. entered forxx...,"January 10, 2017 10:39 AM",Oil Analysis Ferrorgraphy,Low,Open,CFB,Cogen,CFB1,
7,awdfas,"March 21, 2017 10:29 AM",Operations,Low,Open,SATTEST,Cogen,Test&Unit,
8,bobbin test bub status log,"March 16, 2017 2:02 PM",BUB Status Log,Low,Open,,Cogen,,
9,test log,"March 13, 2017 1:41 PM",BUB Status Log,Low,Open,,Cogen,,


In [7]:
# List the column labels of the loaded log.
print(log.columns)

Index(['Description', 'Log Time', 'Type', 'Priority', 'Status', 'Area', 'Site',
       'Unit', 'TCV_LP_SP'],
      dtype='object')


In [8]:
# Tally how many rows carry each 'Type' value.
print(log['Type'].value_counts())

Operations                            12839
Process Steam and Water Status Log      725
BUB Status Log                          148
Backup Valve Status Log                 145
STG Status Log                          145
CFB Status Log                          145
Summary Log                             144
General                                  43
Safety                                   22
Oil Analysis Ferrorgraphy                 3
Oil Analyis Spectrography                 3
Name: Type, dtype: int64


In [9]:
# Tally rows per 'Priority' value and then per 'Status' value, keeping the
# same blank-line gap between the two tables.
for position, column in enumerate(['Priority', 'Status']):
    if position > 0:
        print('\n')
    print(log[column].value_counts())

Low     14353
High        9
Name: Priority, dtype: int64


Information    7270
Closed         6313
Open            779
Name: Status, dtype: int64


In [None]:
# Tally how many rows carry each 'Area' value.
print(log['Area'].value_counts())

In [10]:
# Keep only the rows that actually have a description; the original row
# labels are preserved, so later label-based lookups still work.
data = log['Description'].dropna()

#### 1.3. Process log descriptions

Step 1: Extract words from each line (tokenize). Remove stop words and punctuation, then stem the remaining words.

Step 2: Loop through each line, tokenising and adding to vocabulary list.

Step 3: Count the total items and unique items in the list.

In [11]:
# Tokens are kept only when they start with a lower-case ASCII letter.
_TOKEN_START = re.compile('^[a-z]+')
# Filled on first use so the stop-word corpus is read once, not once per sentence.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOP_WORD_CACHE['english']


def extract_words(sentence):
    """Tokenize a sentence and return its cleaned, stemmed words.

    The sentence is split with ``word_tokenize``; every token is lower-cased
    and then dropped if it is an English stop word or does not start with a
    letter a-z (which removes punctuation and numbers). The surviving tokens
    are stemmed with ``PorterStemmer``.

    Parameters
    ----------
    sentence : str
        Text to process.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates kept).
    """
    stop_words = _english_stop_words()
    # Lower-case BEFORE filtering: the stop-word list and the regex are both
    # lower-case, so testing the raw token silently discarded every
    # capitalised word (e.g. "Pump", "Critical", "CFB1").
    lowered = [token.lower() for token in word_tokenize(sentence)]
    words_cleaned = [
        w for w in lowered
        if w not in stop_words and _TOKEN_START.match(w)
    ]
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in words_cleaned]

def get_vocab(sentences):
    """Collect the processed words of every sentence into one sorted list.

    Each sentence is passed through ``extract_words``; the resulting words
    are pooled (duplicates kept) and returned in ascending order.
    """
    pooled = [token for line in sentences for token in extract_words(line)]
    pooled.sort()
    return pooled

In [None]:
# all words (duplicates kept; get_vocab already returns them sorted)
vocab = get_vocab(data)
# unique words -- a set is unordered, so sorting before building it was wasted work
vocab_set = set(vocab)
# Build the message with format() so the numbers are not surrounded by doubled spaces.
print('There are {} words in the vocab with {} unique words in the set.'.format(len(vocab), len(vocab_set)))

In [None]:
# Frequency distribution over every word collected in `vocab`.
distr = nltk.FreqDist(vocab)
# The 25 most frequent entries, as (word, count) pairs.
top_25 = distr.most_common(25)
print(top_25)
# Count recorded for the single entry 'pump'.
print(distr['pump'])

### POS Tagging

In [15]:
# Take the description stored under row label 10 and run it through the
# same cleaning pipeline used for the vocabulary.
sample_description = data.loc[10]
tokenized = extract_words(sample_description)
# NOTE(review): the tagger receives lower-cased, stop-word-stripped stems
# rather than the original sentence, so the tags may be unreliable -- confirm
# this is intended.
tagged = nltk.pos_tag(tokenized)

In [16]:
# Only the last expression of a cell is rendered, so the bare `tokenized`
# line that used to precede this one never produced any output and was
# removed. Use display(tokenized, tagged) if both should be shown.
tagged

[('pump', 'JJ'),
 ('start', 'NN'),
 ('per', 'IN'),
 ('stop', 'NN'),
 ('stop', 'NN')]

### Bag of words

In [None]:
def bag_of_words(sentence, words):
    """Build a bag-of-words count vector for one sentence.

    Parameters
    ----------
    sentence : str
        Text to vectorise; it is processed with ``extract_words``.
    words : sequence of str
        Vocabulary defining the vector layout. Position ``i`` of the result
        corresponds to ``words[i]``; if a word appears more than once in
        ``words``, each of its positions receives the same count.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(words)`` holding how many times each
        vocabulary word occurs among the sentence's processed words.
    """
    # Count each processed word once, then look every vocabulary entry up
    # directly instead of comparing every word against the whole vocabulary.
    token_counts = collections.Counter(extract_words(sentence))
    # dtype=float keeps the float64 result the zero-initialised vector used to give.
    return np.array([token_counts[word] for word in words], dtype=float)

In [None]:
# sklearn implementation
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words='english', max_features = 5000) 
# train_data_features = vectorizer.fit_transform(data[5:20])
# vectorizer.transform(["mm review test critical pump stop oil log hot"]).toarray()