In [89]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd, xgboost, numpy, textblob, string

from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from gensim.models import Word2Vec, KeyedVectors

from nltk.corpus import stopwords

In [3]:
def load_data(path):
    return pd.read_csv(path)

In [12]:
def load_excel(path, sheet_name):
    return pd.read_excel(path, sheet_name = sheet_name)

In [91]:
data = load_excel('C:\\Users\\akash\\Desktop\\TrainingData.xlsx', 'Raw Data')

In [90]:
def clean_classes(industry_class):
    return industry_class.split(';')[0].split(' (')[0]

def remove_stopwords(industry_des):
    stop = stopwords.words('english')
    return ' '.join([word for word in industry_des.split() if word not in (stop)])

In [93]:
data['Industry Classifications'] = data['Industry Classifications'].apply(clean_classes)
data['Business Description'] = data['Business Description'].apply(remove_stopwords)

In [95]:
mod_data = data.loc[data['Industry Classifications'].isin(['Banks', 'Healthcare', 
'Biotechnology', 'Energy', 'Consumer Discretionary', 'Information Technology', 
'Capital Goods', 'Commercial and Professional Services', 'Application Software',
'Communications Equipment', 'Asset Management and Custody Banks', 
'Consumer Staples', 'Chemicals', 'Application Hosting Services',
'Aerospace and Defense', 'Electronic Equipment and Instruments', 'Advertising',
'Health Care Technology', 'Auto Components', 
'Data Processing and Outsourced Services'])]

In [96]:
del mod_data['Company Name']
del mod_data['Exchange:Ticker']
del mod_data['Company Type']
del mod_data['Company Status']
del mod_data['Geographic Locations']
del mod_data['Security Tickers']

In [105]:
# split the dataset
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
                        mod_data['Business Description'], 
                        mod_data['Industry Classifications'])

# label encode the target variables 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [106]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(mod_data['Business Description'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [107]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(mod_data['Business Description'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(mod_data['Business Description'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(mod_data['Business Description'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [83]:
# load the pre-trained word-embedding vectors 
model = KeyedVectors.load_word2vec_format('C:\\Users\\akash\\Downloads\\GoogleNews-vectors-negative300.bin', binary=True)


In [172]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(mod_data['Business Description'])
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=100)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=100)

In [116]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    if word in model.vocab:
        embedding_matrix[i] = embedding_vector

In [119]:
pd.options.mode.chained_assignment = None

mod_data['char_count'] = mod_data['Business Description'].apply(len)
mod_data['word_count'] = mod_data['Business Description'].apply(lambda x: len(x.split()))
mod_data['word_density'] = mod_data['char_count'] / (mod_data['word_count']+1)
mod_data['punctuation_count'] = mod_data['Business Description'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
mod_data['title_word_count'] = mod_data['Business Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
mod_data['upper_case_word_count'] = mod_data['Business Description'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [122]:
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

mod_data['noun_count'] = mod_data['Business Description'].apply(lambda x: check_pos_tag(x, 'noun'))
mod_data['verb_count'] = mod_data['Business Description'].apply(lambda x: check_pos_tag(x, 'verb'))
mod_data['adj_count'] = mod_data['Business Description'].apply(lambda x: check_pos_tag(x, 'adj'))
mod_data['adv_count'] = mod_data['Business Description'].apply(lambda x: check_pos_tag(x, 'adv'))
mod_data['pron_count'] = mod_data['Business Description'].apply(lambda x: check_pos_tag(x, 'pron'))

In [123]:
# Function to train models other than NNs
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [125]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.64367816092
NB, WordLevel TF-IDF:  0.520114942529
NB, N-Gram Vectors:  0.474137931034
NB, CharLevel Vectors:  0.405172413793


In [127]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print( "LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print( "LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print( "LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.672413793103
LR, WordLevel TF-IDF:  0.640804597701
LR, N-Gram Vectors:  0.502873563218
LR, CharLevel Vectors:  0.632183908046


In [227]:
# SVM on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(kernel='linear'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.534482758621


In [129]:
# Random Forests on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print( "RF, Count Vectors: ", accuracy)

# Random Forests on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print( "RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.508620689655
RF, WordLevel TF-IDF:  0.540229885057


In [130]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print( "Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print( "Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print( "Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.603448275862
Xgb, WordLevel TF-IDF:  0.603448275862
Xgb, CharLevel Vectors:  0.612068965517


In [158]:
# ANNs
import os, numpy as np
glove_dir = 'C:\\Users\\akash\\Desktop\\glove.6B\\'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [182]:
embedding_dim = 100
max_words = 20000
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [180]:
# ANN 
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
maxlen = 100
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(20, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          10000000  
_________________________________________________________________
flatten_5 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_43 (Dense)             (None, 64)                640064    
_________________________________________________________________
dense_44 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_45 (Dense)             (None, 20)                1300      
Total params: 10,645,524
Trainable params: 10,645,524
Non-trainable params: 0
_________________________________________________________________


In [184]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy',
                            metrics=['accuracy'])

# Switching the train and valid sets for training.
history = model.fit(valid_seq_x, valid_y,
                    epochs=10,
                    batch_size=8,
                    validation_data=(train_seq_x, train_y))
# Not working because of windows.
# model.save_weights('ANN_model.h5')

Train on 348 samples, validate on 1043 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [233]:
# LSTM
from keras.layers import LSTM, SimpleRNN
model = Sequential()
model.add(Embedding(20000, 8))
# model.add(LSTM(64, return_sequences=True))
# model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(20, activation='softmax'))
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_35 (Embedding)     (None, None, 8)           160000    
_________________________________________________________________
lstm_49 (LSTM)               (None, None, 64)          18688     
_________________________________________________________________
lstm_50 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_70 (Dense)             (None, 20)                1300      
Total params: 213,012
Trainable params: 213,012
Non-trainable params: 0
_________________________________________________________________


In [234]:
model.compile(optimizer='rmsprop',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
# Switching the train and valid sets for training.
history = model.fit(valid_seq_x, valid_y,
                    epochs=10,
                    batch_size=8,
                    validation_data=(train_seq_x, train_y))

Train on 348 samples, validate on 1043 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
