In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('df.csv')

In [3]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df.drop('text',axis=1)
xtrain, xtest, ytrain, ytest = train_test_split(X,y, random_state=42, test_size=0.2)

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform(ytrain[['Industry1','Industry2']].values)
test_labels = mlb.fit_transform(ytest[['Industry1','Industry2']].values)

In [5]:
mlb.classes_


array(['Agriculture', 'Consumer Products', 'Energy', 'Finance',
       'Health Care', 'Manufacturing', 'Media', 'Pharmaceuticals',
       'Public and Social sector', 'Telecom', 'Transport & Logistics',
       'automative'], dtype=object)

In [6]:
from wordcloud import STOPWORDS
english_stops = set(STOPWORDS)

import nltk
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [8]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize


wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def tokenize_lemma_stopwords(text):
    text = text.replace("\n", " ")
    tokens = nltk.tokenize.word_tokenize(text.lower()) # split string into words (tokens)
    tokens = [t for t in tokens if t.isalpha()] # keep strings with only alphabets
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # put words into base form
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [t for t in tokens if len(t) > 2] # remove short words, they're probably not useful
    tokens = [t for t in tokens if t not in english_stops] # remove stopwords
    cleanedText = " ".join(tokens)
    return cleanedText

def dataCleaning(df):
    data = df.copy()
    data = data.apply(tokenize_lemma_stopwords)
    return data

In [9]:
cleanedTrainData = dataCleaning(xtrain)
cleanedTestData = dataCleaning(xtest)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

vectorizer = TfidfVectorizer()
vectorised_train_documents = vectorizer.fit_transform(cleanedTrainData)
vectorised_test_documents = vectorizer.transform(cleanedTestData)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, hamming_loss

ModelsPerformance = {}

def metricsReport(modelName, test_labels, predictions):
    accuracy = accuracy_score(test_labels, predictions)

    macro_precision = precision_score(test_labels, predictions, average='macro')
    macro_recall = recall_score(test_labels, predictions, average='macro')
    macro_f1 = f1_score(test_labels, predictions, average='macro')

    micro_precision = precision_score(test_labels, predictions, average='micro')
    micro_recall = recall_score(test_labels, predictions, average='micro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')
    hamLoss = hamming_loss(test_labels, predictions)
    print("------" + modelName + " Model Metrics-----")
    print("Accuracy: {:.4f}\nHamming Loss: {:.4f}\nPrecision:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nRecall:\n  - Macro: {:.4f}\n  - Micro: {:.4f}\nF1-measure:\n  - Macro: {:.4f}\n  - Micro: {:.4f}"\
          .format(accuracy, hamLoss, macro_precision, micro_precision, macro_recall, micro_recall, macro_f1, micro_f1))
    ModelsPerformance[modelName] = micro_f1

In [15]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier


svmClassifier = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
svmClassifier.fit(vectorised_train_documents, train_labels)

svmPreds = svmClassifier.predict(vectorised_test_documents)
metricsReport("SVC Sq. Hinge Loss", test_labels, svmPreds)

------SVC Sq. Hinge Loss Model Metrics-----
Accuracy: 0.3190
Hamming Loss: 0.0920
Precision:
  - Macro: 0.8128
  - Micro: 0.8120
Recall:
  - Macro: 0.4481
  - Micro: 0.5828
F1-measure:
  - Macro: 0.5575
  - Micro: 0.6786


In [16]:
from skmultilearn.problem_transform import LabelPowerset

powerSetSVC = LabelPowerset(LinearSVC())
powerSetSVC.fit(vectorised_train_documents, train_labels)

powerSetSVCPreds = powerSetSVC.predict(vectorised_test_documents)
metricsReport("Power Set SVC", test_labels, powerSetSVCPreds)

------Power Set SVC Model Metrics-----
Accuracy: 0.4724
Hamming Loss: 0.1048
Precision:
  - Macro: 0.7078
  - Micro: 0.6856
Recall:
  - Macro: 0.5795
  - Micro: 0.6856
F1-measure:
  - Macro: 0.6118
  - Micro: 0.6856
