In [1]:
# Import libraries

import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [2]:
# Load the training dataset from CSV and preview its first rows.

train_df = pd.read_csv(
    'task1_trainset.csv',
    encoding='utf-8',
)
train_df.head()

Unnamed: 0,Id,Title,Abstract,Authors,Categories,Created Date,Task 1
0,D00001,A Brain-Inspired Trust Management Model to Ass...,Rapid popularity of Internet of Things (IoT) a...,Mahmud/Kaiser/Rahman/Rahman/Shabut/Al-Mamun/Hu...,cs.CR/cs.AI/q-bio.NC,2018-01-11,BACKGROUND OBJECTIVES METHODS METHODS RESULTS ...
1,D00002,On Efficient Computation of Shortest Dubins Pa...,"In this paper, we address the problem of compu...",Sadeghi/Smith,cs.SY/cs.RO/math.OC,2016-09-21,OBJECTIVES OTHERS METHODS/RESULTS RESULTS RESULTS
2,D00003,Data-driven Upsampling of Point Clouds,High quality upsampling of sparse 3D point clo...,Zhang/Jiang/Yang/Yamakawa/Shimada/Kara,cs.CV,2018-07-07,BACKGROUND OBJECTIVES METHODS METHODS METHODS ...
3,D00004,Accessibility or Usability of InteractSE? A He...,Internet is the main source of information now...,Aqle/Khowaja/Al-Thani,cs.HC,2018-08-29,BACKGROUND BACKGROUND BACKGROUND OBJECTIVES OB...
4,D00005,Spatio-Temporal Facial Expression Recognition ...,Automated Facial Expression Recognition (FER) ...,Hasani/Mahoor,cs.CV,2017-03-20,BACKGROUND BACKGROUND BACKGROUND BACKGROUND ME...


In [3]:
# Turn the 'Task 1' column into one list of labels per sentence.
# Each row holds whitespace-separated per-sentence entries; an entry with
# several labels joins them with '/', hence the second split.

label = train_df['Task 1'].tolist()
label[:10]

a = [
    sentence_labels.split('/')
    for abstract_labels in label
    for sentence_labels in abstract_labels.split()
]
a[:10]

[['BACKGROUND'],
 ['OBJECTIVES'],
 ['METHODS'],
 ['METHODS'],
 ['RESULTS'],
 ['CONCLUSIONS'],
 ['OBJECTIVES'],
 ['OTHERS'],
 ['METHODS', 'RESULTS'],
 ['RESULTS']]

In [4]:
# Flatten every abstract into its individual sentences.
# Sentences inside an abstract are delimited by the literal marker '$$$'.

sent = train_df['Abstract'].tolist()
b = [
    sentence
    for abstract in sent
    for sentence in abstract.split('$$$')
]
b[:10]

['Rapid popularity of Internet of Things (IoT) and cloud computing permits neuroscientists to collect multilevel and multichannel brain data to better understand brain functions, diagnose diseases, and devise treatments.',
 'To ensure secure and reliable data communication between end-to-end (E2E) devices supported by current IoT and cloud infrastructure, trust management is needed at the IoT and user ends.',
 'This paper introduces a Neuro-Fuzzy based Brain-inspired trust management model (TMM) to secure IoT devices and relay nodes, and to ensure data reliability.',
 'The proposed TMM utilizes node behavioral trust and data trust estimated using Adaptive Neuro-Fuzzy Inference System and weighted-additive methods respectively to assess the nodes trustworthiness.',
 'In contrast to the existing fuzzy based TMMs, the NS2 simulation results confirm the robustness and accuracy of the proposed TMM in identifying malicious nodes in the communication network.',
 'With the growing usage of clo

In [5]:
# Pair each sentence with its label list in a single dataframe
# (columns: 'sentence', 'label').

sentence_df = pd.DataFrame(dict(sentence=b, label=a))
sentence_df.head(10)

Unnamed: 0,sentence,label
0,Rapid popularity of Internet of Things (IoT) a...,[BACKGROUND]
1,To ensure secure and reliable data communicati...,[OBJECTIVES]
2,This paper introduces a Neuro-Fuzzy based Brai...,[METHODS]
3,The proposed TMM utilizes node behavioral trus...,[METHODS]
4,"In contrast to the existing fuzzy based TMMs, ...",[RESULTS]
5,With the growing usage of cloud based IoT fram...,[CONCLUSIONS]
6,"In this paper, we address the problem of compu...",[OBJECTIVES]
7,Given initial and final configurations of the ...,[OTHERS]
8,We provide a novel geometrical analysis of the...,"[METHODS, RESULTS]"
9,We then show how our method can be used to qui...,[RESULTS]


In [6]:
# Simple sentence cleaning using a regular expression: keep only runs of word
# characters (\w+) and re-join them with single spaces, which drops punctuation.

# Build the tokenizer once; the original constructed a new one for every row.
word_tokenizer = RegexpTokenizer(r'\w+')

sentence_df['sentence'] = sentence_df['sentence'].apply(
    lambda text: ' '.join(word_tokenizer.tokenize(text))
)
sentence_df.head()

Unnamed: 0,sentence,label
0,Rapid popularity of Internet of Things IoT and...,[BACKGROUND]
1,To ensure secure and reliable data communicati...,[OBJECTIVES]
2,This paper introduces a Neuro Fuzzy based Brai...,[METHODS]
3,The proposed TMM utilizes node behavioral trus...,[METHODS]
4,In contrast to the existing fuzzy based TMMs t...,[RESULTS]


In [7]:
# Separate data into training and validation sets. The labels are converted
# into a binary indicator matrix using a multilabel binarizer.

mlb = MultiLabelBinarizer()

# Fixed seed so the split (and therefore the reported score) is reproducible
# across kernel restarts.
train, test = train_test_split(sentence_df, random_state=42)

x_train = train['sentence']
x_test = test['sentence']

# Fit the label encoding on the training labels only, then reuse that same
# fitted encoding for the validation labels. Re-fitting on the validation
# split could produce a different set of label columns, which would misalign
# y_test with the classifier's predictions.
y_train = mlb.fit_transform(train['label'].tolist())
y_test = mlb.transform(test['label'].tolist())

In [8]:
# Generate TF-IDF features from the training sentences, then classify them
# with a Decision Tree.
# NOTE: despite the "NB_" prefix, this pipeline uses a decision tree; the
# variable name is kept unchanged in case other cells refer to it.

NB_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed seed so that repeated runs build the same tree and report the same score.
    ('dmt', DecisionTreeClassifier(random_state=42)),
])

NB_pipeline.fit(x_train, y_train)
predicted = NB_pipeline.predict(x_test)

# Micro F1 score, based on the evaluation standard for the AI cup.
print(f1_score(y_test, predicted, average='micro'))

0.4505000943574259


# Further ideas for development:
- Look into different multilabel classifiers:
    - Decision Tree (V)
    - Random Forest
    - OneVsRest wrapped around other single-label classifiers
- Classify using different text features:
    - TF-IDF
    - Document-term matrix (CountVectorizer)
    - Sentence position
    - Bigrams or trigrams
- Improve sentence cleaning:
    - Use a different method for tokenization
- Consider other features available in the dataset:
    - Author name (look for writing style)
    - Paper categories