# Libraries

In [16]:
import os
import re

from string import punctuation
from autocorrect import Speller

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['savefig.facecolor'] = "0.8"
plt.rcParams.update({'figure.figsize': (15, 5), 'figure.dpi': 120})
plt.style.use('fivethirtyeight')

import nltk
import gensim

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('punkt')
nltk.download('stopwords') 
stemmer = WordNetLemmatizer()

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

import os, re, operator, warnings
warnings.filterwarnings('ignore')
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adils\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adils\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Fetching

In [2]:
# define csv path
train_path = os.path.join("../data/train/Train.csv")
tags_path = os.path.join("../data/tags/Tags.csv")
test_path = os.path.join("../data/test/Test.csv")

In [3]:
# import data from csv files
df_train = pd.read_csv(train_path, low_memory=False)
df_tags = pd.read_csv(tags_path, low_memory=False)
df_test = pd.read_csv(test_path, low_memory=False)

# Data Scrapping

In [4]:
# visualize train/test datasets
df_train.head()

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2082,stars are self-gravitating fluids inside which...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8687,deep neural perception and control networks ar...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df_test.head()

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics
0,9409,fundamental frequency (f0) approximation from ...,0,0,0,1
1,17934,"this large-scale study, consisting of 24.5 mil...",1,0,0,1
2,16071,we present a stability analysis of the plane c...,0,0,1,0
3,16870,we construct finite time blow-up solutions to ...,0,1,0,0
4,10496,planetary nebulae (pne) constitute an importan...,0,0,1,0


In [14]:
# dataset summary
df_train.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14004 entries, 0 to 14003
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype 
---  ------                                        --------------  ----- 
 0   id                                            14004 non-null  int64 
 1   ABSTRACT                                      14004 non-null  object
 2   Computer Science                              14004 non-null  int64 
 3   Mathematics                                   14004 non-null  int64 
 4   Physics                                       14004 non-null  int64 
 5   Statistics                                    14004 non-null  int64 
 6   Analysis of PDEs                              14004 non-null  int64 
 7   Applications                                  14004 non-null  int64 
 8   Artificial Intelligence                       14004 non-null  int64 
 9   Astrophysics of Galaxies                      14004 non-null  int64 
 10

# Data Wrangling

In [5]:
# define features
TAGS_COLS = ['Analysis of PDEs', 'Applications',
               'Artificial Intelligence', 'Astrophysics of Galaxies',
               'Computation and Language', 'Computer Vision and Pattern Recognition',
               'Cosmology and Nongalactic Astrophysics',
               'Data Structures and Algorithms', 'Differential Geometry',
               'Earth and Planetary Astrophysics', 'Fluid Dynamics',
               'Information Theory', 'Instrumentation and Methods for Astrophysics',
               'Machine Learning', 'Materials Science', 'Methodology', 'Number Theory',
               'Optimization and Control', 'Representation Theory', 'Robotics',
               'Social and Information Networks', 'Statistics Theory',
               'Strongly Correlated Electrons', 'Superconductivity',
               'Systems and Control']

TOPIC_COLS = ['Computer Science', 'Mathematics', 'Physics', 'Statistics']


# Model Building

## Supervised learning (LDA)

In [6]:
# drop tags features
train_spvd = df_train.drop(columns=TAGS_COLS, axis=1)

In [7]:
train_spvd

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics
0,1824,a ever-growing datasets inside observational a...,0,0,1,0
1,3094,we propose the framework considering optimal $...,1,0,0,0
2,8463,nanostructures with open shell transition meta...,0,0,1,0
3,2082,stars are self-gravitating fluids inside which...,0,0,1,0
4,8687,deep neural perception and control networks ar...,1,0,0,0
...,...,...,...,...,...,...
13999,8699,a methodology of automatic detection of a even...,1,0,0,0
14000,11912,we consider a case inside which the robot has ...,1,0,0,0
14001,4842,despite being usually considered two competing...,0,0,1,0
14002,12507,we present the framework and its implementatio...,1,0,0,0


### Data preprocessing

**General Outline of Text Preprocessing** (https://www.kdnuggets.com/2017/12/general-approach-preprocessing-text-data.html)

So how do we go about doing text preprocessing? Generally, there are 3 main components:

    -Tokenization
    -Normalization
    -Noise removal

In a nutshell, tokenization is about splitting strings of text into smaller pieces, or “tokens”. Paragraphs can be tokenized into sentences and sentences can be tokenized into words. Normalization aims to put all text on a level playing field, e.g., converting all characters to lowercase. Noise removal cleans up the text, e.g., remove extra whitespaces.

<div>
    <img src="https://drek4537l1klr.cloudfront.net/chollet2/Figures/11-01.png" width="600"/>
</div>

In [125]:
# define preprocessing function
def preprocess_articles(df, text_col):
    # define df copy for data treatement
    df_copy = df.copy()

    # drop rows with empty values
    df_copy.dropna(inplace=True)
    
    for word in df[text_col]:
        # case normalization (case_lower)
        df_copy['preprocessed_ABSTRACT'] = word.lower()

        # spell check
        # spell = Speller('en')
        # df_copy['preprocessed_ABSTRACT'] = [spell(w) for w in df_copy['preprocessed_ABSTRACT']]

        # remove extra white spaces
        df_copy['preprocessed_ABSTRACT'] = [re.sub("s+"," ", w) for w in df_copy['preprocessed_ABSTRACT']]

        # remove punctuations
        df_copy['preprocessed_ABSTRACT'] = [re.sub("[^-9A-Za-z ]", "" , w) for w in df_copy['preprocessed_ABSTRACT']]

        # remove numbers
        df_copy['preprocessed_ABSTRACT'] = ''.join(c for c in df_copy['preprocessed_ABSTRACT'] if not c.isdigit())

        # remove punctuation
        # df_copy['preprocessed_ABSTRACT'] = ''.join(c for c in df_copy['preprocessed_ABSTRACT'] if c not in punctuation)

        # stop words removal
        stopword = stopwords.words('english')
        df_copy['preprocessed_ABSTRACT'] = [w for w in df_copy['preprocessed_ABSTRACT'] if w not in stopword]
            
        # lemmatize words
        wordnet_lemmatizer = WordNetLemmatizer()
        df_copy['preprocessed_ABSTRACT'] = [wordnet_lemmatizer.lemmatize(w) for w in df_copy['preprocessed_ABSTRACT']]

        # stemming words
        # porter_stemmer = PorterStemmer()
        # df_copy['preprocessed_ABSTRACT'] = [porter_stemmer.stem(w) for w in df_copy['preprocessed_ABSTRACT']]
    
    return df_copy

In [24]:
# define preprocessing function
def clean_tweets(df, text_col):
    
    df_copy = df.copy()
    
    # drop rows with empty values
    df_copy.dropna(inplace=True)
    
    # lower text
    df_copy['preprocessed_' + text_col] = df_copy[text_col].str.lower()

    # remove extra white spaces
    df_copy['preprocessed_' + text_col] = df_copy['preprocessed_' + text_col].apply(lambda row: re.sub("s+"," ", row))

    # remove punctuations
    df_copy['preprocessed_ABSTRACT'] = df_copy['preprocessed_' + text_col].apply(lambda row: re.sub("[^-9A-Za-z ]", "" , row))
    
    # filter out stop word
    stop_words = set(stopwords.words('english'))        
    df_copy['preprocessed_' + text_col] = df_copy['preprocessed_' + text_col].apply(lambda row: ' '.join([word for word in row.split() if (not word in stop_words)]))
    
    # tokenize 
    tokenizer = RegexpTokenizer('[a-zA-Z]\w+\'?\w*')
    df_copy['tokenized_' + text_col] = df_copy['preprocessed_' + text_col].apply(lambda row: tokenizer.tokenize(row))
    
    return df_copy


In [25]:
clean_tweets(train_spvd, 'ABSTRACT')

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,preprocessed_ABSTRACT,tokenized_ABSTRACT
0,1824,a ever-growing datasets inside observational a...,0,0,1,0,ever-growing data et ide ob ervational tronomy...,"[ever, growing, data, et, ide, ob, ervational,..."
1,3094,we propose the framework considering optimal $...,1,0,0,0,propo e framework con idering optimal t-matchi...,"[propo, framework, con, idering, optimal, matc..."
2,8463,nanostructures with open shell transition meta...,0,0,1,0,nano tructure open hell tran ition metal molec...,"[nano, tructure, open, hell, tran, ition, meta..."
3,2082,stars are self-gravitating fluids inside which...,0,0,1,0,tar elf-gravitating fluid ide pre ure buoyancy...,"[tar, elf, gravitating, fluid, ide, pre, ure, ..."
4,8687,deep neural perception and control networks ar...,1,0,0,0,deep neural perception control network likely ...,"[deep, neural, perception, control, network, l..."
...,...,...,...,...,...,...,...,...
13999,8699,a methodology of automatic detection of a even...,1,0,0,0,methodology automatic detection event ba infor...,"[methodology, automatic, detection, event, ba,..."
14000,11912,we consider a case inside which the robot has ...,1,0,0,0,con ider ca e ide robot ha navigate ide unknow...,"[con, ider, ca, ide, robot, ha, navigate, ide,..."
14001,4842,despite being usually considered two competing...,0,0,1,0,de pite u ually con idered two competing pheno...,"[de, pite, ually, con, idered, two, competing,..."
14002,12507,we present the framework and its implementatio...,1,0,0,0,pre ent framework implementation relying natur...,"[pre, ent, framework, implementation, relying,..."
