In [1]:
# #Important package for NLP
# ! pip install nltk
# ! pip install -U spacy
# ! pip install textblob
# ! pip install scikit-learn

# # for data manupulation, cleansing and plotting
# ! pip install pandas
# ! pip install matplotlib
# ! pip install numpy
# ! pip install plotly
# ! pip install dask


# ! pip list  #check installed packages
# ! pip freeze > requirements.txt #save installed packages to requirements.txt

In [14]:
import re

import pandas as pd
import numpy as np

import nltk #natural language toolkit

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score


import matplotlib.pyplot as plt


# !python -m spacy download en


import spacy
from nltk.stem import PorterStemmer

# nlp = spacy.load('en_core_web_sm')

In [3]:
# Reading the training and testing dataset
df_data = pd.read_csv('./data/train.csv')
# df_test_orginal = pd.read_csv('./data/test.csv')

# df_train= df_train_orginal.sample(n=1000) #sampling  the data to make it faster
# df_test= df_test_orginal.sample(n=1000)#sampling only 1000 rows randomly

# df_data= df_train_orginal.sample(n=10000) #sampling 10000 from 3 lakh data  the data to make it faster
df_data.head()


Unnamed: 0,id,abstract,category,category_num
0,271675,Bacteria are often exposed to multiple stimu...,q-bio-QM,138
1,412276,Accurate knowledge of the thermodynamic prop...,hep-ph-,68
2,256956,The largest X9.3 solar flare in solar cycle ...,astro-ph-SR,7
3,427612,We say that a random integer variable $X$ is...,math-PR,93
4,113852,We derive a formula expressing the joint dis...,math-CO,76


In [4]:
df_data.dtypes
# df_data.memory_usage(deep=True)

id               int64
abstract        object
category        object
category_num     int64
dtype: object

In [5]:
# convert "category " column to Categorial data to save memory space as they are repeated and in fixed no.


df_data['category'] = df_data['category'].astype('category')

# convert "id " and "category_num" column to int32 to save memory space as they are repeaded and in fixed no.
df_data[['id','category_num']] = df_data[['id','category_num']].astype('int32')

df_data.memory_usage(deep=True)

# hence sucessfully reduced memory size of "category" column from 25487734 bytes to  795575 bytes and id and category_num column from 3.4 mb to 1.5 mb

Index                 128
id                1562412
abstract        429522799
category           795575
category_num      1562412
dtype: int64

In [9]:
#subset data from category_num 0-5 only ,from train  for  training and testing

df_data_subset5_dirty = df_data[df_data['category_num']<=5]


#save to csv
df_data_subset5_dirty.to_csv('./test/dirty5.csv',index=False)

df_data_subset5_dirty.head()

len(df_data_subset5_dirty)



23027

In [7]:
#Convert to lower case, remove punctuation and special characters, newline, and numbers, digits, and stopwords
#Apply stemmer to text

import nltk

# nltk.download('stopwords')  # uncomment if stopwords is not found
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer
#defining the object for stemming
porter_stemmer = PorterStemmer()

df_data = df_data.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
  
    # text = text.lower() # lowercase text
    # #replace line break with space 
    # text = re.sub(r'\n+', ' ', text) # replace newline by space
    # text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    # text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    # # text = text.replace(' x ', ' ')
    # # text = re.sub(r'\W+', '', text) # remove non-word characters from text
    # text=re.sub(r'\d+', ' ', text) # remove digits from text
    # #Steam text
    
    # text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    # text= [porter_stemmer.stem(word) for word in text ] # stem text
    # text = ' '.join(text) # join list of words
    
    text = re.sub('[^a-z\s]', ' ', text.lower())
    text = [i.lower() for i in text.split() if i not in STOPWORDS]
    text = [porter_stemmer.stem(i) for i in text]
    text = ' '.join(text)
    text.replace('\n',' ')

    return text


In [10]:
#import dirty train data to pd
df_data_subset5_dirty = pd.read_csv('./test/dirty5.csv')

#apply cleaning function to each row of data
df_data_subset5_dirty['abstract'] = df_data_subset5_dirty['abstract'].apply(clean_text)
df_data_subset5_dirty.head()

# #save to csv
# df_data_subset5_dirty.to_csv('./test/clean5.csv',index=False)
# df_data_subset5_dirty.head()


Unnamed: 0,id,abstract,category,category_num
0,274789,main purpos paper obtain analyt solut radi tra...,astro-ph-HE,5
1,379433,search astrophys point like neutrino sourc use...,astro-ph-HE,5
2,253348,studi vertic distribut earliest stage star for...,astro-ph-GA,4
3,486373,binari evolv due dynam scatter star dens envir...,astro-ph-GA,4
4,280592,propos sever anomali observ larg angular scale...,astro-ph-CO,2


In [11]:
#clean Train data

df_data['abstract'] = df_data['abstract'].apply(clean_text)

#copy dataframe to another dataframe
df_data_clean = df_data.copy()

#save to csv
df_data_clean.to_csv('./data/train_clean.csv', index=False)

df_data_clean.head()


Unnamed: 0,id,abstract,category,category_num
0,271675,bacteria often expos multipl stimuli complex e...,q-bio-QM,138
1,412276,accur knowledg thermodynam properti zero tempe...,hep-ph-,68
2,256956,largest x solar flare solar cycl preced x flar...,astro-ph-SR,7
3,427612,say random integ variabl x monoton modulu char...,math-PR,93
4,113852,deriv formula express joint distribut cyclic v...,math-CO,76


In [12]:
# clean test data
df_test = pd.read_csv('./data/test.csv')
df_test['abstract'] = df_test['abstract'].apply(clean_text)

#save to csv
df_test.to_csv('./data/test_clean.csv', index=False)
df_test.head()


Unnamed: 0,id,abstract
0,430065,depth map obtain commerci depth sensor alway l...
1,75226,lambda express introduc java program languag p...
2,301990,propos demonstr gamma gamma collid w gg gev ad...
3,301001,physic lab student experi wide rang equit ineq...
4,280179,exist local minima one hidden layer relu netwo...


In [13]:
#clean validation data
df_validation = pd.read_csv('./data/validation.csv')
df_validation['abstract'] = df_validation['abstract'].apply(clean_text)
df_validation['abstract'] = df_validation['abstract'].str.replace('\d+', '')

#save to csv
df_validation.to_csv('./data/validation_clean.csv', index=False)
df_validation.head()

Unnamed: 0,id,abstract,category,category_num
0,27,semiclass method explain mani mesoscop effect ...,cond-mat-mes-hall,9
1,36,let motiv introduc notion dualiti main result ...,math-NT,90
2,71,construct state invari action gener squeez ope...,quant-ph-,150
3,82,say lie super algebra symmetr everi root respe...,math-RT,96
4,84,paper defin lagrangian scalar gaug field causa...,physics-gen-ph,120


In [24]:
# convert validation_subset.csv to dataframe 
df_validatation_subset = pd.read_csv('./test/validation_subset.csv')
df_validatation_subset.head()

Unnamed: 0,id,abstract,category,category_num
0,27,semiclass method explain mani mesoscop effect ...,cond-mat-mes-hall,9
1,494,studi rotat spectrum nacn x recent extend freq...,astro-ph-GA,4
2,567,present ghz sub arcminut resolut imag sunyaev ...,astro-ph-CO,2
3,726,na xcoo fascin complex magnet phase diagram ma...,cond-mat-mtrl-sci,10
4,946,report care finit size scale studi metal insul...,cond-mat-mes-hall,9


In [31]:
# #convert validation subset to dataframe
# df_validation_subset = pd.read_csv('./test/validation_subset.csv')
# df_validation_subset.head()

# #count number of catetogy_num in validation_subset.csv
 
# df_validatation_subset.category_num.value_counts()

10    1077
9      996
4      994
5      841
7      718
2      537
3      500
6      359
8      128
Name: category_num, dtype: int64

In [32]:
# #convert solution to csv
# df_test_solution = pd.read_csv('./test/solution.csv')
# df_test_solution.category_num.value_counts()



9     1142
4     1064
5      928
10     740
7      708
2      548
3      395
6      385
8      236
0        4
Name: category_num, dtype: int64

In [12]:
#pandas
# # The maximum number of words to be used. (most frequent)
MAX_NB_WORDS =50000
# Max number of words in each abstract
MAX_SEQUENCE_LENGTH = 300
# This is fixed.
EMBEDDING_DIM = 100

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) 
tokenizer.fit_on_texts(df_data['abstract'].values) # fit the tokenizer on the abstracts
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 622023 unique tokens.


In [13]:
#dask
# # The maximum number of words to be used. (most frequent)
MAX_NB_WORDS =50000
# Max number of words in each abstract
MAX_SEQUENCE_LENGTH = 300
# This is fixed.
EMBEDDING_DIM = 100

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) 
tokenizer.fit_on_texts(df_dask_data.abstract) # fit the tokenizer on the abstracts
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

NameError: name 'df_dask_data' is not defined

In [None]:
from keras.preprocessing.sequence import pad_sequences

X = tokenizer.texts_to_sequences(df_data['abstract'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)


 


Shape of data tensor: (390603, 300)


In [None]:
#dask
from keras.preprocessing.sequence import pad_sequences



X = tokenizer.texts_to_sequences(df_dask_data.abstract)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (390603, 300)


In [None]:
#pandas

Y = pd.get_dummies(df_data['category']).values
print('Shape of label tensor:', Y.shape)


Shape of label tensor: (390603, 156)


In [None]:
#dask

Y = pd.get_dummies(df_dask_data.category)
print('Shape of label tensor:', Y.shape)


Shape of label tensor: (390603, 156)


In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(351542, 300) (351542, 156)
(39061, 300) (39061, 156)


In [None]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [None]:
# # save trained model to disk
# model.save('./Test_model/model1.h5')

# # # import load model module 

from keras.models import *
new_model = load_model('./Saved_model\modelcolabbest.h5')

In [None]:
# accr = new_model.evaluate(X_test,Y_test)
# print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
#dask
# #csv to dask dataframe 
# df_dask_test=dd.read_csv('./Data/test.csv')

#pandas df from csv 
df_test=pd.read_csv('./data/test.csv')

#dask df from pandas df
df_dask_test=dd.from_pandas(df_test,npartitions=5)


In [None]:
df_dask_test.head()

Unnamed: 0,id,abstract
0,430065,Depth maps obtained by commercial depth sens...
1,75226,When lambda expressions were introduced to t...
2,301990,We propose and demonstrate that a gamma-gamm...
3,301001,"In physics labs, students experience a wide ..."
4,280179,The existence of local minima for one-hidden...


In [None]:
for index, row in df_dask_test.iterrows():
    new_abstract = [row['abstract']]
    seq = tokenizer.texts_to_sequences(new_abstract)
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
    pred = new_model.predict(padded)
    print ('Currently processing Index no.:  ', index, 'and prediction is: ', pred.argmax())
    

Currently processing Index no.:   0 and prediction is:  62
Currently processing Index no.:   1 and prediction is:  7
Currently processing Index no.:   2 and prediction is:  123
Currently processing Index no.:   3 and prediction is:  40
Currently processing Index no.:   4 and prediction is:  113
Currently processing Index no.:   5 and prediction is:  110
Currently processing Index no.:   6 and prediction is:  110
Currently processing Index no.:   7 and prediction is:  110
Currently processing Index no.:   8 and prediction is:  110
Currently processing Index no.:   9 and prediction is:  8
Currently processing Index no.:   10 and prediction is:  28
Currently processing Index no.:   11 and prediction is:  110
Currently processing Index no.:   12 and prediction is:  45
Currently processing Index no.:   13 and prediction is:  24
Currently processing Index no.:   14 and prediction is:  5
Currently processing Index no.:   15 and prediction is:  24
Currently processing Index no.:   16 and predi

KeyboardInterrupt: 