# Import libraries and modules

In [1]:
import re # regular expressions for parsing
import pandas as pd # pandas for dataframes
import numpy as np # numpy for math
import nltk #natural language toolkit tokenization and stemming 
import matplotlib.pyplot as plt    # matplotlib for plotting
from nltk.corpus import stopwords # stopwords
from nltk.stem import SnowballStemmer # stemming


# Load the train and test data into pandas DataFrames

In [2]:
# Load the two CSV splits that live under ./data into DataFrames.
df_data = pd.read_csv(filepath_or_buffer='./data/train.csv')
df_test = pd.read_csv(filepath_or_buffer='./data/test.csv')

# Preview the first five rows of the training frame.
df_data.head(5)



Unnamed: 0,id,abstract,category,category_num
0,271675,Bacteria are often exposed to multiple stimu...,q-bio-QM,138
1,412276,Accurate knowledge of the thermodynamic prop...,hep-ph-,68
2,256956,The largest X9.3 solar flare in solar cycle ...,astro-ph-SR,7
3,427612,We say that a random integer variable $X$ is...,math-PR,93
4,113852,We derive a formula expressing the joint dis...,math-CO,76


## Memory Saving

In [3]:
# Per-column memory footprint in bytes (deep=True) before any dtype change.
print(
    'Memory usage by Data: ',
    df_data.memory_usage(deep=True),
    '\n \n Preety high Right ??   Dont worry we will Reduce it....\n',
)

# Store "category" with pandas' categorical dtype instead of plain strings.
df_data['category'] = df_data['category'].astype(pd.CategoricalDtype())

# Store "id" and "category_num" as 32-bit integers.
df_data[['id', 'category_num']] = df_data[['id', 'category_num']].astype(
    {'id': 'int32', 'category_num': 'int32'}
)

# Same measurement again, after the conversions above.
print(
    '\n Lets see Memory usage again: \n \n ',
    df_data.memory_usage(deep=True),
    ' \n  \n See!! Told you we gonna reduce it \n',
)

Memory usage by Data:  Index                 128
id                3124824
abstract        429522799
category         25487734
category_num      3124824
dtype: int64 
 
 Preety high Right ??   Dont worry we will Reduce it....


 Lets see Memory usage again: 
 
  Index                 128
id                1562412
abstract        429522799
category           795575
category_num      1562412
dtype: int64  
  
 See!! Told you we gonna reduce it 



# Cleaning

## Function to clean data

In [4]:
# Matches http:// or https:// links and bare www. links, up to the next whitespace.
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# NLTK's English stop-word list, held in a set.
STOPWORDS = set(stopwords.words('english'))

# Snowball stemmer configured for English.
snowball = SnowballStemmer('english')

# Function to clean text 
def clean_text(text):
    """Normalise one abstract into a space-separated string of stemmed tokens.

    Steps, in order:
      1. newlines become spaces and URLs (``url_pattern``) are removed;
      2. every word containing a digit is removed;
      3. the text is lower-cased and each character other than a-z or
         whitespace becomes a space;
      4. words found in ``STOPWORDS`` are removed;
      5. the remaining words are stemmed with ``snowball``;
      6. stems of one or two characters are removed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None, or the float NaN that
        pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    str
        The surviving stems joined by single spaces; '' when nothing is left.
    """
    # A missing cell reaches .apply() as a float NaN, which has no .replace().
    if not isinstance(text, str):
        return ''

    text = url_pattern.sub('', text.replace('\n', ' '))
    text = re.sub(r'\w*\d\w*', '', text)
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    text = re.sub(r'[^a-z\s]', ' ', text.lower())

    stems = (snowball.stem(word) for word in text.split() if word not in STOPWORDS)

    # Dropping short stems before joining (rather than blanking them out of the
    # joined string) avoids leaving double, leading or trailing spaces behind.
    return ' '.join(stem for stem in stems if len(stem) > 2)

## Clean Train data

In [5]:
# Clean every training abstract with clean_text, overwriting the column.
# NOTE: the column is replaced in place, so re-running this cell would feed
# already-cleaned text through clean_text again.
print('\n \n Cleaning started for train.csv . Hold on ...............  it will take time 10-15 minutes \n \n')
df_data['abstract'] = df_data['abstract'].apply(clean_text)

# The file written below is train_clean.csv, so the message names that file.
print('\n \n Cleaning completed for train.csv now saving it as train_clean.csv in data folder \n \n')
df_data.to_csv('./data/train_clean.csv', index=False)

# Print the first five cleaned rows as a sanity check.
print('Lets verify if that function works : \n,',df_data.head(),'\n \n All looks good \n \n ')


 
 Cleaning started for train.csv . Hold on it will take time 10-15 minutes 
 


 
 Cleaning completed for train.csv now saving it as train_clean.sav in data folder 
 

Lets verify if that function works : 
,        id                                           abstract     category  \
0  271675  bacteria often expos multipl stimuli complex e...     q-bio-QM   
1  412276  accur knowledg thermodynam properti zero tempe...      hep-ph-   
2  256956  largest solar flare solar cycl preced flare oc...  astro-ph-SR   
3  427612  say random integ variabl  monoton modulus char...      math-PR   
4  113852  deriv formula express joint distribut cyclic v...      math-CO   

   category_num  
0           138  
1            68  
2             7  
3            93  
4            76   
 
 All looks good 
 
 


## Clean Test data

In [6]:
# Announce the step, then run clean_text over every test abstract.
print(
    '\n \n Now cleaning test.csv  and saving it as test_clean.csv \n \n Hold on..............  it will take2-3 minutes \n \n'
)
df_test['abstract'] = df_test['abstract'].map(clean_text)

# Write the cleaned frame to ./data/test_clean.csv without the index column.
df_test.to_csv(path_or_buf='./data/test_clean.csv', index=False)

# Print the first five cleaned rows as a sanity check.
print(
    'Lets verify if that function works : \n,',
    df_test.head(),
    '\n \n All looks good \n \n ',
)



 
 Now cleaning test.csv  and saving it as test_clean.csv 
 
 Hold on..............  it will take2-3 minutes 
 

Lets verify if that function works : 
,        id                                           abstract
0  430065  depth map obtain commerci depth sensor alway l...
1   75226  lambda express introduc java program languag p...
2  301990  propos demonstr gamma gamma collid   gev  euro...
3  301001  physic lab student experi wide rang equit ineq...
4  280179  exist local minima one hidden layer relu netwo... 
 
 All looks good 
 
 
