In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords

In [2]:
train_set = pd.read_csv("dataset_train.csv")
test_set = pd.read_csv("dataset_test.csv")

In [3]:
train_set = train_set.astype({"Class Index": int, "Description": str})
test_set = test_set.astype({"Class Index": int, "Description": str})

In [4]:
train_set.head()

Unnamed: 0,Class Index,Description
0,1,BEIJING - Former President Jiang Zemin turned ...
1,1,THE sheer bastardry displayed by militants who...
2,1,But the downside to Senator John Kerry's delib...
3,1,"Dhaka, Aug 29: Bangladeshs main Opposition par..."
4,1,AFP - NATO and Russia vowed to boost cooperati...


In [5]:
print(len(train_set))
print(len(test_set))

50000
5000


In [11]:
# Separating input and label
train_x=train_set["Description"]
test_x=test_set["Description"]

train_y=train_set["Class Index"]
test_y=test_set["Class Index"]

### Pre-processing

In [10]:
def preprocess(text):
    stop_words = stopwords.words('english')
    lemmatizer=nltk.stem.WordNetLemmatizer()
    
    # tokenazation
    tokens_list = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            tokens_list.append(word)
    
    output = []
    for Token in tokens_list:
        # to lowercase
        token=Token.lower()
        
        # punctuation removal
        for punc in string.punctuation:
            token=token.replace(punc,'')
            
        # number digits removal
        for digit in string.digits:
            token=token.replace(digit,'')
            
        # lemmatzation
        token = lemmatizer.lemmatize(token)
        
        # stop words removal  
        if (token != "") and (token not in stop_words):
            output.append(token)
            
    return output

In [12]:
%%time
test_x=test_x.apply(preprocess)

CPU times: user 27.3 s, sys: 169 ms, total: 27.5 s
Wall time: 27.5 s


In [8]:
%%time
train_x=train_x.apply(preprocess)

CPU times: user 2min 35s, sys: 1.16 s, total: 2min 36s
Wall time: 2min 37s


In [9]:
# list to str
train_x=train_x.apply(lambda x: ''.join(i+' ' for i in x))
test_x=test_x.apply(lambda x: ''.join(i+' '  for i in x))

### Save pre-processed data

In [10]:
train_set_pp = pd.DataFrame(columns=["Class Index", "Description"])
test_set_pp = pd.DataFrame(columns=["Class Index", "Description"])

In [11]:
train_set_pp["Class Index"] = train_y
train_set_pp["Description"] = train_x

In [12]:
test_set_pp["Class Index"] = test_y
test_set_pp["Description"] = test_x

In [13]:
train_set_pp.to_csv("dataset_train_pp.csv")

In [14]:
test_set_pp.to_csv("dataset_test_pp.csv")

In [15]:
train_set_pp.head()

Unnamed: 0,Class Index,Description
0,1,beijing former president jiang zemin turned la...
1,1,sheer bastardry displayed militant kidnapped a...
2,1,downside senator john kerrys deliberative styl...
3,1,dhaka aug bangladesh main opposition party ha ...
4,1,afp nato russia vowed boost cooperation fighti...
