Get the directory containing the article categories.

In [1]:
import csv
import os
import re

import nltk
import pandas as pd

# Root folder of the corpus: the "data" directory inside the current working directory.
data_dir = os.path.join(os.path.abspath(os.curdir), "data")

In [2]:
# Paths of all sub-directories found directly inside data_dir.
data_categories = [
    entry.path
    for entry in filter(os.DirEntry.is_dir, os.scandir(data_dir))
]

Walk through every category directory and write each article to `data.tsv`. A running counter that starts at 0 gives every article a unique ID, so its final value equals the total number of articles.

In [3]:
# Running counter that doubles as the unique ID of each article.
# It is called article_id so that the built-in id() is not shadowed.
article_id = 0

# newline='' is what the csv module asks for: the writer emits its own line
# terminators, and without it text-mode newline translation can add extra
# carriage returns (e.g. on Windows).
with open('data.tsv', 'w', newline='') as dataset:
    tsv_output = csv.writer(dataset, delimiter='\t')
    tsv_output.writerow(['id','title','content','category'])

    # os.scandir()/os.listdir() return entries in arbitrary order; sorting
    # keeps the ID given to each article the same from run to run.
    for category_path in sorted(data_categories):
        # The directory name is used as the category label.
        category = os.path.basename(os.path.normpath(category_path))

        for file_name in sorted(os.listdir(category_path)):
            article_path = os.path.join(category_path, file_name)
            # Only regular .txt files are treated as articles.
            if not (file_name.endswith('.txt') and os.path.isfile(article_path)):
                continue

            # The context manager closes the file even if reading fails.
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            with open(article_path, 'r', encoding = 'utf8', errors = 'ignore') as article:
                # First line is the title, everything after it is the body.
                title = article.readline()
                content = article.read()

            tsv_output.writerow([article_id, title, content, category])
            article_id += 1

In [4]:
# Read the TSV back through open() so that it is decoded with the same
# default text encoding that open() used when the file was written.
# The former encoding='ANSI' is a Windows-only codec alias and raises
# LookupError on other platforms.
# newline='' leaves line breaks inside quoted fields untouched for the parser.
with open("data.tsv", newline='') as dataset:
    df = pd.read_csv(dataset, sep='\t')
df.head()

Unnamed: 0,id,title,content,category
0,0,Ad sales boost Time Warner profit\r\n,\r\nQuarterly profits at US media giant TimeWa...,business
1,1,Dollar gains on Greenspan speech\r\n,\r\nThe dollar has hit its highest level again...,business
2,2,Yukos unit buyer faces loan claim\r\n,\r\nThe owners of embattled Russian oil giant ...,business
3,3,High fuel prices hit BA's profits\r\n,\r\nBritish Airways has blamed high fuel price...,business
4,4,Pernod takeover talk lifts Domecq\r\n,\r\nShares in UK drinks and food firm Allied D...,business


##### Data preprocessing

In [5]:
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stop_words

Prototype of the function that is defined in the cell below.

<span style="color:DeepPink">**preprocess_article**</span>**(text)**  
&nbsp;&nbsp;Removes special characters from a given string object, removes stop words and lemmatizes words using WordNetLemmatizer().  
&nbsp;&nbsp;&nbsp;**Parameters: &nbsp;&nbsp;&nbsp;text : str**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
String object to process. 

&nbsp;&nbsp;&nbsp;**Returns: &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;text : str**  
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
Lemmatized string object with stop words and several special characters removed.

In [6]:
# In a previous version of the project the word clouds showed that 'said' is
# the most frequent word in the data; it carries no useful meaning, so it is
# treated as an additional stop word.
# Going through a set keeps this cell safe to re-run ('said' is never added
# twice) and sorting gives the resulting list a stable order.
stop_words = sorted(set(stop_words) | {'said'})

# NOTE: WordNetLemmatizer relies on the NLTK 'wordnet' corpus being available
# locally; if it is missing, fetch it once with nltk.download('wordnet').
wordnet_lemmatizer = WordNetLemmatizer()

""" Make sure that the text parameter and return variable are of string type. """
def preprocess_article(text: str, stop_word_list=None, lemmatizer=None) -> str:
    """Normalise an article title or body for text classification.

    The text is lower-cased, double quotes and the characters ``!?:;.,[]()``
    are removed, possessive ``'s`` endings are stripped, stop words are
    dropped and every remaining word is lemmatized.

    Parameters
    ----------
    text : str
        String to process. ``None`` and NaN (what pandas yields for an empty
        TSV field) are treated as an empty string.
    stop_word_list : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words``.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wordnet_lemmatizer``.

    Returns
    -------
    str
        The remaining lemmatized words joined by single spaces.
    """
    # Missing values would otherwise fail on the first str method call.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Line breaks separate words and paragraphs, so turn them into spaces;
    # deleting them would glue the last word of one line to the first word
    # of the next.
    text = text.replace('\r', ' ').replace('\n', ' ')
    # Remove quotes.
    text = text.replace('"', '')
    # Convert text to lowercase.
    text = text.lower()
    # Remove punctuation and special characters !?:;.,[]() .
    text = text.translate(str.maketrans('', '', '!?:;.,[]()'))
    # Remove possessive 's only where it ends a word. A plain
    # replace("'s", "") would also eat the start of a quoted word
    # such as 'super'.
    text = re.sub(r"(?<=\w)'s\b", '', text)

    # A frozenset makes each membership test constant-time.
    blocked = frozenset(stop_words if stop_word_list is None else stop_word_list)
    active_lemmatizer = wordnet_lemmatizer if lemmatizer is None else lemmatizer

    # Filter stop words before lemmatizing, because lemmatizing can change
    # words like 'has' to 'ha', which would then no longer match.
    return ' '.join(
        active_lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in blocked
    )

Apply the function above to the title and content columns of the dataframe.

In [7]:
# Clean both free-text columns with preprocess_article, then preview the result.
df['title'] = df['title'].map(preprocess_article)
df['content'] = df['content'].map(preprocess_article)
df.head()

Unnamed: 0,id,title,content,category
0,0,ad sale boost time warner profit,quarterly profit medium giant timewarner jumpe...,business
1,1,dollar gain greenspan speech,dollar hit highest level euro month federal re...,business
2,2,yukos unit buyer face loan claim,owner embattled russian oil giant yukos ask bu...,business
3,3,high fuel price hit ba profit,british airway blamed high fuel price 40% drop...,business
4,4,pernod takeover talk lift domecq,share uk drink food firm allied domecq risen s...,business


Now split the dataframe into train and test datasets and save them as .tsv files.

In [8]:
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the notebook reproduces exactly the same split.
RANDOM_STATE = 42

# 80/20 split, stratified on the category column.
train, test = train_test_split(
    df,
    test_size = 0.2,
    stratify = df['category'],
    random_state = RANDOM_STATE,
)
# The test set is saved without its labels.
test = test.drop('category', axis = 1)

# Both files are written without the index column and without a header row.
train.to_csv('train_set.tsv', sep = '\t', index = False, header = False)
test.to_csv('test_set.tsv', sep = '\t', index = False, header = False)

print(train.shape)
print(test.shape)

(1780, 4)
(445, 3)
