In [49]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import text_preprocessor as tp

In [5]:
# Load the pickled DataFrame that holds the raw news articles.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that this project produced itself.
file_path = 'pickles/dataset.pickle'
with open(file_path, mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [6]:
# Preview the first rows of the freshly loaded dataset to check its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length
0,0,Chinese exports rise 25% in 2004\n\nExports fr...,business,1784
1,1,S Korea spending boost to economy\n\nSouth Kor...,business,2600
2,2,Bush budget seeks deep cutbacks\n\nPresident B...,business,3651
3,3,MG Rover China tie-up 'delayed'\n\nMG Rover's ...,business,1697
4,4,French suitor holds LSE meeting\n\nEuropean st...,business,2450


In [21]:
# Show the full raw text of the article with index label 1.
# A single .loc[row, column] lookup avoids chained indexing, which would
# first materialise the whole row as an intermediate Series.
df.loc[1, 'contents']

'S Korea spending boost to economy\n\nSouth Korea will boost state spending next year in an effort to create jobs and kick start its sputtering economy.\n\nIt has earmarked 100 trillion won ($96bn) for the first six months of 2005, 60% of its total annual budget. The government\'s main problems are "slumping consumption and a contraction in the construction industry". It aims to create 400,000 jobs and will focus on infrastructure and home building, as well as providing public firms with money to hire new workers.\n\nThe government has set an economic growth rate target of 5% for next year and hinted that would be in danger unless it took action. "Internal and external economic conditions are likely to remain unfavourable in 2005," the Finance and Economy Ministry said in a statement.\n\nIt blamed "continuing uncertainties such as fluctuating oil prices and foreign exchange rates and stagnant domestic demand that has shown few signs of a quick rebound". In 2004, growth will be between 

### Text Cleaning

In [23]:
# Start the cleaning pipeline: store a lower-cased copy of the raw article
# text in a new 'cleaned_txt' column, leaving 'contents' untouched.
df['cleaned_txt'] = df['contents'].str.lower()

In [24]:
# Run the project's remove_html_tags helper over every article.
# NOTE(review): presumably strips HTML markup; the helper lives in
# text_preprocessor and is not shown here.
df['cleaned_txt'] = df['cleaned_txt'].map(tp.remove_html_tags)

In [25]:
# Run the project's remove_accented_chars helper over every article.
# NOTE(review): presumably folds accented characters to plain ASCII; the
# helper lives in text_preprocessor and is not shown here.
df['cleaned_txt'] = df['cleaned_txt'].map(tp.remove_accented_chars)

In [31]:
# Drop special characters from every article; remove_digits=True asks the
# helper to discard digits as well.
df['cleaned_txt'] = df['cleaned_txt'].map(
    lambda text: tp.remove_special_chars(text, remove_digits=True)
)

In [33]:
# Lemmatize every article with the project's lemmatize_text helper
# (e.g. "holds" becomes "hold" in the preview that follows).
df['cleaned_txt'] = df['cleaned_txt'].map(tp.lemmatize_text)

In [34]:
# Preview the frame again to compare 'contents' with the new 'cleaned_txt' column.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length,cleaned_txt
0,0,Chinese exports rise 25% in 2004\n\nExports fr...,business,1784,chinese export rise in \n\n export from chin...
1,1,S Korea spending boost to economy\n\nSouth Kor...,business,2600,s korea spending boost to economy \n\n south k...
2,2,Bush budget seeks deep cutbacks\n\nPresident B...,business,3651,bush budget seek deep cutback \n\n president b...
3,3,MG Rover China tie-up 'delayed'\n\nMG Rover's ...,business,1697,mg rover china tieup delay \n\n mg rover propo...
4,4,French suitor holds LSE meeting\n\nEuropean st...,business,2450,french suitor hold lse meet \n\n european stoc...


In [37]:
# Inspect the cleaned text of the article with index label 1.
# A single .loc[row, column] lookup avoids chained indexing, which would
# first materialise the whole row as an intermediate Series.
df.loc[1, 'cleaned_txt']

's korea spending boost to economy \n\n south korea will boost state spending next year in an effort to create job and kick start its sputter economy \n\n it have earmark   trillion win bn for the first six month of    of its total annual budget the government main problem be slump consumption and a contraction in the construction industry it aim to create   job and will focus on infrastructure and home building as well as provide public firm with money to hire new worker \n\n the government have set an economic growth rate target of   for next year and hint that would be in danger unless it take action internal and external economic condition be likely to remain unfavourable in   the finance and economy ministry say in a statement \n\n it blame continue uncertainty such as fluctuate oil price and foreign exchange rate and stagnant domestic demand that have show few sign of a quick rebound in   growth will be between   and   the ministry say not everyone be convince the plan will work 

In [40]:
# Replace line-feed and carriage-return characters with a space.
# regex=False states explicitly that these are literal matches: the default
# of `regex` differs between pandas versions, and older releases emit a
# FutureWarning when it is left implicit.
df['cleaned_txt'] = (
    df['cleaned_txt']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
)

In [41]:
# Check that the line breaks are gone from the article with index label 1.
# A single .loc[row, column] lookup avoids chained indexing, which would
# first materialise the whole row as an intermediate Series.
df.loc[1, 'cleaned_txt']

's korea spending boost to economy    south korea will boost state spending next year in an effort to create job and kick start its sputter economy    it have earmark   trillion win bn for the first six month of    of its total annual budget the government main problem be slump consumption and a contraction in the construction industry it aim to create   job and will focus on infrastructure and home building as well as provide public firm with money to hire new worker    the government have set an economic growth rate target of   for next year and hint that would be in danger unless it take action internal and external economic condition be likely to remain unfavourable in   the finance and economy ministry say in a statement    it blame continue uncertainty such as fluctuate oil price and foreign exchange rate and stagnant domestic demand that have show few sign of a quick rebound in   growth will be between   and   the ministry say not everyone be convince the plan will work our prim

In [42]:
# Remove stop words from every article with the project's remove_stopwords helper.
df['cleaned_txt'] = df['cleaned_txt'].map(tp.remove_stopwords)

In [44]:
# Inspect the fully cleaned text of the article with index label 1.
# A single .loc[row, column] lookup avoids chained indexing, which would
# first materialise the whole row as an intermediate Series.
df.loc[1, 'cleaned_txt']

'korea spending boost economy south korea boost state spending next year effort create job kick start sputter economy earmark trillion win bn first six month total annual budget government main problem slump consumption contraction construction industry aim create job focus infrastructure home building well provide public firm money hire new worker government set economic growth rate target next year hint would danger unless take action internal external economic condition likely remain unfavourable finance economy ministry say statement blame continue uncertainty fluctuate oil price foreign exchange rate stagnant domestic demand show sign quick rebound growth ministry say everyone convince plan work primary worry centre believe government overly optimistic view front loading budget enough turn economy around consultancy cast say report problem face south korea many consumer reel effect credit bubble recently burst million south koreans default credit card bill country big card lender 

### Label Encoding

In [45]:
# List the distinct category labels present in the dataset; these are the
# values that need an integer code.
df['category'].unique()

array(['business', 'entertainment', 'sport', 'politics', 'tech'],
      dtype=object)

In [46]:
# Integer class label assigned to each news category.
category_codes = dict(
    business=0,
    entertainment=1,
    sport=2,
    politics=3,
    tech=4,
)

In [47]:
# Encode each category name as its integer code.
# Series.map looks every label up in category_codes directly, instead of
# copying the column and relying on DataFrame.replace to downcast the
# object column to integers (a behaviour newer pandas releases deprecate).
# A label missing from category_codes maps to NaN, so the astype('int64')
# cast fails loudly rather than leaving a stray string in the label column.
df['category_code'] = df['category'].map(category_codes).astype('int64')

In [48]:
# Preview the frame to confirm the new 'category_code' column.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length,cleaned_txt,category_code
0,0,Chinese exports rise 25% in 2004\n\nExports fr...,business,1784,chinese export rise export china leapt previou...,0
1,1,S Korea spending boost to economy\n\nSouth Kor...,business,2600,korea spending boost economy south korea boost...,0
2,2,Bush budget seeks deep cutbacks\n\nPresident B...,business,3651,bush budget seek deep cutback president bush p...,0
3,3,MG Rover China tie-up 'delayed'\n\nMG Rover's ...,business,1697,mg rover china tieup delay mg rover propose ti...,0
4,4,French suitor holds LSE meeting\n\nEuropean st...,business,2450,french suitor hold lse meet european stock mar...,0


In [50]:
# Hold out 20% of the articles for testing; the fixed random_state makes
# the shuffle, and therefore the split, reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['cleaned_txt'],
    df['category_code'],
    test_size=0.2,
    random_state=8,
)

In [54]:
# Number of documents in the training split.
X_train.shape

(1779,)

In [58]:
# Number of documents in the held-out test split.
X_test.shape


(445,)

### TF-IDF Vectors

In [59]:
# TF-IDF vectoriser configuration used for both the train and test splits.
tfidf = TfidfVectorizer(
    encoding='utf-8',
    lowercase=False,     # the text was already lower-cased during cleaning
    ngram_range=(1, 2),  # unigrams and bigrams
    min_df=10,           # ignore terms found in fewer than 10 documents
    max_df=1.0,          # no upper document-frequency cut-off
    max_features=300,    # cap the vocabulary at 300 terms
    norm='l2',
    sublinear_tf=True,   # use 1 + log(tf) instead of the raw term count
)

In [65]:
# Labels are carried over unchanged under the name the modelling code uses.
train_labels = Y_train
# Learn the vocabulary and IDF weights from the training text only, then
# convert the sparse TF-IDF matrix into a dense array.
train_features = tfidf.fit_transform(X_train).toarray()
print(train_features.shape)

(1779, 300)


In [66]:
# Labels are carried over unchanged under the name the modelling code uses.
test_labels = Y_test
# Only transform here: the test text is projected onto the vocabulary and
# IDF weights already fitted on the training split.
test_features = tfidf.transform(X_test).toarray()
print(test_features.shape)

(445, 300)


In [68]:
# Display the dense training TF-IDF matrix (NumPy abbreviates the repr of large arrays).
train_features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.06910465,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.15695373, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.16056925, 0.06280318,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.09878994,
        0.        ],
       [0.        , 0.08900973, 0.        , ..., 0.10556865, 0.04695112,
        0.        ]])

### Save the objects to files

In [70]:
# Write every object produced above to its own file, pickles/<name>.pickle.
# One mapping plus a loop replaces nine copy-pasted cells; the file names and
# the pickled objects are unchanged.
objects_to_save = {
    'X_train': X_train,
    'X_test': X_test,
    'Y_train': Y_train,
    'Y_test': Y_test,
    'train_features': train_features,
    'test_features': test_features,
    'train_labels': train_labels,
    'test_labels': test_labels,
    'tfidf': tfidf,
}

for name, obj in objects_to_save.items():
    with open('pickles/{}.pickle'.format(name), 'wb') as output:
        pickle.dump(obj, output)