In [6]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import text_preprocessor as tp

In [7]:
# Load the raw news DataFrame that was pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open pickle files that come from a trusted source.
file_path = 'pickles/dataset.pickle'
with open(file_path, 'rb') as data:
    df = pickle.load(data)

In [8]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length
0,0,Kilroy unveils immigration policy\n\nEx-chatsh...,politics,957
1,1,'Debate needed' on donations cap\n\nA cap on d...,politics,3001
2,2,Child access laws shake-up\n\nParents who refu...,politics,3536
3,3,"School sport 'is back', says PM\n\nTony Blair ...",politics,2410
4,4,Pre-poll clash on tax and spend\n\nLabour and ...,politics,2637


In [9]:
# Show one full raw article (row label 1) as a reference for the cleaning steps.
df.loc[1, 'contents']

'\'Debate needed\' on donations cap\n\nA cap on donations to political parties should not be introduced yet, the elections watchdog has said.\n\nFears that big donors can buy political favours have sparked calls for a limit. In a new report, the Electoral Commission says it is worth debating a £10,000 cap for the future but now is not the right time to introduce it. It also says there should be more state funding for political parties and candidates should be able to spend more on election campaigning.\n\nThere were almost £68m in reported donations to political parties in 2001, 2002 and 2003, with nearly £12m of them from individual gifts worth more than £1m. The rules have already been changed so the public can see who gives how much to the parties but the report says there are still public suspicions. The commission says capping donations would mean taxpayers giving parties more cash - something which would first have to be acceptable to the public and shown to work. "While we are n

### Text Cleaning

In [10]:
# Start the cleaned column from a lowercased copy of the raw text; every later
# cleaning step rewrites `cleaned_txt` and leaves `contents` untouched.
df['cleaned_txt'] = df['contents'].str.lower()

In [11]:
# Apply the project helper to every article.
# NOTE(review): presumably strips HTML markup — confirm in text_preprocessor.
df['cleaned_txt'] = df['cleaned_txt'].apply(tp.remove_html_tags)

In [12]:
# Apply the project helper to every article.
# NOTE(review): presumably normalises accented characters — confirm in text_preprocessor.
df['cleaned_txt'] = df['cleaned_txt'].apply(tp.remove_accented_chars)

In [13]:
# Series.apply forwards remove_digits=True to the helper as a keyword argument.
# The preview of row 1 further down shows punctuation, currency symbols and
# digits gone after this step (e.g. the "£10,000" figure is blanked out).
df['cleaned_txt'] = df['cleaned_txt'].apply(tp.remove_special_chars, remove_digits=True)

In [14]:
# Lemmatise each article with the project helper; the preview of row 1 below
# shows e.g. "needed" -> "need" and "donations" -> "donation".
df['cleaned_txt'] = df['cleaned_txt'].apply(tp.lemmatize_text)

In [15]:
# Preview again to compare `contents` with the new `cleaned_txt` column.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length,cleaned_txt
0,0,Kilroy unveils immigration policy\n\nEx-chatsh...,politics,957,kilroy unveil immigration policy \n\n exchatsh...
1,1,'Debate needed' on donations cap\n\nA cap on d...,politics,3001,debate need on donation cap \n\n a cap on dona...
2,2,Child access laws shake-up\n\nParents who refu...,politics,3536,child access law shakeup \n\n parent who refus...
3,3,"School sport 'is back', says PM\n\nTony Blair ...",politics,2410,school sport be back say pm \n\n tony blair ha...
4,4,Pre-poll clash on tax and spend\n\nLabour and ...,politics,2637,prepoll clash on tax and spend \n\n labour and...


In [16]:
# Inspect the same article after lemmatisation; line breaks are still present here.
df.loc[1, 'cleaned_txt']

'debate need on donation cap \n\n a cap on donation to political party should not be introduce yet the election watchdog have say \n\n fear that big donor can buy political favour have spark call for a limit in a new report the electoral commission say it be worth debate a   cap for the future but now be not the right time to introduce it it also say there should be more state funding for political party and candidate should be able to spend more on election campaigning \n\n there be almost m in report donation to political party in    and   with nearly m of them from individual gift worth more than m the rule have already be change so the public can see who give how much to the party but the report say there be still public suspicion the commission say capping donation would mean taxpayer give party more cash   something which would first have to be acceptable to the public and show to work while we be not in principle oppose to the introduction of a donation cap we do not believe tha

In [17]:
# Turn line breaks into spaces. regex=False forces literal matching on every
# pandas version: the default for `regex` changed from True to False in
# pandas 2.0, and pandas 1.x emits a FutureWarning for single-character
# patterns when the argument is left implicit.
df['cleaned_txt'] = (
    df['cleaned_txt']
    .str.replace('\n', ' ', regex=False)
    .str.replace('\r', ' ', regex=False)
)

In [18]:
# Check that the line breaks in the sample article were replaced by spaces.
df.loc[1, 'cleaned_txt']

'debate need on donation cap    a cap on donation to political party should not be introduce yet the election watchdog have say    fear that big donor can buy political favour have spark call for a limit in a new report the electoral commission say it be worth debate a   cap for the future but now be not the right time to introduce it it also say there should be more state funding for political party and candidate should be able to spend more on election campaigning    there be almost m in report donation to political party in    and   with nearly m of them from individual gift worth more than m the rule have already be change so the public can see who give how much to the party but the report say there be still public suspicion the commission say capping donation would mean taxpayer give party more cash   something which would first have to be acceptable to the public and show to work while we be not in principle oppose to the introduction of a donation cap we do not believe that such

In [19]:
# Drop stop words with the project helper; the next preview shows words such
# as "on", "a" and "to" removed and the runs of spaces collapsed.
df['cleaned_txt'] = df['cleaned_txt'].apply(tp.remove_stopwords)

In [20]:
# Final form of the sample article after all cleaning steps.
df.loc[1, 'cleaned_txt']

'debate need donation cap cap donation political party introduce yet election watchdog say fear big donor buy political favour spark call limit new report electoral commission say worth debate cap future right time introduce also say state funding political party candidate able spend election campaigning almost report donation political party nearly individual gift worth rule already change public see give much party report say still public suspicion commission say capping donation would mean taxpayer give party cash something would first acceptable public show work principle oppose introduction donation cap believe major departure exist system would sensible say report cap small enough amount make difference would ban donation even without change commission urge political party seek smallscale donation suggest income tax relief gift also suggest increase state funding party help extend party least two member house commons european parliament scottish parliament welsh assembly northern

### Label Encoding

In [21]:
# List the distinct category labels that need an integer code.
df['category'].unique()

array(['politics', 'sport', 'business', 'tech', 'entertainment'],
      dtype=object)

In [22]:
# Integer label for each news category. A category's position in the list is
# its code, so the order of the list must not change.
category_codes = {
    name: code
    for code, name in enumerate(
        ['business', 'entertainment', 'sport', 'politics', 'tech']
    )
}

In [23]:
# Encode each category name as its integer label with a direct dictionary
# lookup. DataFrame.replace on an object column only produced integers through
# implicit downcasting, which pandas has deprecated, and it silently left any
# label missing from `category_codes` as a string.
df['category_code'] = df['category'].map(category_codes)

# Series.map yields NaN for labels without a code; fail fast so an unencoded
# label cannot reach the train/test split.
unmapped_categories = df.loc[df['category_code'].isna(), 'category'].unique()
if len(unmapped_categories):
    raise ValueError(
        'No code defined in category_codes for: {}'.format(list(unmapped_categories))
    )
df['category_code'] = df['category_code'].astype('int64')

In [24]:
# Confirm the new `category_code` column sits alongside `category`.
df.head()

Unnamed: 0.1,Unnamed: 0,contents,category,News_length,cleaned_txt,category_code
0,0,Kilroy unveils immigration policy\n\nEx-chatsh...,politics,957,kilroy unveil immigration policy exchatshow ho...,3
1,1,'Debate needed' on donations cap\n\nA cap on d...,politics,3001,debate need donation cap cap donation politica...,3
2,2,Child access laws shake-up\n\nParents who refu...,politics,3536,child access law shakeup parent refuse allow f...,3
3,3,"School sport 'is back', says PM\n\nTony Blair ...",politics,2410,school sport back say pm tony blair promise sp...,3
4,4,Pre-poll clash on tax and spend\n\nLabour and ...,politics,2637,prepoll clash tax spend labour tory clash tax ...,3


### Split dataset into training and testing sets

See the scikit-learn `train_test_split` documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [25]:
# Hold out 20% of the articles for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['cleaned_txt'],
    df['category_code'],
    test_size=0.2,
    random_state=8,
)

In [26]:
# Number of training documents.
X_train.shape

(1779,)

In [27]:
# Number of held-out test documents.
X_test.shape


(445,)

### TF-IDF Vectors

See the scikit-learn `TfidfVectorizer` documentation:
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [28]:
# TF-IDF configuration (parameter meanings per the scikit-learn documentation).
tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=(1, 2),   # unigrams and bigrams
    lowercase=False,      # the text was already lowercased during cleaning
    max_df=1.0,           # no upper document-frequency cut-off
    min_df=10,            # ignore terms found in fewer than 10 documents
    max_features=300,     # cap the vocabulary at 300 terms
    norm='l2',
    sublinear_tf=True,    # use 1 + log(tf) instead of the raw term count
)

In [29]:
# Fit the vectoriser on the training texts only, then convert the result to a
# dense array (one row per training document).
train_labels = Y_train
train_features = tfidf.fit_transform(X_train).toarray()
print(train_features.shape)

(1779, 300)


In [30]:
# Re-use the already fitted vectoriser on the test texts (transform only, no
# refit) and convert the result to a dense array.
test_labels = Y_test
test_features = tfidf.transform(X_test).toarray()
print(test_features.shape)

(445, 300)


In [31]:
# Inspect the dense TF-IDF training matrix (one row per document, one column per term).
train_features

array([[0.        , 0.        , 0.        , ..., 0.08569788, 0.07982272,
        0.16548335],
       [0.        , 0.        , 0.        , ..., 0.08514499, 0.07930774,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.08057906,
        0.16705136],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.07167546,
        0.14859298],
       [0.        , 0.        , 0.        , ..., 0.05963794, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.19022694, 0.        ,
        0.        ]])

### Save the objects to pickle files

In [32]:
# Cleaned DataFrame, including the `cleaned_txt` and `category_code` columns
# added above. Mode 'wb' overwrites any existing file at this path.
with open('pickles/cleaned_df.pickle', 'wb') as output:
    pickle.dump(df, output)

In [33]:
# X_train: cleaned article texts of the training split.
with open('pickles/X_train.pickle', 'wb') as output:
    pickle.dump(X_train, output)

In [34]:
# X_test: cleaned article texts of the test split.
with open('pickles/X_test.pickle', 'wb') as output:
    pickle.dump(X_test, output)

In [35]:
# Y_train: category codes of the training split.
with open('pickles/Y_train.pickle', 'wb') as output:
    pickle.dump(Y_train, output)

In [36]:
# Y_test: category codes of the test split.
with open('pickles/Y_test.pickle', 'wb') as output:
    pickle.dump(Y_test, output)

In [37]:
# train_features: dense TF-IDF matrix computed from X_train.
with open('pickles/train_features.pickle', 'wb') as output:
    pickle.dump(train_features, output)

In [38]:
# test_features: dense TF-IDF matrix computed from X_test with the fitted vectoriser.
with open('pickles/test_features.pickle', 'wb') as output:
    pickle.dump(test_features, output)

In [39]:
# train_labels: the same object as Y_train, saved under its own file name.
with open('pickles/train_labels.pickle', 'wb') as output:
    pickle.dump(train_labels, output)

In [40]:
# test_labels: the same object as Y_test, saved under its own file name.
with open('pickles/test_labels.pickle', 'wb') as output:
    pickle.dump(test_labels, output)

In [41]:
# tfidf: the vectoriser fitted on X_train, with its learned vocabulary and IDF weights.
with open('pickles/tfidf.pickle', 'wb') as output:
    pickle.dump(tfidf, output)