In [1]:
# Importing required libraries

import pickle
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
import numpy as np

In [2]:
# Reading Data

df_path2 = "News_dataset.pickle"
with open(df_path2, 'rb') as data:
    df = pickle.load(data)

In [3]:
df.head()

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569
1,002.txt,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,002.txt-business,1,2257
2,003.txt,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,003.txt-business,1,1557
3,004.txt,High fuel prices hit BA's profits\r\n\r\nBriti...,business,004.txt-business,1,2421
4,005.txt,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,005.txt-business,1,1575


In [4]:
df.loc[1]['Content']

'Dollar gains on Greenspan speech\r\n\r\nThe dollar has hit its highest level against the euro in almost three months after the Federal Reserve head said the US trade deficit is set to stabilise.\r\n\r\nAnd Alan Greenspan highlighted the US government\'s willingness to curb spending and rising household savings as factors which may help to reduce it. In late trading in New York, the dollar reached $1.2871 against the euro, from $1.2974 on Thursday. Market concerns about the deficit has hit the greenback in recent months. On Friday, Federal Reserve chairman Mr Greenspan\'s speech in London ahead of the meeting of G7 finance ministers sent the dollar higher after it had earlier tumbled on the back of worse-than-expected US jobs data. "I think the chairman\'s taking a much more sanguine view on the current account deficit than he\'s taken for some time," said Robert Sinche, head of currency strategy at Bank of America in New York. "He\'s taking a longer-term view, laying out a set of cond

In [5]:
# 1. Text Cleaning

In [6]:
# 1.1 Special Characters Cleaning:

In [7]:
# \r and \n

df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
df['Content_Parsed_1'] = df['Content'].str.replace("\n", " ")
df['Content_Parsed_1'] = df['Content'].str.replace("    ", " ")


In [8]:
text = "Mr Greespan\'s"
text

"Mr Greespan's"

In [9]:
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')

In [10]:
# 1.2 Upcase / Downcase

In [11]:
# Lowercasing 
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()

In [12]:
# 1.3 Punctuation Signs

In [13]:
punctuation_signs = list("?:!.,;")
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_2'].str.replace(punct_sign, '')

In [14]:
# 1.4 Possesive Pronoun

In [15]:
df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")

In [16]:
# 1.5. Stemming and Lemmatization

In [17]:
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()

In [18]:
nrows = len(df)
lemmatized_text_list = []

for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['Content_Parsed_4']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

In [19]:
df['Content_Parsed_5'] = lemmatized_text_list

In [20]:
# 1.6 Stop Words

In [21]:
stop_words = list(stopwords.words('english'))

In [22]:
example = "me eating a meal"
word = "me"

# The regular expression is:
regex = r"\b" + word + r"\b"  # we need to build it like that to work properly

re.sub(regex, "StopWord", example)

'StopWord eating a meal'

In [23]:
df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in stop_words:

    regex_stopword = r"\b" + stop_word + r"\b"
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')

In [24]:
# Original Content

df.loc[5]['Content']

'Japan narrowly escapes recession\r\n\r\nJapan\'s economy teetered on the brink of a technical recession in the three months to September, figures show.\r\n\r\nRevised figures indicated growth of just 0.1% - and a similar-sized contraction in the previous quarter. On an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. A common technical definition of a recession is two successive quarters of negative growth.\r\n\r\nThe government was keen to play down the worrying implications of the data. "I maintain the view that Japan\'s economy remains in a minor adjustment phase in an upward climb, and we will monitor developments carefully," said economy minister Heizo Takenaka. But in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead, observers were less sanguine. "It\'s painting a picture of a recovery... much patchier than previously thoug

In [25]:
# 1st Iteration
df.loc[5]['Content_Parsed_1']

"Japan narrowly escapes recession\r\n\r\nJapan's economy teetered on the brink of a technical recession in the three months to September, figures show.\r\n\r\nRevised figures indicated growth of just 0.1% - and a similar-sized contraction in the previous quarter. On an annual basis, the data suggests annual growth of just 0.2%, suggesting a much more hesitant recovery than had previously been thought. A common technical definition of a recession is two successive quarters of negative growth.\r\n\r\nThe government was keen to play down the worrying implications of the data. I maintain the view that Japan's economy remains in a minor adjustment phase in an upward climb, and we will monitor developments carefully, said economy minister Heizo Takenaka. But in the face of the strengthening yen making exports less competitive and indications of weakening economic conditions ahead, observers were less sanguine. It's painting a picture of a recovery... much patchier than previously thought, sa

In [26]:
# 6th Iteration
df.loc[5]['Content_Parsed_6']

'japan narrowly escape recession\r\n\r\njapan economy teeter   brink   technical recession   three months  september, figure show.\r\n\r\nrevised figure indicate growth   0.1% -   similar-sized contraction   previous quarter.   annual basis,  data suggest annual growth   0.2%, suggest  much  hesitant recovery   previously  thought.  common technical definition   recession  two successive quarter  negative growth.\r\n\r\n government  keen  play   worry implications   data.  maintain  view  japan economy remain   minor adjustment phase   upward climb,    monitor developments carefully, say economy minister heizo takenaka.    face   strengthen yen make export less competitive  indications  weaken economic condition ahead, observers  less sanguine.  paint  picture   recovery... much patchier  previously thought, say paul sheard, economist  lehman brothers  tokyo. improvements   job market apparently  yet  fee   domestic demand,  private consumption   0.2%   third quarter.'

In [27]:
df.head(1)

Unnamed: 0,File_Name,Content,Category,Complete_Filename,id,News_length,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4,Content_Parsed_5,Content_Parsed_6
0,001.txt,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,001.txt-business,1,2569,Ad sales boost Time Warner profit\r\n\r\nQuart...,ad sales boost time warner profit\r\n\r\nquart...,ad sales boost time warner profit\r\n\r\nquart...,ad sales boost time warner profit\r\n\r\nquart...,ad sales boost time warner profit\r\n\r\nquart...,ad sales boost time warner profit\r\n\r\nquart...


In [28]:
# 2. Modifying Columns.

list_columns = ["File_Name", "Category", "Complete_Filename", "Content", "Content_Parsed_6"]
df = df[list_columns]

df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})

In [29]:
df.head()

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Content_Parsed
0,001.txt,business,001.txt-business,Ad sales boost Time Warner profit\r\n\r\nQuart...,ad sales boost time warner profit\r\n\r\nquart...
1,002.txt,business,002.txt-business,Dollar gains on Greenspan speech\r\n\r\nThe do...,dollar gain greenspan speech\r\n\r\n dollar ...
2,003.txt,business,003.txt-business,Yukos unit buyer faces loan claim\r\n\r\nThe o...,yukos unit buyer face loan claim\r\n\r\n owner...
3,004.txt,business,004.txt-business,High fuel prices hit BA's profits\r\n\r\nBriti...,high fuel price hit ba profits\r\n\r\nbritish ...
4,005.txt,business,005.txt-business,Pernod takeover talk lifts Domecq\r\n\r\nShare...,pernod takeover talk lift domecq\r\n\r\nshares...


In [30]:
# Label Coding

In [31]:
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

In [32]:
# Category Mapping

In [33]:
df['Category_Code'] = df['Category']
df = df.replace({'Category_Code': category_codes})

In [34]:
df.head()

Unnamed: 0,File_Name,Category,Complete_Filename,Content,Content_Parsed,Category_Code
0,001.txt,business,001.txt-business,Ad sales boost Time Warner profit\r\n\r\nQuart...,ad sales boost time warner profit\r\n\r\nquart...,0
1,002.txt,business,002.txt-business,Dollar gains on Greenspan speech\r\n\r\nThe do...,dollar gain greenspan speech\r\n\r\n dollar ...,0
2,003.txt,business,003.txt-business,Yukos unit buyer faces loan claim\r\n\r\nThe o...,yukos unit buyer face loan claim\r\n\r\n owner...,0
3,004.txt,business,004.txt-business,High fuel prices hit BA's profits\r\n\r\nBriti...,high fuel price hit ba profits\r\n\r\nbritish ...,0
4,005.txt,business,005.txt-business,Pernod takeover talk lifts Domecq\r\n\r\nShare...,pernod takeover talk lift domecq\r\n\r\nshares...,0


In [35]:
# 3. Train Test Split

In [36]:
X_train, X_test, y_train, y_test = train_test_split(df['Content_Parsed'], 
                                                    df['Category_Code'], 
                                                    test_size=0.20, 
                                                    random_state=8)


In [37]:
# 4. Text Representation

In [38]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [39]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True
)

features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(1780, 300)
(445, 300)


In [40]:
from sklearn.feature_selection import chi2
import numpy as np

for Product, category_id in sorted(category_codes.items()):
    features_chi2 = chi2(features_train, labels_train==category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' '))==1]
    print("# '{}' category:".format(Product))
    print(" .Most correlated keywords:\n. {}".format('\n '.join(unigrams[-5:])))
    print("")
    
    

# 'business' category:
 .Most correlated keywords:
. economic
 market
 economy
 growth
 bank

# 'entertainment' category:
 .Most correlated keywords:
. music
 best
 award
 star
 film

# 'politics' category:
 .Most correlated keywords:
. minister
 blair
 election
 party
 labour

# 'sport' category:
 .Most correlated keywords:
. win
 side
 game
 team
 match

# 'tech' category:
 .Most correlated keywords:
. digital
 software
 computer
 technology
 users

