# Cleaning News Data

In [1]:
import pandas as pd
import nltk
import re
import string
import pickle

### Read previously stored data

In [2]:
# Load the articles stored by an earlier step and wrap the unpickled object
# in a DataFrame. NOTE: unpickling can execute arbitrary code, so only load
# pickle files that you produced yourself.
news_data = pd.DataFrame(pd.read_pickle('pickle/news_data.pkl'))
news_data

Unnamed: 0,title,content
0,House Republicans Fret About Winning Their Hea...,WASHINGTON — Congressional Republicans have...
1,Rift Between Officers and Residents as Killing...,"After the bullet shells get counted, the blood..."
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...","When Walt Disney’s “Bambi” opened in 1942, cri..."
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...","Death may be the great equalizer, but it isn’t..."
4,Kim Jong-un Says North Korea Is Preparing to T...,"SEOUL, South Korea — North Korea’s leader, ..."
5,"Sick With a Cold, Queen Elizabeth Misses New Y...","LONDON — Queen Elizabeth II, who has been b..."
6,Taiwan’s President Accuses China of Renewed In...,BEIJING — President Tsai of Taiwan sharpl...
7,"After ‘The Biggest Loser,’ Their Bodies Fought...","Danny Cahill stood, slightly dazed, in a blizz..."
8,"First, a Mixtape. Then a Romance. - The New Yo...","Just how is Hillary Kerr, the founder of ..."
9,Calling on Angels While Enduring the Trials of...,Angels are everywhere in the Muñiz family’s ap...


In [3]:
# Column labels of the DataFrame (for a DataFrame, .keys() is the same Index)
news_data.columns

Index(['title', 'content'], dtype='object')

### Title and Content of the First News Article (Raw Data)

In [4]:
# Raw title of the article with row label 0, fetched in a single .loc lookup
news_data.loc[0, 'title']

'House Republicans Fret About Winning Their Health Care Suit - The New York Times'

In [5]:
# Raw body text of the article with row label 0, fetched in a single .loc lookup
news_data.loc[0, 'content']

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been 

### Function to clean data

In [6]:
# Function to clean the data
def clean_data_func(text):
    """Normalise a piece of news text by lower-casing it and stripping noise.

    The following steps are applied in order:

    1. Lower-case the whole string.
    2. Delete anything enclosed in square brackets (non-greedy; ``.`` does
       not match a newline, so a bracket pair must sit on one line).
    3. Delete every run of word characters that contains at least one digit
       (e.g. ``2016``, ``abc123``).
    4. Delete curly quotes, em dashes and the ``■`` bullet.
    5. Delete newline characters.
    6. Delete every character listed in ``string.punctuation``.

    Matches are deleted, not replaced by a space: surrounding spaces are
    kept (so runs of spaces can remain) and words separated only by a
    newline end up joined together.

    Args:
        text (str): The raw text to clean.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \w, \d) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r'\[.*?\]', '', text)   # bracketed fragments
    text = re.sub(r'\w*\d\w*', '', text)  # tokens containing a digit
    text = re.sub('[‘’“”—■]', '', text)   # curly quotes, em dash, bullet
    text = text.replace('\n', '')         # newlines
    # re.escape makes every punctuation character safe inside the class.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    return text

In [7]:
# Clean every row of both text columns. Assigning a whole column back avoids
# chained indexing (news_data[col][i] = ...), which pandas does not guarantee
# to write through to the DataFrame, and removes the hard-coded row count
# that failed on shorter frames and skipped rows on longer ones.
for column in ('title', 'content'):
    # na_action='ignore' leaves missing values untouched instead of passing
    # a non-string to clean_data_func.
    news_data[column] = news_data[column].map(clean_data_func, na_action='ignore')
'data cleaned'

'data cleaned'

### Cleaned Data

In [8]:
# Display the DataFrame again now that 'title' and 'content' have been cleaned
news_data

Unnamed: 0,title,content
0,house republicans fret about winning their hea...,washington congressional republicans have ...
1,rift between officers and residents as killing...,after the bullet shells get counted the blood ...
2,tyrus wong bambi artist thwarted by racial bia...,when walt disneys bambi opened in critics pra...
3,among deaths in a heavy toll in pop music th...,death may be the great equalizer but it isnt n...
4,kim jongun says north korea is preparing to te...,seoul south korea north koreas leader kim ...
5,sick with a cold queen elizabeth misses new ye...,london queen elizabeth ii who has been bat...
6,taiwans president accuses china of renewed int...,beijing president tsai of taiwan sharply...
7,after the biggest loser their bodies fought to...,danny cahill stood slightly dazed in a blizzar...
8,first a mixtape then a romance the new york t...,just how is hillary kerr the founder of a...
9,calling on angels while enduring the trials of...,angels are everywhere in the muñiz familys apa...


### Title and Content of the First News Article (Cleaned Data)

In [9]:
# Cleaned title of the article with row label 0, fetched in a single .loc lookup
news_data.loc[0, 'title']

'house republicans fret about winning their health care suit  the new york times'

In [10]:
# Cleaned body text of the article with row label 0, fetched in a single .loc lookup
news_data.loc[0, 'content']

'washington     congressional republicans have a new fear when it comes to their    health care lawsuit against the obama administration they might win the incoming trump administration could choose to no longer defend the executive branch against the suit which challenges the administrations authority to spend billions of dollars on health insurance subsidies for   and   americans handing house republicans a big victory on    issues but a sudden loss of the disputed subsidies could conceivably cause the health care program to implode leaving millions of people without access to health insurance before republicans have prepared a replacement that could lead to chaos in the insurance market and spur a political backlash just as republicans gain full control of the government to stave off that outcome republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the obama health care law angering conservative voters who have been demanding an

### Save the DataFrame for future use

In [11]:
# Write the cleaned DataFrame to disk. The context manager flushes and closes
# the file handle as soon as the dump finishes, rather than leaving it open.
with open('pickle/clean_news_data.pkl', 'wb') as clean_file:
    pickle.dump(news_data, clean_file)