In [3]:
import numpy as np
import resources.text_normalizer as tn
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Load the combined dataset, using the CSV's 'Unnamed: 0' column as the row index.
data_df = pd.read_csv(
    'data/all_data.csv',
    index_col='Unnamed: 0',
)

In [4]:
# Preview the loaded frame (bare last expression, so it is rendered as the cell output).
data_df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
11764,business,"During the day, the markets saw high volatilit..."
11765,business,Investors are awaiting economic growth data an...
11766,business,"The watchdog passed five separate orders, toge..."
11767,business,"The S&P 500 opened higher by 1.26 points, or 0..."


In [5]:
# how many empty documents are there?
# A document counts as empty when its text is missing (NaN) or holds only
# whitespace. Missing values need their own check: comparing the stripped
# text with "" evaluates to False for NaN, so they would otherwise be skipped.
text_col = data_df['text']
empty_mask = text_col.isna() | text_col.str.strip().eq("")

total_nulls = int(empty_mask.sum())
print("Empty documents:", total_nulls)

Empty documents: 0


In [6]:
import nltk

# Start from NLTK's English stopword list, then take "no" and "not" out of it
# so negations survive stopword removal (they matter if bi-grams are used).
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)

# Switches passed to the project's text normalizer: lemmatization is on,
# stemming is off, and every other cleaning step is enabled.
normalizer_flags = dict(
    html_stripping=True,
    contraction_expansion=True,
    accented_char_removal=True,
    text_lower_case=True,
    text_lemmatization=True,
    text_stemming=False,
    special_char_removal=True,
    remove_digits=True,
    stopword_removal=True,
)
norm_corpus = tn.normalize_corpus(corpus=data_df['text'],
                                  stopwords=stopword_list,
                                  **normalizer_flags)

# Store the normalized text next to the raw text.
data_df['clean text'] = norm_corpus

In [7]:
# Show the first rows to compare the raw 'text' with the new 'clean text' column.
data_df.head()

Unnamed: 0,category,text,clean text
0,tech,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,business,worldcom boss left books alone former worldc...,worldcom boss left book alone former worldcom ...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary farrell gamble leicester say not r...
3,sport,yeading face newcastle in fa cup premiership s...,yeade face newcastle fa cup premiership side n...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...


In [8]:
from sklearn import preprocessing
# Encoder used to turn the category names into integer labels.
le = preprocessing.LabelEncoder()
# fit() is the cell's last expression, so the fitted encoder's repr is displayed.
le.fit(data_df['category'])

LabelEncoder()

In [9]:
# Inspect the category names the encoder learned from the 'category' column.
le.classes_

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [10]:
# Assign the encoded labels positionally (row i gets the label for row i).
# Do not wrap the array in pd.Series: that gives it a fresh 0..n-1 index, and
# column assignment aligns on index labels. data_df is indexed by the
# 'Unnamed: 0' column, so any gap or duplicate in that index would silently
# produce NaN or attach a label to the wrong row.
data_df['category label'] = le.transform(data_df['category'])

In [11]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
data_df.to_csv(
    'data/cleaned_all_data.csv',
    index=False,
)