### Step 1 - Data Cleaning

Open all the recipes data in one file and prepare them for further analysis!

In [1]:
# Import the necessary library!

import pandas as pd
import os
import numpy as np
import glob
import datetime

import nltk
import re
import string
from nltk.tokenize import RegexpTokenizer  
from nltk.corpus import stopwords  
from nltk.tokenize import word_tokenize
print('package ready')

package ready


In [2]:
# List all the files in the folder!  
files = glob.glob('guardian/*.csv')
# read the files in a dataframe  
data_df = [pd.read_csv(f) for f in files]

In [3]:
# Append all the files together  
data = pd.concat(data_df,ignore_index=True)
# Reset the index  
data = data.reset_index()  
# Drop the duplicates  
data = data.drop_duplicates()

In [4]:
print("Number of data (recipes) : " + str(data.shape))
print()
data.head(2)

Number of data (recipes) : (7603, 11)



Unnamed: 0.1,index,Unnamed: 0,date,headline_list,category,author,subtitle,article_content,comments_number,share_number,url
0,0,0.0,2009-04-12T00:01:00+0100,My space,My space,Anna Chapman,The passionate entrepreneur Willy Harcourt-Coo...,"E\night years ago, my wife Tania and I came ba...",No Comments,0,https://www.theguardian.com/lifeandstyle/2009/...
1,1,1.0,2009-04-12T00:01:00+0100,Nigel Slater's Easter eggs,Nigel Slater recipes,Nigel Slater,Nothing comes with a trickier reputation to ma...,I\nt is fair to say the eggs I am most interes...,No Comments,No Sharing,https://www.theguardian.com/lifeandstyle/2009/...


In [5]:
# Drop the column that we don't need
data = data.drop(columns=['Unnamed: 0','index'])

In [6]:
# Look for null value in the data
print(data.isnull().sum())

date                 0
headline_list       28
category            35
author             349
subtitle           210
article_content      1
comments_number      3
share_number         3
url                  2
dtype: int64


In [7]:
# Make a bit of cleaning

# Delete all the article with category 'Family Life' - Not relevant for recipes analysis! 
data = data[data['category']!='Family life']

In [None]:
# Check at the name of the author
for i in data.author:
    print(i)

In [8]:
# Replace the author that have been wrongly webscraped!  
data = data.replace({'[<meta content="Dale Berning Sawa" name="author"/>]':'Dale Berning Sawa',  
                     '[<meta content="Chi-chi Nwanoku" name="author"/>]':'Chi-chi Nwanoku',  
                     '[<meta content="Hugh Fearnley-Whittingstall" name="author"/>]':'Hugh Fearnley-Whittingstall',  
                     '[<meta content="Jeanette Winterson" name="author"/>]':'Jeanette Winterson',  
                     '[<meta content="Matthew Fort" name="author"/>]':'Matthew Fort',  
                     '[<meta content="Susan McCarthy" name="author"/>]':'Susan McCarthy',  
                     '[<meta content="Ariane Sherine" name="author"/>]':'Ariane Sherine',  
                     '[<meta content="Ruby Tandoh" name="author"/>]':'Ruby Tandoh',  
                     '[<meta content="the Guardian" name="author"/>]':'the Guardian',  
                     '[<meta content="Romy Ash" name="author"/>, <meta content="Sarah Trotter" name="author"/>, <meta content="Romy Ash" name="author"/>, <meta content="Lauren Bamford" name="author"/>, <meta content="Sarah Trotter" name="author"/>]':'Romy Ash, Sara Trotter, Lauren Bamford',  
                     '[<meta content="Clem Bastow" name="author"/>]':'Clem Bastow',  
                     '[<meta content="Yotam Ottolenghi" name="author"/>, <meta content="Uyen Luu" name="author"/>, <meta content="David Frenkiel" name="author"/>, <meta content="Luise Vindahl" name="author"/>, <meta content="Caroline Craig" name="author"/>]':'Caroline Craig, Luise Vindahl, Yotam Ottolenghi, Uyen Luu, David Frenkiel',  
                     'Yotam Ottlenghi':'Yotam Ottolenghi','Yuki Sugiura, Valerie Berry, Lee Gould, Rachel Vere and Music by Evan Gildersleeve, theguardian.com':'Yuki Sugiura, Valerie Berry, Lee Gould, Rachel Vere',
                     'Hugh-Fearnley Whittingstall':'Hugh Fearnley-Whittingstall','Hugh Fearnley-Whittinstall':'Hugh Fearnley-Whittingstall'})

# If NaN for authors and NaN for subtitle -> change it to none!   
values = {'author': 'none', 'subtitle':'none'}  
data = data.fillna(value=values)

In [9]:
# Check for the no comments and no url!
test = data[data['comments_number'].isnull()]  
test.share_number = 'No Sharing'  
data.append(test)

change = data[(data['date'] == '2015-01-20') & (data['author']== 'Ruby Tandoh' )]  
change.share_number = 52  
change.url = 'https://www.theguardian.com/lifeandstyle/2015/jan/20/best-baking-recipes-from-2014-ruby-tandoh'  
change.comments_number = 25  
data.append(change)

print("Change ok!")

Change ok!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [10]:
# Remove the rows if NaN is in url, headline_list, share_number and category after having make the above change!  
data = data.dropna(subset = ['headline_list','url','share_number','category'])  

print('Number of rows (recipes), data clean: ' + str(data.shape))
print("")
print('Data Clean')

Number of rows (recipes), data clean: (7504, 9)

Data Clean


In [11]:
# Check the types of the data!
data.dtypes

date               object
headline_list      object
category           object
author             object
subtitle           object
article_content    object
comments_number    object
share_number       object
url                object
dtype: object

In [None]:
data.head(2)

Change the data date to date instead of object!

In [12]:
#Separe the date in year and month
# Remove the strings of time in date. Keep only the day and the month.b  
data.date = data.date.str.replace('T\d.*','')  
data['date'] = pd.to_datetime(data['date'])

# Add a column in dataframe for the year, month and day  
data['year'] = data['date'].dt.year  
data['month'] = data['date'].dt.month_name()  
data['day'] = data['date'].dt.day 

In [13]:
## Function for tokenization and cleaning text!   
cleaning = ['headline_list','subtitle','article_content']  
stop = stopwords.words('english')  

def cleandata(data):
        for i in cleaning:
                print(str(i))
                # Remove the punctuation  
                data['clean_'+str(i)] = data[i].apply(lambda x:x.translate(str.maketrans('','', string.punctuation)))  
                print('step 1: punctuation remove')

                # Remove the '\\n' line_brakes  
                data['clean_'+str(i)] = data['clean_'+str(i)].str.replace(r'\n',' ')
                # Get all the words in lowercase  
                data['clean_'+str(i)] = data['clean_'+str(i)].str.lower()  
                if i == 'article_content':
                    data['clean_'+str(i)] = data['clean_'+str(i)].str.replace(r'Since you’re here.*','')
                print('step 2: words in lowercase')

                # Remove all the english stopwords!  
                data['nostop_words_'+'clean_'+str(i)] = data['clean_'+str(i)].apply(  
                lambda x: ' '.join([word for word in x.split() if word not in (stop)]))  
                # Remove the special characters!  
                data['nostop_words_'+'clean_'+str(i)] = data['nostop_words_'+'clean_'+str(i)].str.replace(r"\’|\s\–|\'",'')                                                                                    
                data['nostop_words_'+'clean_'+str(i)] = data['nostop_words_'+'clean_'+str(i)].str.replace(r'[0-9]+', '')
                
                print('step 3: nonstopwords and special characters removed')

                # Tokenize the text  
                data['token_'+str(i)] = data['nostop_words_'+'clean_'+str(i)].apply(word_tokenize)
                print('step 4: words tokenize')
                print('Data Clean Finish')
                print("")
                
              #data['clean_'+str(i)] = data['clean_'+str(i)].str.replace(guardian1,'')
                
        return

cleandata(data)

headline_list
step 1: punctuation remove
step 2: words in lowercase
step 3: nonstopwords and special characters removed
step 4: words tokenize
Data Clean Finish

subtitle
step 1: punctuation remove
step 2: words in lowercase
step 3: nonstopwords and special characters removed
step 4: words tokenize
Data Clean Finish

article_content
step 1: punctuation remove
step 2: words in lowercase
step 3: nonstopwords and special characters removed
step 4: words tokenize
Data Clean Finish



In [14]:
data.head(2)

Unnamed: 0,date,headline_list,category,author,subtitle,article_content,comments_number,share_number,url,year,...,day,clean_headline_list,nostop_words_clean_headline_list,token_headline_list,clean_subtitle,nostop_words_clean_subtitle,token_subtitle,clean_article_content,nostop_words_clean_article_content,token_article_content
0,2009-04-12,My space,My space,Anna Chapman,The passionate entrepreneur Willy Harcourt-Coo...,"E\night years ago, my wife Tania and I came ba...",No Comments,0,https://www.theguardian.com/lifeandstyle/2009/...,2009,...,12,my space,space,[space],the passionate entrepreneur willy harcourtcooz...,passionate entrepreneur willy harcourtcooze op...,"[passionate, entrepreneur, willy, harcourtcooz...",e ight years ago my wife tania and i came back...,e ight years ago wife tania came back venezuel...,"[e, ight, years, ago, wife, tania, came, back,..."
1,2009-04-12,Nigel Slater's Easter eggs,Nigel Slater recipes,Nigel Slater,Nothing comes with a trickier reputation to ma...,I\nt is fair to say the eggs I am most interes...,No Comments,No Sharing,https://www.theguardian.com/lifeandstyle/2009/...,2009,...,12,nigel slaters easter eggs,nigel slaters easter eggs,"[nigel, slaters, easter, eggs]",nothing comes with a trickier reputation to ma...,nothing comes trickier reputation make soufflé...,"[nothing, comes, trickier, reputation, make, s...",i t is fair to say the eggs i am most interest...,fair say eggs interested come weekend wrapped ...,"[fair, say, eggs, interested, come, weekend, w..."


In [15]:
# Save the Clean Text in a CSV file
data.to_csv('guardian_clean_data.csv')
print('data clean and save in csv file')

data clean and save in csv file
