In [29]:
import requests
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize
import language_check

In [30]:
def get_sentences(url, headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}):
    r = requests.get(url, headers=headers)
    # make sure that the page exist
    if r.status_code == 200:
        html = r.content
        try:
            dammit = UnicodeDammit(html)
            #print(dammit.unicode_markup)
            encoding = dammit.original_encoding
            soup = BeautifulSoup(html, 'lxml', exclude_encodings=[encoding])
            new_encoding = soup.original_encoding
            #print(soup.original_encoding)
            soup = BeautifulSoup(html, 'lxml',  exclude_encodings=[new_encoding])
            #print(soup.original_encoding)
            title = soup.find('h1')
            if title is not None:
                title_text = title.text.strip()
            article = soup.find_all("p")
            new_article = []
            for i in article:
                dammit = UnicodeDammit(i.text.strip().encode(), smart_quotes_to="ascii").unicode_markup
                new_article.append((dammit.encode('ascii', 'ignore')).decode("utf-8"))
            sentences = []
            for i in new_article:
                sentences.append(sent_tokenize(i))
            sentences = sum(sentences, [])
            return sentences
        except TypeError:
            print("Type error with url: ", url)
    
def multi_sites(url_list):
    all_sents = []
    new_url = []
    for i in url_list:
        this_url_sents = get_sentences(i)
        if not isinstance(this_url_sents, type(None)):
            new_url.append(i)
            all_sents.append(this_url_sents)
    all_sents = sum(all_sents,[])
    return all_sents, new_url

In [31]:

def save_csv(batch, file_name):
    batch = np.asarray(batch)
    pd_df = pd.DataFrame(batch)
    pd_df.to_csv(file_name, index = False, header = False)

In [48]:
def grammar_check(batch):
    grammar_tool = language_check.LanguageTool('en-GB')
    checked = []
    for i in batch:
        corrected = ''
        matches = grammar_tool.check(i)
        checked.append(language_check.correct(i, matches))
    return checked


In [32]:
first_batch_url =['http://www.dating-relationship-advice-for-women.com/what-do-men-want/', 
                  'http://www.dating-relationship-advice-for-women.com/does-he-like-me/',
                  'http://www.dating-relationship-advice-for-women.com/seducing-men/',
                  'http://www.dating-relationship-advice-for-women.com/signs-of-flirting/',
                  'http://www.dating-relationship-advice-for-women.com/attract-a-guy.html',
                  'http://www.dating-relationship-advice-for-women.com/is-he-the-one/',
                  'http://www.dating-relationship-advice-for-women.com/money-or-love/',
                  'http://www.dating-relationship-advice-for-women.com/sugar-daddy.html',
                  'http://www.dating-relationship-advice-for-women.com/shy-guy/',
                  'http://www.dating-relationship-advice-for-women.com/too-old-to-date/'
                  ]

In [33]:
first_batch, first_url_list = multi_sites(first_batch_url)

In [85]:
len(first_batch)

653

In [97]:
first_batch

['What do men want is a brilliant question that has confused women since the beginning of time.',
 'Men are strange, yet often simple creatures who are on a constant quest to define themselves in a world that no longer cares what they accomplish.',
 'There are no valiant causes to fight for.',
 'There are no more glaring injustices to stand against.',
 'Even if he went off to fight in any of the current worldwide conflicts, he would not really be looked upon with pride.',
 "He's more likely to be met with the sadness and grief that war creates.",
 'Men are lost.',
 'The things that used to mean something have lost their meaning so meaningless pursuits have taken their place.',
 'All you have to do is listen to the messages of pop culture to see that disrespect and criminal behavior are the new pursuits.',
 'If you asked a man today what he wants, he might not even be able to tell you.',
 'This leads us to you, the last true and traditional pursuit that a man can be proud of.',
 'The an

In [98]:
first_url_list

['http://www.dating-relationship-advice-for-women.com/what-do-men-want/',
 'http://www.dating-relationship-advice-for-women.com/does-he-like-me/',
 'http://www.dating-relationship-advice-for-women.com/seducing-men/',
 'http://www.dating-relationship-advice-for-women.com/signs-of-flirting/',
 'http://www.dating-relationship-advice-for-women.com/is-he-the-one/',
 'http://www.dating-relationship-advice-for-women.com/money-or-love/',
 'http://www.dating-relationship-advice-for-women.com/shy-guy/',
 'http://www.dating-relationship-advice-for-women.com/too-old-to-date/']

In [46]:
now_what = grammar_check(first_batch)

In [47]:
now_what

['What do men want is a brilliant question that has confused women since the beginning of time.',
 'Men are strange, yet often simple creatures who are on a constant quest to define themselves in a world that no longer cares what they accomplish.',
 'There are no valiant causes to fight for.',
 'There are no more glaring injustices to stand against.',
 'Even if he went off to fight in any of the current worldwide conflicts, he would not really be looked upon with pride.',
 "He's more likely to be met with the sadness and grief that war creates.",
 'Men are lost.',
 'The things that used to mean something have lost their meaning so meaningless pursuits have taken their place.',
 'All you have to do is listen to the messages of pop culture to see that disrespect and criminal behaviour are the new pursuits.',
 'If you asked a man today what he wants, he might not even be able to tell you.',
 'This leads us to you, the last true and traditional pursuit that a man can be proud of.',
 'The a

In [39]:
save_csv(now_what, './first_batch.csv')

In [13]:
second_batch_url = ['http://www.dating-relationship-advice-for-women.com/how-to-be-attractive/',
                    'http://www.dating-relationship-advice-for-women.com/get-a-guy-to-ask-you-out/',
                    'http://www.dating-relationship-advice-for-women.com/first-date-tips/',
                    'http://www.dating-relationship-advice-for-women.com/signs-he-is-into-you/',
                    'http://www.dating-relationship-advice-for-women.com/dealing-with-jealousy/',
                    'http://www.dating-relationship-advice-for-women.com/how-to-spot-a-cheater/',
                    'http://www.dating-relationship-advice-for-women.com/signs-of-infidelity/',
                    'http://www.dating-relationship-advice-for-women.com/emotional-affair/',
                    'http://www.dating-relationship-advice-for-women.com/is-my-man-cheating/'
                   ]

In [14]:
second_batch, second_url_list = multi_sites(second_batch_url)

In [15]:
len(second_batch)

728

In [16]:
second_batch

['When you look at it, the people who know how to be attractive get everything theyve always wanted in life.',
 'Were not just talking about being born the beautiful daughter of two famous supermodel parents.',
 'Were talking about true beauty.',
 'The kind of irresistible charisma of a person who has worked hard on making their lives incredible.',
 'Do you know people like this?',
 'Maybe its one of your friends.',
 'She walks into the room and steals everyones breath by the sheer force of her presence.',
 'Shes funny, smart, kind, and personable.',
 'You miss her when shes not there.',
 'She has all the great qualities that know just how to attract a man.',
 'Whats keeping you from being that person?',
 'She wasnt born charming!',
 'She didnt pop out of her mother and immediately start in with an anecdote about the cute guy who helped her find her binky.',
 'Being charismatic is something you have to learn.',
 'Although your genes make you crave fascinating people, they donEUR(TM)t g

In [18]:
second_url_list

['http://www.dating-relationship-advice-for-women.com/how-to-be-attractive/',
 'http://www.dating-relationship-advice-for-women.com/get-a-guy-to-ask-you-out/',
 'http://www.dating-relationship-advice-for-women.com/first-date-tips/',
 'http://www.dating-relationship-advice-for-women.com/signs-he-is-into-you/',
 'http://www.dating-relationship-advice-for-women.com/dealing-with-jealousy/',
 'http://www.dating-relationship-advice-for-women.com/how-to-spot-a-cheater/',
 'http://www.dating-relationship-advice-for-women.com/signs-of-infidelity/',
 'http://www.dating-relationship-advice-for-women.com/emotional-affair/',
 'http://www.dating-relationship-advice-for-women.com/is-my-man-cheating/']

In [44]:
now_what_second = grammar_check(second_batch)

In [17]:
save_csv(now_what_second, './second_batch.csv')