In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Method to parse the structure of an html page using package beautifulsoup.
# The code looks for specific tags in the html structure and extracts the content
def getArticleDetailsByUrl(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text,"html.parser")
    #soup.prettify()
    
    headline = soup.title.string
    subheadline = soup.head.find("meta",attrs={"name":"description"}).get('content')

    doc_body = ''
    if "The Irish Times" in soup.text:
        for body_p_tag in soup.article.find_all("p", attrs={"class": "no_name"}):
            doc_body += body_p_tag.get_text() + " "

    source = "Other"
    try:
        if "irishtimes" in url:
            source = "IrishTimes"
            body_p_tag = soup.article.find("div", attrs={"class": "last_updated"}).find("p")
    except:
        pass

    first_sentence = doc_body.split(".")[0]

    return [headline, subheadline, first_sentence, doc_body, source]


In [4]:
#Feedparser is a library to parse RSS/XML feeds, these are files with a specific XML structure
#If you don't have it, install using conda or pip, e.g.,: pip install feedparser
import feedparser
#help(feedparser)

#Parse the XML file to retrieve the URLs for individual news articles.
#Parse each article's HTML page
def scrapRSSFeed(rss_feed):
    d = feedparser.parse(rss_feed)
    #print(d)
    #print(d['entries'], "\n")
    df = pd.DataFrame()
        
    for item in d['entries']:
        #Extract an article URL
        article_url = item['link']
        [headline, subheadline, first_sentence, doc_body, source] = getArticleDetailsByUrl(article_url)
        row = [headline, doc_body]
        df = pd.concat([df, pd.Series(row)], axis=1)
        print(df)
    df = df.T
    df.columns = ['Title', 'Content']
        
    return df

In [5]:
#The URL of the XML file
url='http://www.irishtimes.com/cmlink/news-1.1319192'
xml_page = requests.get(url).text

df = scrapRSSFeed(url)
df = df.reset_index(drop=True)
df.index = (df.index+1).map("{0:03}".format)
df.index.name = 'Article ID'
df

                                                   0
0  New storm could strike this weekend, says Met ...
1  Met Éireann is monitoring whether another stor...
                                                   0  \
0  New storm could strike this weekend, says Met ...   
1  Met Éireann is monitoring whether another stor...   

                                                   0  
0  Consultants warn industrial action over two-ti...  
1  The prospect of industrial action by consultan...  
                                                   0  \
0  New storm could strike this weekend, says Met ...   
1  Met Éireann is monitoring whether another stor...   

                                                   0  \
0  Consultants warn industrial action over two-ti...   
1  The prospect of industrial action by consultan...   

                                                   0  
0  ‘Irish Passport’ among most popular searches o...  
1  ‘Irish citizenship’ and ‘Irish passport’ were ...  
    

                                                   0  \
0  New storm could strike this weekend, says Met ...   
1  Met Éireann is monitoring whether another stor...   

                                                   0  \
0  Consultants warn industrial action over two-ti...   
1  The prospect of industrial action by consultan...   

                                                   0  \
0  ‘Irish Passport’ among most popular searches o...   
1  ‘Irish citizenship’ and ‘Irish passport’ were ...   

                                                   0  \
0  Many trees in storm fell because of earlier dr...   
1  Many trees that fell during Storm Ali were vul...   

                                                   0  \
0  Connemara locals shocked at Swiss woman’s deat...   
1  A north Connemara community has expressed shoc...   

                                                   0  \
0      Controversial Howth development gets go ahead   
1  A controversial development in the fishi

                                                   0  \
0  New storm could strike this weekend, says Met ...   
1  Met Éireann is monitoring whether another stor...   

                                                   0  \
0  Consultants warn industrial action over two-ti...   
1  The prospect of industrial action by consultan...   

                                                   0  \
0  ‘Irish Passport’ among most popular searches o...   
1  ‘Irish citizenship’ and ‘Irish passport’ were ...   

                                                   0  \
0  Many trees in storm fell because of earlier dr...   
1  Many trees that fell during Storm Ali were vul...   

                                                   0  \
0  Connemara locals shocked at Swiss woman’s deat...   
1  A north Connemara community has expressed shoc...   

                                                   0  \
0      Controversial Howth development gets go ahead   
1  A controversial development in the fishi

                                                   0  \
0  New storm could strike this weekend, says Met ...   
1  Met Éireann is monitoring whether another stor...   

                                                   0  \
0  Consultants warn industrial action over two-ti...   
1  The prospect of industrial action by consultan...   

                                                   0  \
0  ‘Irish Passport’ among most popular searches o...   
1  ‘Irish citizenship’ and ‘Irish passport’ were ...   

                                                   0  \
0  Many trees in storm fell because of earlier dr...   
1  Many trees that fell during Storm Ali were vul...   

                                                   0  \
0  Connemara locals shocked at Swiss woman’s deat...   
1  A north Connemara community has expressed shoc...   

                                                   0  \
0      Controversial Howth development gets go ahead   
1  A controversial development in the fishi

Unnamed: 0_level_0,Title,Content
Article ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"New storm could strike this weekend, says Met ...",Met Éireann is monitoring whether another stor...
2,Consultants warn industrial action over two-ti...,The prospect of industrial action by consultan...
3,‘Irish Passport’ among most popular searches o...,‘Irish citizenship’ and ‘Irish passport’ were ...
4,Many trees in storm fell because of earlier dr...,Many trees that fell during Storm Ali were vul...
5,Connemara locals shocked at Swiss woman’s deat...,A north Connemara community has expressed shoc...
6,Controversial Howth development gets go ahead,A controversial development in the fishing vil...
7,Man dies and colleague is injured as tree fall...,"A man in his 20s died and a man in his 40s, hi..."
8,"After Storm Ali cancellation, exhibitors will ...","The decision wasn’t taken lightly, but it had ..."
9,Government to review policy on no driving lice...,A review is underway as to whether asylum seek...
10,Ambulance staff to stage industrial action nex...,Staff in the National Ambulance Service who ar...


In [124]:

# Import the necessary package to process data in JSON format
import json
# Import the necessary methods from "twitter" library
# Twitter API returns data in JSON format
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

# Variables that contains the user credentials to access Twitter API 
# ACCESS_TOKEN = 'YOUR ACCESS TOKEN"'
# ACCESS_SECRET = 'YOUR ACCESS TOKEN SECRET'
# CONSUMER_KEY = 'YOUR API KEY'
# CONSUMER_SECRET = 'ENTER YOUR API SECRET'
ACCESS_TOKEN = '2839893905-pBXUzdrHCNXyjfPuBpSwxNbH1zyEpRaa2sXK0Jd'
ACCESS_SECRET = 'eNtB7YTAfsMhPIQtKji8aQT7zQFpFfDPR2lQ89WKfgI1U'
CONSUMER_KEY = 'ZqPrfLpc0znZlz3kW2a22VmUa'
CONSUMER_SECRET = 'BHD19T0DmUV2XVvEhUAgvpXMx0nGfxevAtr53NbCd9jQjPyTqn'

oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Initiate the connection to Twitter Streaming API
twitter_stream = TwitterStream(auth=oauth)

# Get a sample of the public data published on Twitter in real-time
#iterator = twitter_stream.statuses.sample()
# Get a sample of tweets in English, containing #analytics"
iterator = twitter_stream.statuses.filter(track="analytics", language="en")

# Print each tweet in the stream to the screen 
# Here we set it to stop after getting 100 tweets. 
# You don't have to set it to stop, but can continue running 
# the Twitter API to collect data for days or even longer. 
tweet_count = 10
file = open("data_analytics_twitter_stream_10tweets.json", "w") 
df = pd.DataFrame()

for tweet in iterator:
    tweet_count -= 1
    # Twitter Python Tool wraps the data returned by Twitter 
    # as a TwitterDictResponse object.
    # We convert it back to the JSON format to print/score
    #print(json.dumps(tweet))  
    file.write(json.dumps(tweet)+"\n")

    # The command below will do pretty printing for JSON data, try it out
    #print(json.dumps(tweet, indent=4))
    
    if len(tweet['entities']['user_mentions'])>0:
        user_mentions = tweet['entities']['user_mentions'][0]['screen_name']
    else:
        user_mentions = 'None'
        
    row = [tweet['user']['screen_name'], tweet['text'], tweet['retweet_count'], user_mentions]
    df = pd.concat([df, pd.Series(row)], axis=1)
       
    if tweet_count <= 0:
        break
        
df = df.T.reset_index(drop=True)
df.columns = ['User ID', 'Text', 'No. Retweet', 'User Retweet']
        
file.close()

In [125]:
df

Unnamed: 0,User ID,Text,No. Retweet,User Retweet
0,the_nayans,#data #lifecycle is an important decision poin...,0,
1,7wData,RT @DTPGaming12: RT @GameAndroidnews: RT @ahme...,0,DTPGaming12
2,FoodmfgUK,RT @Acuvate: The Secret Solutions That Increas...,0,Acuvate
3,fly775471,@Cryptocrackpipe Posted... \nhttps://t.co/a2O0...,0,Cryptocrackpipe
4,prathap041988,"RT @naralokesh: Met Kirk Bresniker, Hewlett Pa...",0,naralokesh
5,Radika_Panjwani,RT @onlinerecruiter: The Rise Of #Content 4.0 ...,0,onlinerecruiter
6,technocom_lb,The data you need w/speed : \nUp to 20.8x fast...,0,
7,dwaguide,"RT @matillion: Are you a #CIO, #DWH expert, #B...",0,matillion
8,alialmiqbali,RT @coindesk: JUST IN: Coinbase has brought Li...,0,coindesk
9,fnechz,RT @SWC_iHewa: During @TheAGRF leaders @Strive...,0,SWC_iHewa
