# Solution: Extension Task 1 & 2

In [9]:
# Import Libraries
import requests
import feedparser
from bs4 import BeautifulSoup
import json
import pandas as pd
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream

## Task 01:

In [10]:
# Function to get data from HTML of IrishTimes
def getArticleDetailsByUrl(url, timeout=30):
    """Download an article page and extract its headline, summary and body.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up
        (default 30). Without it a stalled server would block the notebook
        indefinitely.

    Returns
    -------
    list
        ``[headline, subheadline, first_sentence, doc_body, source]``:

        * ``headline`` -- string of the page ``<title>`` tag, or ``None``
          when the page has no title.
        * ``subheadline`` -- ``content`` attribute of the
          ``<meta name="description">`` tag inside ``<head>``, or ``None``
          when it is missing.
        * ``first_sentence`` -- ``doc_body`` up to (not including) its
          first ``"."``.
        * ``doc_body`` -- text of every ``<p class="no_name">`` inside the
          ``<article>`` tag, each followed by one space. Empty unless the
          page text contains "The Irish Times" and an ``<article>`` exists.
        * ``source`` -- ``"IrishTimes"`` when ``url`` contains
          "irishtimes", otherwise ``"Other"``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    # Any of these tags may be missing on a non-article page; fall back to
    # None instead of raising AttributeError on a None tag.
    title_tag = soup.title
    headline = title_tag.string if title_tag is not None else None

    head_tag = soup.head
    description_tag = (
        head_tag.find("meta", attrs={"name": "description"})
        if head_tag is not None else None
    )
    subheadline = description_tag.get('content') if description_tag is not None else None

    # Body paragraphs are only collected for pages that identify themselves
    # as Irish Times content and actually contain an <article> element.
    doc_body = ''
    article_tag = soup.article
    if article_tag is not None and "The Irish Times" in soup.text:
        for body_p_tag in article_tag.find_all("p", attrs={"class": "no_name"}):
            doc_body += body_p_tag.get_text() + " "

    # The source label depends on the URL only.
    source = "IrishTimes" if "irishtimes" in url else "Other"

    first_sentence = doc_body.split(".")[0]

    return [headline, subheadline, first_sentence, doc_body, source]

In [11]:
# Parse the Irish Times RSS feed
RSSfeed = feedparser.parse('http://www.irishtimes.com/cmlink/news-1.1319192')

# Scrape every article linked from the feed and collect one record per article.
# Records are gathered in a list and turned into a dataframe once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame per row.
article_records = []
for indx, item in enumerate(RSSfeed['entries'], start=1):  # Article_ID starts at 1
    article_url = item['link']
    headline, subheadline, first_sentence, doc_body, source = getArticleDetailsByUrl(article_url)
    article_records.append({'Article_ID': indx, 'Title': headline, 'Content': doc_body})

# Build the dataframe and use Article_ID as its index.
# Passing `columns` keeps the schema even when the feed has no entries;
# avoiding inplace=True keeps this cell safe to re-run.
df1 = pd.DataFrame(article_records, columns=['Article_ID', 'Title', 'Content']).set_index('Article_ID')
df1

Unnamed: 0_level_0,Title,Content
Article_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,TG4 apologises after CGI clip shows caravan ki...,TG4 has apologised after a computer-generated ...
2,Armed man commits robbery at Killiney Shopping...,A robbery took place at Killiney Shopping Cent...
3,Fisherman in court charged with murdering woma...,A 48-year-old man appeared in court in Dundalk...
4,"Detailed backstop proposals urgently needed, V...",The UK needs to put detailed proposals on the ...
5,Decision to prosecute woman over abortion pill...,The high court in Belfast is set to hear a le...
6,City vacant: Dublin’s hundreds of multi-millio...,Almost 200 sites and properties worth millions...
7,Potentially ‘nasty’ storm may hit Ireland this...,Met Eireann is monitoring a weather front whic...
8,"Bunny Carr, former RTÉ presenter, dies aged 91","Bunny Carr, the former RTÉ presenter and found..."
9,Dundalk murder: Man (48) to appear in court ov...,A man (48) is due to appear at Dundalk Distric...
10,The dodgy science of wiping your backside,Science is now a massive and expensive interna...


In [13]:
# Inspect the untruncated title and content of the first scraped article
first_article = df1.iloc[0]
print('Title of first news: \n', first_article.Title)
print('Content of first news: \n', first_article.Content)

Title of first news: 
 TG4 apologises after CGI clip shows caravan kicked off cliff
Content of first news: 
 TG4 has apologised after a computer-generated imagery (CGI) clip broadcast before a news bulletin on Wednesday showed a caravan being kicked off a cliff into the sea.  On Wednesday a woman in her 50s was killed during Storm Ali, when a caravan she was staying in was swept off the coast by strong winds.  The woman, who has been named as Elvira Ferraii from Switzerland, was staying on her own in the caravan at the Clifden Eco Beach Camping and Caravan Park near Claddaghduff, Co Galway, when the incident occurred at about 7.45am. On Wednesday a TG4 clip depicted a scene of a giant creature carrying a caravan, before kicking it into the sea. The clip was shown before the 7pm Nuacht TG4 bulletin, which led with the news of Ms Ferraii’s death.  TG4 apologised for the clip, which it said ran on air only once on Wednesday.  “The ident [a broadcasting term which is short for station iden

## Task 02:

In [107]:
def getTweetData(filename):
    """Load a file of line-delimited tweet JSON into a dataframe.

    Parameters
    ----------
    filename : str
        Path to a text file holding one JSON-encoded tweet per line.
        Blank lines are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, in file order, with the columns

        * ``User_ID`` -- ``tweet['user']['id']``
        * ``User_name`` -- ``tweet['user']['screen_name']``
        * ``Content`` -- ``tweet['text']``
        * ``No_Retweets`` -- ``tweet['retweet_count']``
        * ``User_Retweet`` -- screen name of the retweeted user; for a
          quote tweet the quoted user's screen name prefixed with
          ``" \\nQ: "``; ``'<None>'`` when the tweet is neither.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a tweet lacks one of the fields read above.
    """
    columns = ['User_ID', 'User_name', 'Content', 'No_Retweets', 'User_Retweet']
    records = []

    # Tweets routinely contain emoji, so read as UTF-8 instead of relying on
    # the platform default encoding.
    with open(filename, encoding='utf-8') as file:
        for line in file:  # stream line by line rather than readlines()
            if not line.strip():
                continue  # tolerate blank/trailing lines
            single_tweet = json.loads(line)

            # A retweet takes precedence over a quote when both keys exist.
            if 'retweeted_status' in single_tweet:
                user_retweet = single_tweet['retweeted_status']['user']['screen_name']
            elif 'quoted_status' in single_tweet:
                user_retweet = " \nQ: " + single_tweet['quoted_status']['user']['screen_name']
            else:
                user_retweet = '<None>'

            records.append({'User_ID': single_tweet['user']['id'],
                            'User_name': single_tweet['user']['screen_name'],
                            'Content': single_tweet['text'],
                            'No_Retweets': single_tweet['retweet_count'],
                            'User_Retweet': user_retweet})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every row. `columns` keeps the schema for
    # an empty file.
    return pd.DataFrame(records, columns=columns)

In [108]:
# Smoke-test getTweetData on the search sample ('twitter_search_100tweets.json')
# and preview the first ten parsed tweets
search_tweets_path = "twitter_search_100tweets.json"
df2 = getTweetData(search_tweets_path)
df2.iloc[:10]

Unnamed: 0,User_ID,User_name,Content,No_Retweets,User_Retweet
0,417482440,Pvalsfr,RT @gp_pulipaka: Time Series Forecasting Using...,29,gp_pulipaka
1,981606042034372608,GDPR_for_SME,RT @HarbRimah: Cybersecurity and Small Busines...,14,HarbRimah
2,138054648,AaronNahale,RT @kashthefuturist: Chinese police are using ...,145,kashthefuturist
3,930871517860319232,TheBigDataBot,RT @alison_iot: 🔭A #ContentMarketing Planning ...,9,alison_iot
4,138054648,AaronNahale,RT @KirkDBorne: 50 Shades of Data:\n#AI\n#Anal...,39,KirkDBorne
5,138054648,AaronNahale,RT @Fisher85M: Growth in the Internet of Thing...,10,Fisher85M
6,772361408956731392,Manifattura40,RT @InsightBrief: What #DataScientists really ...,1,InsightBrief
7,769301256204541952,ThriveAmbition,True Cost of Attrition: Why Good People Leave ...,0,<None>
8,260819964,freetoopt,RT @Ronald_vanLoon: 60+ Free Books on #BigData...,5,Ronald_vanLoon
9,1014177978215886848,AppliedAI1,RT @Fisher85M: The Fields of #ArtificialIntell...,23,Fisher85M


In [109]:
# Smoke-test getTweetData on the streaming sample
# ('data_analytics_twitter_stream_10tweets.json') and preview its rows
stream_tweets_path = "data_analytics_twitter_stream_10tweets.json"
df3 = getTweetData(stream_tweets_path)
df3.iloc[:10]

Unnamed: 0,User_ID,User_name,Content,No_Retweets,User_Retweet
0,417482440,Pvalsfr,RT @gp_pulipaka: Hybrid SDN of Industrial Inte...,0,gp_pulipaka
1,922987253286297600,gemini_finance,How to get hired as a #datascientist - be proa...,0,<None>
2,823520005706883072,Max_Lyashko,RT @schmarzo: #EdgeAnalytics with #IoT environ...,0,schmarzo
3,973110160163033088,Pain4200,RT @vg_fco: Harvard Business Review : The Key ...,0,vg_fco
4,206386815,supplychain,The two are designing a new all-in-one invento...,0,<None>
5,823520005706883072,Max_Lyashko,RT @Fisher85M: Growth in the Internet of Thing...,0,Fisher85M
6,26833196,rlingle,Sharing trends in Big Data and Data Analytics ...,0,<None>
7,64869385,Randumb_Rants,"Simple, stupid ass manager + stupid trade dead...",0,\nQ: phillysport
8,34377314,dorbendov,Digital transformation: Amdocs Revenue Guard t...,0,<None>
9,919744697551253505,akdm_bot,RT @IBMSPSS: #ICYMI: 💪 💥 Discover this comic b...,0,IBMSPSS
