### Scrape news articles 

In [1]:
import os.path
from os import path
import pandas as pd
import json
import boto3 
import os
from io import StringIO
from summarizer import Summarizer
import pickle
import psycopg2
from sqlalchemy import create_engine

In [2]:
import pprint
import requests

In [3]:
# API key for News API; sent with every request as the `apiKey` query parameter.
# NOTE(review): '<key>' looks like a redacted placeholder - keep the real key
# out of version control.
secret = '<key>'

In [4]:
# S3 credentials and bucket name, used when the bucket handle is created.
# NOTE(review): '<key>' values look like redacted placeholders - keep real
# credentials out of version control.
ACCESS_KEY ='<key>'
SECRET_KEY = '<key>'
BUCKET_NAME ='<key>'

In [5]:
# Database URL handed to SQLAlchemy's create_engine when the results are saved.
# NOTE(review): '<key>' looks like a redacted placeholder - the URL normally
# embeds credentials, so keep the real value out of version control.
jdbcUrl='<key>'

In [6]:
# Bucket handle used for every read and write of the cached article files.
_s3_resource = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)
s3 = _s3_resource.Bucket(BUCKET_NAME)


def _load_json_from_s3(f):
    """Return the parsed JSON content of the bucket object stored under key *f*."""
    body = s3.Object(key=f).get()["Body"]
    return json.load(body)


def _dump_json_to_s3(obj, f):
    """Serialize *obj* with json.dumps and upload it to the bucket under key *f*."""
    payload = json.dumps(obj)
    return s3.Object(key=f).put(Body=payload)


# The rest of the notebook calls these helpers as attributes of the json module.
json.load_s3 = _load_json_from_s3
json.dump_s3 = _dump_json_to_s3

In [7]:
def savePickle(object, filename, protocol = pickle.HIGHEST_PROTOCOL):
    """Pickle *object* into the file at *filename*.

    Args:
        object: Any picklable Python object.
        filename: Path of the file to create or overwrite.
        protocol: Pickle protocol number; defaults to the highest available.
    """
    # The context manager closes the handle even if pickling raises, so the
    # file is flushed and not left open.
    with open(filename, "wb") as handle:
        pickle.dump(object, handle, protocol)

def loadPickle(filename):
    """Return the object unpickled from the file at *filename*.

    Args:
        filename: Path of an existing pickle file.

    Returns:
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a
    # trusted source.
    with open(filename, "rb") as handle:
        return pickle.load(handle)

In [8]:
# News API endpoint that returns the current top headlines; the query
# parameters are supplied separately through `requests.get(url, params=...)`.
url = 'https://newsapi.org/v2/top-headlines?'

In [9]:
# Query parameters shared by every top-headlines request: English-language
# headlines for country US (for now), 100 results per page. The scraping loop
# adds a 'category' entry to this dict before each request.
parameters = {
    'language': 'en',
    'country':'us',
    'pageSize': 100,
    'apiKey': secret 
}

In [10]:
# Categories requested one by one from the top-headlines endpoint; each one
# also names its cached file (`resources/<category>_top_headline_data_new.json`).
categories = ['business','entertainment','general','health','science','sports','technology']

In [11]:
from newspaper import Article
from newspaper import Config

# Module-level containers initialised before the scraping loop runs.
# NOTE(review): several of these do not appear to be referenced by the other
# cells shown - confirm before relying on or removing them.
article_title = []
article_authors = []
article_text = []
article_summary = []
article_date = []
article_top_image = []
failed_url = []
category_articles ={}
pages= range(1,2)

In [None]:
# Build the summarizer once; the scraping loop calls it for every downloaded
# article as `model(text, min_length=60)`.
model = Summarizer()

In [None]:
# Refresh the per-category article caches on S3 and build `complete_dataframe`.
#
# For every category this cell:
#   1. loads the articles previously saved for that category from S3,
#   2. asks the News API for the current top headlines,
#   3. downloads, parses and summarizes every article whose URL is not cached,
#   4. writes the merged, de-duplicated category frame back to S3.
# The per-category frames are concatenated into `complete_dataframe` at the end.
count = 0  # newly downloaded articles across all categories

# One newspaper configuration is reused for every download.
article_config = Config()
article_config.verbose = True
article_config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

# Response-level fields and the raw `content` field are not copied per article.
skipped_columns = ('status', 'totalResults', 'content')

category_frames = []
for category in categories:
    print(category)
    fileName = f'resources/{category}_top_headline_data_new.json'
    json_buffer = json.load_s3('Project3/' + fileName)
    # The cache is stored as JSON text (see the save step at the end of this
    # loop); wrap it in StringIO because passing literal JSON to read_json is
    # deprecated in recent pandas.
    category_dataframe = pd.read_json(StringIO(json_buffer)).T

    # URLs already cached for this category. An empty cache has no 'url' column.
    if 'url' in category_dataframe:
        known_urls = set(category_dataframe['url'])
    else:
        known_urls = set()

    parameters['category'] = category
    response = requests.get(url, params=parameters)
    if response.status_code != requests.codes.ok:
        print(f"Bad result : {response.url}")
        # Keep the cached articles in the combined frame even though no new
        # ones could be fetched for this category.
        category_frames.append(category_dataframe)
        continue
    response_json = response.json()

    # One row per article: response-level columns plus the expanded article dict.
    df = pd.DataFrame.from_dict(response_json)
    df = pd.concat([df.drop(['articles'], axis=1), df['articles'].apply(pd.Series)], axis=1)

    rows = []
    for _, row in df.iterrows():
        h = row['url']
        # skip record if we already downloaded article
        if h in known_urls:
            continue
        newsarticle = Article(h, config=article_config)
        try:
            newsarticle.download()
            newsarticle.parse()
            if not newsarticle.text:
                print(f"Unable to parse article: {h}")
                continue
            artdict = {}
            artdict['articleText'] = newsarticle.text
            # Get article Summary; an empty string means none was generated.
            summaryResult = model(newsarticle.text, min_length=60)
            artdict['articleSummary'] = ''.join(summaryResult)
            artdict['articleSentiment'] = ''  # Place Holder for calculated Sentiment Analysis
            # The combined frame is later exported with a 'category' column, so
            # record the category this article was fetched under.
            # NOTE(review): confirm this matches the values already stored in
            # the cached files.
            artdict['category'] = category
            for column in df:
                if column in skipped_columns:
                    continue
                artdict[column] = row[column]
            rows.append(artdict)
            count += 1
            if count % 50 == 0:
                print('Number of articles: ' + str(count))
        except Exception as exc:
            # Best effort per article: report it, remember the URL, move on.
            print(f'***FAILED TO DOWNLOAD***{newsarticle.url}')
            print(f'    reason: {exc!r}')
            failed_url.append(h)

    print(f"Using DataFrame {category_dataframe.shape} Adding {len(rows)}")
    if rows:
        category_dataframe = pd.concat(
            [category_dataframe, pd.DataFrame(rows)], ignore_index=True
        ).drop_duplicates(subset='url')
    category_frames.append(category_dataframe)

    # Persist the refreshed category cache back to S3.
    json_buffer = StringIO()
    category_dataframe.to_json(json_buffer, orient='index')
    json.dump_s3(json_buffer.getvalue(), 'Project3/' + fileName)

print('Total number of articles: ' + str(count))
if category_frames:
    complete_dataframe = pd.concat(category_frames, ignore_index=True)
else:
    complete_dataframe = pd.DataFrame()
print(complete_dataframe.shape)
    

In [None]:
# Show (rows, columns) of the combined frame at this point in the notebook.
print(complete_dataframe.shape)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC 
import gensim
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

In [None]:
# Preview the first few article summaries before any text cleaning.
complete_dataframe['articleSummary'].head()

In [None]:
# Extra words stripped from the summaries on top of gensim's stop words.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated (a missing comma used to fuse 'year' and 'it').
# A few entries are repeated; that is harmless for the regex built from them.
remove_words = ['i', 'your', 'you', 'on', 'with', 'and', 'have', 'the', 'to', 'in', 'for', 'that', 'had', 'be', 'a', 'year',
               'it', 'may', 'one', 'as', 'if', 'is', 'via', 'this', 'will', 'david', 'jenni', 'im', 'susan', 'it', 'up',
               'angelica', 'hi', 'hello', 'we', 'our', 'all', 'kelli', 'yes', 'gavin', 'our', 'were', 'of', 'can', 'at',
               'any', 'by', 'also', 'joe', 'ronnie', 'morning', 'evening', 'good', 'what', 'okay', 'ok', 'are', 'us', 'my',
               'th', 'st', 'nd', 'rd', 'was', 'there', 'then', 'lee', 'out', 'or', 'so','alan', 'from','unfortunately',
               'alason', 'but', 'youre', 'does', 'heres', 'little', 'more', 'set', 'br', 'dr', 'ave', 'here', 'about', 'an',
               'let', 'know', 'than', 'then', 'no', 'why', 'way', 'every', 'thats', 'taken', 'today', 'way', 'id', 'isnt',
               'only', 'bay', 'me', 'when', 'want', 'end', 'month', 'do', 'til', 'get', 'back', 'thanks', 'bonnie', 'woodal',
               'off', 'drive', 'michell','and', 'he','she', 'her','just']

In [None]:
# Show (rows, columns) of the combined frame at this point in the notebook.
print(complete_dataframe.shape)

In [None]:
import re
import string

# Patterns are raw strings (no invalid-escape warnings) and compiled once.
_BRACKETED_TEXT = re.compile(r'\[.*?\]')
_ASCII_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
_WORDS_WITH_DIGITS = re.compile(r'\w*\d\w*')
_NON_WORD_NON_SPACE = re.compile(r'[^\w\s]')


def clean_text_round(text):
    """Return *text* lower-cased and stripped of brackets, punctuation and digits.

    The passes run in this order:
      1. convert to ``str`` and lower-case (so ``None`` becomes ``'none'``),
      2. drop ``[...]`` segments,
      3. drop ASCII punctuation,
      4. drop every word that contains a digit,
      5. drop newline characters (adjacent lines are joined without a space),
      6. drop any remaining character that is neither a word character nor
         whitespace (e.g. typographic quotes, dashes, ellipses).

    Two passes of the earlier version are omitted because they could never
    match: one removed ``"`` and ``.`` after step 3 had already removed them,
    the other removed digit runs after step 4 had already removed every digit.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned string. Whitespace between removed tokens is kept.
    """
    text = str(text).lower()
    text = _BRACKETED_TEXT.sub('', text)
    text = _ASCII_PUNCTUATION.sub('', text)
    text = _WORDS_WITH_DIGITS.sub('', text)
    text = text.replace('\n', '')
    text = _NON_WORD_NON_SPACE.sub('', text)
    return text

# Remove digit runs from the raw summaries. `regex=True` is explicit because
# pandas >= 2.0 treats the pattern as a literal string by default.
complete_dataframe['articleSummary'] = complete_dataframe['articleSummary'].str.replace(r'\d+', '', regex=True) # remove numbers
# Kept as a module-level alias for the cleaning function.
get_cleaning = clean_text_round
# Apply the full cleaning pass to every summary; the result is already a
# Series, so it is assigned to the column directly.
complete_dataframe['articleSummary'] = complete_dataframe['articleSummary'].apply(get_cleaning)


In [None]:
# Strip gensim's built-in stop words from every cleaned summary.
complete_dataframe['articleSummary'] = complete_dataframe['articleSummary'].apply(remove_stopwords)


In [None]:
# Show (rows, columns) of the combined frame at this point in the notebook.
print(complete_dataframe.shape)

In [None]:
def apply_remove_words_to_texts(frame=None, words=None):
    """Add an ``articleSummary_new`` column with the given whole words removed.

    Args:
        frame: DataFrame with an ``articleSummary`` text column. Defaults to
            the notebook-level ``complete_dataframe``.
        words: Iterable of words to delete. Defaults to the notebook-level
            ``remove_words`` list.

    Returns:
        The same DataFrame object, modified in place. The new column holds the
        summary with each listed word removed where it appears as a whole
        word, then stripped of leading/trailing whitespace. Inner whitespace
        left behind by removed words is kept.
    """
    import re  # local import so the function does not depend on cell order

    if frame is None:
        frame = complete_dataframe
    if words is None:
        words = remove_words

    # Escape each word so it is matched literally inside the alternation.
    pat = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in words))
    # regex=True is required: pandas >= 2.0 defaults to a literal replacement,
    # which would leave every summary untouched.
    cleaned = frame['articleSummary'].str.replace(pat, '', regex=True)
    frame["articleSummary_new"] = cleaned.str.strip()
    return frame
# Build the `articleSummary_new` column on the combined frame.
complete_dataframe = apply_remove_words_to_texts()

In [None]:
# Show (rows, columns) of the combined frame at this point in the notebook.
print(complete_dataframe.shape)

In [None]:
# Recompute `articleSummary_new` from `articleSummary` (same call as in the
# cell that defines the function) and preview the first few results.
complete_dataframe = apply_remove_words_to_texts()
complete_dataframe.articleSummary_new.head()

In [None]:
# TF-IDF settings kept for reference only.
# NOTE(review): presumably these are the settings the pickled vectorizer was
# fitted with - confirm.
# vectorizer = TfidfVectorizer(min_df = 5,
#                              max_df = 0.8,
#                              sublinear_tf = True,
#                              use_idf = True)

# Load the previously saved vectorizer instead of fitting a new one.
vectorizer = loadPickle('final_model/vectorizer.sav')

# Vectorize the cleaned summaries; .toarray() converts the result to a dense array.
news_vectors = vectorizer.transform(complete_dataframe['articleSummary_new']).toarray()


In [None]:
# Display the dimensions of the vectorized summaries.
news_vectors.shape

In [None]:
# Load the pickled classifier used to predict a sentiment label per article.
# NOTE(review): the file name suggests an SVM - confirm the model type.
svm_model = loadPickle('final_model/gensim_svm_model.sav')

In [None]:
# Predict one label per row of `news_vectors`.
predicted = svm_model.predict(news_vectors)

In [None]:
# Store the predictions, replacing the empty placeholder set during scraping.
complete_dataframe['articleSentiment']=predicted

In [None]:
# Replace each News API source dict with its 'name' entry. Values that are no
# longer dicts are left untouched, so re-running this cell does not raise.
complete_dataframe['source'] = complete_dataframe['source'].apply(
    lambda source: source['name'] if isinstance(source, dict) else source
)

In [None]:
# Preview the first rows of the combined frame.
complete_dataframe.head()

In [None]:
# Convert the `publishedAt` column to pandas datetimes before it is exported.
complete_dataframe['publishedAt']= pd.to_datetime(complete_dataframe['publishedAt'])

# Save data to PostgreSQL

In [None]:
# Lower-case the mixed-case column names; the export below selects the
# columns by these lower-case names.
_lowercase_column_names = {
    'urlToImage': 'urltoimage',
    'publishedAt': 'publishedat',
    'articleSummary': 'articlesummary',
    'articleSentiment': 'articlesentiment',
}
complete_dataframe.rename(columns=_lowercase_column_names, inplace=True)

In [None]:
# Create the SQLAlchemy engine (pooled connections recycled after an hour)
# and open the connection used by the export below.
alchemyEngine = create_engine(jdbcUrl, pool_recycle=3600)
postgreSQLConnection = alchemyEngine.connect()

In [None]:
# Write the selected columns to the database, replacing the table if it exists.
postgreSQLTable = "sentiment_results"
_export_columns = [
    'author', 'title', 'description', 'url',
    'urltoimage', 'publishedat', 'articlesummary', 'articlesentiment',
    'category', 'source',
]
_export_frame = complete_dataframe[_export_columns]
# `frame` receives whatever to_sql returns; it is not a DataFrame.
frame = _export_frame.to_sql(
    postgreSQLTable,
    postgreSQLConnection,
    if_exists='replace',
)

In [None]:
# Upload the combined frame to S3 as index-oriented JSON text.
fileName = 'resources/all_top_headline_data_new.json'
json_buffer = StringIO()
json_buffer.write(complete_dataframe.to_json(orient='index'))
json.dump_s3(json_buffer.getvalue(), f'Project3/{fileName}')


In [None]:
# Also write the combined frame to a JSON file in the working directory.
complete_dataframe.to_json("all_top_headline_data_new.json", orient='index')
