In [1]:
import requests
import json
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Get data through API
baseURL = "https://newsapi.org/v2/everything?"
total_requests=2
verbose=True

API_KEY='2c4411dde9154e11bf6518a2ea3cf192'
TOPIC='education trend'

URLpost = {'apiKey': API_KEY,
            'q': '+'+TOPIC,
            'sortBy': 'relevancy',
            'totalRequests': 1}

print(baseURL)
# print(URLpost)

#GET DATA FROM API
response = requests.get(baseURL, URLpost) #request data from the server
# print(response.url);  
response = response.json() #extract txt data from request into json

print(json.dumps(response, indent=2))

# #GET TIMESTAMP FOR PULL REQUEST
from datetime import datetime
timestamp = datetime.now().strftime("%Y-%m-%d-H%H-M%M-S%S")

# SAVE TO FILE 
with open(timestamp+'-newapi-raw-data.json', 'w') as outfile:
    json.dump(response, outfile, indent=4)





https://newsapi.org/v2/everything?
{
  "status": "ok",
  "totalResults": 1414,
  "articles": [
    {
      "source": {
        "id": null,
        "name": "Gizmodo.com"
      },
      "author": "Mack DeGeurin",
      "title": "School Surveillance Tools Are Harming Kids and Making It More Difficult to Finish Homework, Report Finds",
      "description": "The pandemic-era shift to remote learning sparked an acceleration of an already growing trend: education technology companies jockeying to sell schools on systems for monitoring students and filtering their internet browsing. Though school administrators will\u2026",
      "url": "https://gizmodo.com/school-surveillance-harm-kids-homework-report-1850854919",
      "urlToImage": "https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_auto,fl_progressive,g_center,h_675,pg_1,q_80,w_1200/40ac37b8e9ad66e1dd71c2cbd86269bc.jpg",
      "publishedAt": "2023-09-23T12:00:00Z",
      "content": "The pandemic-era shift to remote learning sparked

In [2]:
def string_cleaner(input_string):
    try: 
        out=re.sub(r"""
                    [,.;@#?!&$-]+  # Accept one or more copies of punctuation
                    \ *           # plus zero or more copies of a space,
                    """,
                    " ",          # and replace it with a single space
                    input_string, flags=re.VERBOSE)

        #REPLACE SELECT CHARACTERS WITH NOTHING
        out = re.sub('[’.]+', '', input_string)

        #ELIMINATE DUPLICATE WHITESPACES USING WILDCARDS
        out = re.sub(r'\s+', ' ', out)

        #CONVERT TO LOWER CASE
        out=out.lower()
    except:
        print("ERROR")
        out=''
    return out


article_list=response['articles']   #list of dictionaries for each article
article_keys=article_list[0].keys()
print("AVAILABLE KEYS:")
print(article_keys)
index=0
cleaned_data=[];  
for article in article_list:
    tmp=[]
    if(verbose):
        print("#------------------------------------------")
        print("#",index)
        print("#------------------------------------------")

    for key in article_keys:
        if(verbose):
            print("----------------")
            print(key)
            print(article[key])
            print("----------------")

        if(key=='source'):
            src=string_cleaner(article[key]['name'])
            tmp.append(src) 

        if(key=='author'):
            author=string_cleaner(article[key])
            #ERROR CHECK (SOMETIMES AUTHOR IS SAME AS PUBLICATION)
            if(src in author): 
                print(" AUTHOR ERROR:",author);author='NA'
            tmp.append(author)

        if(key=='title'):
            tmp.append(string_cleaner(article[key]))

        # if(key=='description'):
        #     tmp.append(string_cleaner(article[key]))

        # if(key=='content'):
        #     tmp.append(string_cleaner(article[key]))

        if(key=='publishedAt'):
            #DEFINE DATA PATERN FOR RE TO CHECK  .* --> wildcard
            ref = re.compile('.*-.*-.*T.*:.*:.*Z')
            date=article[key]
            if(not ref.match(date)):
                print(" DATE ERROR:",date); date="NA"
            tmp.append(date)

    cleaned_data.append(tmp)
    index+=1


AVAILABLE KEYS:
dict_keys(['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content'])
#------------------------------------------
# 0
#------------------------------------------
----------------
source
{'id': None, 'name': 'Gizmodo.com'}
----------------
----------------
author
Mack DeGeurin
----------------
----------------
title
School Surveillance Tools Are Harming Kids and Making It More Difficult to Finish Homework, Report Finds
----------------
----------------
description
The pandemic-era shift to remote learning sparked an acceleration of an already growing trend: education technology companies jockeying to sell schools on systems for monitoring students and filtering their internet browsing. Though school administrators will…
----------------
----------------
url
https://gizmodo.com/school-surveillance-harm-kids-homework-report-1850854919
----------------
----------------
urlToImage
https://i.kinja-img.com/gawker-media/image/upload/c_fill,f_aut

In [9]:

df = pd.DataFrame(cleaned_data)
print(df)

df.to_csv("cleaned_text_data")

vectorizer=CountVectorizer()

vec =  vectorizer.fit_transform(df[2])   


print("vocabulary = ",vectorizer.vocabulary_)   

print(vec)




                        0                                   1  \
0              gizmodocom                       mack degeurin   
1               readwrite                   meenakshi kalyani   
2        business insider                      beatrice nolan   
3                     gma                      katie kindelan   
4                bbc news      https://wwwfacebookcom/bbcnews   
..                    ...                                 ...   
95               abc news                    nathaniel rakich   
96                 forbes                                  NA   
97  realcleareducationcom  donald nielsen, realcleareducation   
98     americanthinkercom                                       
99                 forbes                                  NA   

                                                    2                     3  
0   school surveillance tools are harming kids and...  2023-09-23T12:00:00Z  
1   top 10 enterprise software development compani...  2023-09-