## The urllin package
- Provides interface for fetching data across the web
- urlopen(): Accepts URLs instead of file names

In [5]:
import pandas as pd

In [6]:
from urllib.request import urlretrieve
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
urlretrieve(url, 'winequality-red.csv')
df = pd.read_csv('winequality-red.csv', sep=';')
print(df.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

## Opening and reading flat files from the web

In [8]:
"""
If you just wanted to load a file from the web into a DataFrame without first saving it locally, 
you can do that easily using pandas. 
In particular, you can use the function pd.read_csv() with the URL as the first argument 
    and the separator sep as the second argument.
"""

# Import packages
import matplotlib.pyplot as plt
import pandas as pd

# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'

# Read file into a DataFrame: df
df = pd.read_csv(url, sep =';')

# Print the head of the DataFrame
print(df.head())

# Plot first column of df
pd.DataFrame.hist(df.ix[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

AttributeError: 'DataFrame' object has no attribute 'ix'

## Importing non-flat files from the web

In [7]:
# Import package
import pandas as pd

# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'

# Read in all sheets of Excel file: xls
xls = pd.read_excel(url,sheet_name = None)

# Print the sheetnames to the shell
print(xls.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xls['1700'].head())

dict_keys(['1700', '1900'])
                 country       1700
0            Afghanistan  34.565000
1  Akrotiri and Dhekelia  34.616667
2                Albania  41.312000
3                Algeria  36.720000
4         American Samoa -14.307000


## GET requests using urllib

In [None]:
# Import packages
from urllib.request import urlopen, Request 

# Specify the url
url = "https://www.wikipedia.org/"

# This packages the request: request
request = Request(url)

# Sends the request and catches the response: response
response = urlopen(request)
html = response.read()

# Be polite and close the response!
response.close()

## GET requests using requests

In [None]:
import requests
url = "http://www.wikipedia.org/"

# Packages the request, send the request and catch the response: r
r = requests.get(url)

# Use the text attribute of the object r to return the HTML of the webpage as a string; 
    # store the result in a variable text.
text = r.text # Turns HTML to string

## Scarping the web in Python - BeautifulSoup

In [9]:
from bs4 import BeautifulSoup
import requests

# Specify url: url
url = 'https://www.crummy.com/software/BeautifulSoup/'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Use the text attribute of the object r to return the HTML of the webpage as a string
html_doc = r.text

# Create a BeautifulSoup object from the HTML
soup = BeautifulSoup(html_doc)

# Prettify the BeautifulSoup object
print(soup.prettify())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   Beautiful Soup: We called him Tortoise because he taught us.
  </title>
  <link href="mailto:leonardr@segfault.org" rev="made"/>
  <link href="/nb/themes/Default/nb.css" rel="stylesheet" type="text/css"/>
  <meta content="Beautiful Soup: a library designed for screen-scraping HTML and XML." name="Description"/>
  <meta content="Markov Approximation 1.4 (module: leonardr)" name="generator"/>
  <meta content="Leonard Richardson" name="author"/>
 </head>
 <body alink="red" bgcolor="white" link="blue" text="black" vlink="660066">
  <style>
   #tidelift { }

#tidelift a {
 border: 1px solid #666666;
 margin-left: auto;
 padding: 10px;
 text-decoration: none;
}

#tidelift .cta {
 background: url("tidelift.svg") no-repeat;
 padding-left: 30px;
}
  </style>
  <img align="right" src="1

## Exploring BeautifulSoup

In [10]:
print(soup.title)

<title>Beautiful Soup: We called him Tortoise because he taught us.</title>


In [11]:
print(soup.get_text())




Beautiful Soup: We called him Tortoise because he taught us.









[ Download | Documentation | Hall of Fame | For enterprise | Source | Changelog | Discussion group  | Zine ]

Beautiful Soup

You didn't write that awful page. You're just trying to get some
data out of it. Beautiful Soup is here to help. Since 2004, it's been
saving programmers hours or days of work on quick-turnaround
screen scraping projects.
Beautiful Soup is a Python library designed for quick turnaround
projects like screen-scraping. Three features make it powerful:


Beautiful Soup provides a few simple methods and Pythonic idioms
for navigating, searching, and modifying a parse tree: a toolkit for
dissecting a document and extracting what you need. It doesn't take
much code to write an application

Beautiful Soup automatically converts incoming documents to
Unicode and outgoing documents to UTF-8. You don't have to think
about encodings, unless the document doesn't specify an encoding and
Beautiful Soup ca

In [None]:
"""
Use the method find_all() to find all hyperlinks in soup, 
    remembering that hyperlinks are defined by the HTML tag <a> but passed to find_all() without angle brackets; 
    store the result in the variable a_tags.
"""

# Find all 'a' tags (which define hyperlinks): a_tags
a_tags = soup.find_all('a')

# Print the URLs to the shell
for link in a_tags:
    print(link.get('href'))

## APIs
 - Application Programming Interface
 - Protocols and routines: Building and interacting with software applications

In [None]:
## Connecting to an API in python 

import requests
url = ' http://www.omdbapi.com/?t=hackers'

# Pass the variable url to the requests.get() function in order to send the relevant request and catch the response, 
    # assigning the resultant response message to the variable r.
r = requests.get(url)

# Apply the json() method to the response object r and store the resulting dictionary in the variable json_data.
json_data = r.json()

for key, value in json_data.items():
    print(key + ':', value)
    
"""
What was that URL?
1. http: making an HTTP request 
2. www.omdbapi.com: quering the OMDB API
3. ?t=hackers: 
    - Query string
    - Return data for a movie with title(t) 'Hackers'
"""

In [None]:
"""
Assign to the variable url the URL of interest in order to query 'http://www.omdbapi.com' 
    for the data corresponding to the movie The Social Network. 
The query string should have two arguments: apikey=72bc447a and t=the+social+network. 
You can combine them as follows: apikey=72bc447a&t=the+social+network.
"""

# Import requests package
import requests

# Assign URL to variable: url
url = 'http://www.omdbapi.com/?apikey=72bc447a&t=the+social+network'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Print the text of the response
print(r.text)

In [None]:
#### Checking out the Wikipedia API

# Import package
import requests

# Assign URL to variable: url
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print the Wikipedia page extract
pizza_extract = json_data['query']['pages']['24768']['extract']
print(pizza_extract)

## Loading JSONs in python


In [None]:
"""
Load the JSON 'a_movie.json' into the variable json_data within the context provided by the with statement. 
To do so, use the function json.load() within the context manager.
"""
import json
with open('a_movie.json', 'r') as json_file:
    json_data = json.load(json_file) # Will store as dict

In [None]:
for key, value in json_data.items():
    print(key + ':', value)

In [None]:
# Print each key-value pair in json_data
for k in json_data.keys():
    print(k + ': ', json_data[k])

"""
Recall that you can access a value in a dictionary using the syntax: dictionary[key].
"""

## The twitter API and Authentication

In [None]:
import tweepy, json

"""
The package tweepy is great at handling all the Twitter API OAuth Authentication details for you. 
All you need to do is pass it your authentication credentials. 
"""

# Import package
import tweepy

# Store OAuth authentication credentials in relevant variables
access_token = "1092294848-aHN7DcRP9B4VMTQIhwqOYiB14YkW92fFO8k8EPy"
access_token_secret = "X4dHmhPfaksHcQ7SCbmZa2oYBBVSD2g8uIHXsp5CTaksx"
consumer_key = "nZ6EA0FxZ293SxGNg8g8aP0HM"
consumer_secret = "fJGEodwe3KiKUnsYJC3VRndj7jevVvXbK2D5EiJ2nehafRgA6i"

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [None]:
## Streaming tweets - create the Streamobject and to filter tweets according to particular keywords

# Initialize Stream listener
l = MyStreamListener()

# Create your Stream object with authentication
stream = tweepy.Stream(auth, l)

# Filter Twitter Streams to capture data by the keywords:
stream.filter(['clinton', 'trump', 'sanders','cruz'])

In [None]:
## Load and explore your Twitter data

"""
Now that you've got your Twitter data sitting locally in a text file, it's time to explore it! 
This is what you'll do in the next few interactive exercises. 
In this exercise, you'll read the Twitter data into a list: tweets_data.

Be aware that this is real data from Twitter and as such there is always a risk 
    that it may contain profanity or other offensive content 
(in this exercise, and any following exercises that also use real Twitter data).
"""

# Import package
import json

# String of path to file: tweets_data_path
tweets_data_path = 'tweets.txt'

# Initialize empty list to store tweets: tweets_data
tweets_data = []

# Open connection to file
tweets_file = open(tweets_data_path, "r")

# Read in tweets and store in list: tweets_data
for line in tweets_file:
    tweet = json.loads(line)
    tweets_data.append(tweet)

# Close connection to file
tweets_file.close()

# Print the keys of the first tweet dict
print(tweets_data[0].keys())

In [None]:
## Twitter data to DataFrame

# Import package
import pandas as pd

# Build DataFrame of tweet texts and languages
df = pd.DataFrame(tweets_data, columns=['text','lang'])

# Print head of DataFrame
print(df.head())

In [None]:
## A little bit of Twitter text analysis

"""
Now that you have your DataFrame of tweets set up, 
    you're going to do a bit of text analysis to count how many tweets contain the words 
        'clinton', 'trump', 'sanders' and 'cruz'. 
In the pre-exercise code, we have defined the following function word_in_text(), 
    which will tell you whether the first argument (a word) occurs within the 2nd argument (a tweet).

You're going to iterate over the rows of the DataFrame and calculate how many tweets contain each of our keywords! 
The list of objects for each candidate has been initialized to 0.
"""


import re

def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)

    if match:
        return True
    return False


# Initialize list to store tweet counts
[clinton, trump, sanders, cruz] = [0, 0, 0, 0]

# Iterate through df, counting the number of tweets in which
# each candidate is mentioned
for index, row in df.iterrows():
    clinton += word_in_text('clinton', row['text'])
    trump += word_in_text('trump', row['text'])
    sanders += word_in_text('sanders', row['text'])
    cruz += word_in_text('cruz', row['text'])

In [None]:
## Plotting your Twitter data

# Import packages
import matplotlib.pyplot as plt 
import seaborn as sns


# Set seaborn style
sns.set(color_codes=True)

# Create a list of labels:cd
cd = ['clinton', 'trump', 'sanders', 'cruz']

# Plot the bar chart
ax = sns.barplot(cd, [clinton, trump, sanders, cruz])
ax.set(ylabel="count")
plt.show()