# Import Data From The Web

In this project, we will import files from the web. In the first part, we will use the wine-quality data from DataCamp, which is hosted on Amazon S3.

In [None]:
from urllib.request import urlretrieve
import pandas as pd

# Download the red-wine quality CSV into the working directory, then load the
# local copy. The file uses ';' as its field separator.
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'
local_csv = 'winequality-red.csv'
urlretrieve(url, local_csv)
df = pd.read_csv(local_csv, sep=';')
print(df.head())

In the first step, we retrieved a flat file from the web and saved it to our local directory. What if we only need to load the data into Python without saving it? We can do that with pandas: the URL is already stored in the `url` variable from the previous step, so we can pass it directly to the `read_csv` function.

In [None]:
import matplotlib.pyplot as plt
# Load the wine data straight from the URL held in `url`; no local copy is
# written this time.
df = pd.read_csv(url, sep=';')
print(df.head())

# Histogram of the first column only. `.iloc` is used for the positional
# slice because the old `.ix` indexer was deprecated in pandas 0.20 and
# removed in pandas 1.0.
df.iloc[:, 0:1].hist()
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

We can also read an Excel file from the web. In the next part, we will do this using pandas and an Excel file from DataCamp hosted on Amazon S3. Because we pass `sheet_name=None`, the Excel file is read as a Python dictionary with the sheet names as keys and the corresponding tables as DataFrames.

In [None]:
# sheet_name=None makes read_excel load every sheet and return a dict that
# maps each sheet name to its DataFrame.
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'
xl = pd.read_excel(io=url, sheet_name=None)
print(xl.keys())

# Preview the sheet named '1700'.
first_rows_1700 = xl['1700'].head()
print(first_rows_1700)

## Import Data From The Web HTML 

In [None]:
from urllib.request import urlopen, Request
# Wrap the URL in a Request object, send it, and show what kind of object
# urlopen hands back.
url = 'http://www.datacamp.com/teach/documentation'
request = Request(url=url)
response = urlopen(url=request)
response_type = type(response)
print(response_type)

In [None]:
# Read the whole response body. Using the response as a context manager
# guarantees it is closed even if read() raises, so the connection is not
# leaked.
with response:
    html = response.read()
print(html)

In [None]:
import requests
# Fetch the same page with requests and print the body as decoded text.
url = 'http://www.datacamp.com/teach/documentation'
r = requests.get(url=url)
text = r.text
print(text)

## Import HTML data using BeautifulSoup 

In [None]:
import requests
from bs4 import BeautifulSoup
# Download the page and hand its HTML to BeautifulSoup, using the lxml parser.
url = 'https://www.python.org/~guido/'
r = requests.get(url=url)
html_doc = r.text
soup = BeautifulSoup(markup=html_doc, features='lxml')

# prettify() renders the parse tree as an indented string.
pretty_soup = soup.prettify()
print(pretty_soup)

In [None]:
# The page's <title> element, and the document text with the markup stripped.
guido_title = soup.title
guido_text = soup.get_text()

print(guido_title)
print(guido_text)

In [None]:
# Collect every <a> element and print its href attribute
# (None is printed for an anchor without one).
a_tags = soup.find_all(name='a')
for anchor in a_tags:
    print(anchor.get('href'))

## JSON File 

In [None]:
import json
# Load the JSON document. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open('data.json', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Show the top-level keys first, then the full parsed structure.
print(json_data.keys())
print(json_data)

# API (Application Programming Interface)

In [None]:
import requests
# Query the OMDb API by title. The ???????? in the URL is a placeholder for
# a real API key.
url = 'http://www.omdbapi.com/?apikey=????????&t=the+social+network'
r = requests.get(url=url)
raw_body = r.text
print(raw_body)

In [None]:
# Decode the JSON body and print every field as "key: value".
json_data = r.json()
for key, value in json_data.items():
    print(key + ':', value)

In [None]:
# Ask the Wikipedia API for the intro extract of the "pizza" article.
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'
r = requests.get(url=url)
json_data = r.json()

# Drill down to the page entry; '24768' is presumably the page id of the
# pizza article in the response -- verify if the query title changes.
pages = json_data['query']['pages']
pizza_extract = pages['24768']['extract']
print(pizza_extract)

In [None]:
import tweepy
# OAuth credentials; the "-" values are placeholders for real ones.
consumer_key = "-"
consumer_secret = "-"
access_token = "-"
access_token_secret = "-"

# Build the OAuth handler from the consumer pair, then attach the access pair.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [None]:
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that echoes the text of each status it receives."""

    def on_status(self, status):
        """Print the text of the received status."""
        print(status.text)
# Listener instance that will receive the streamed statuses.
l = MyStreamListener()

# Authenticated stream that forwards incoming statuses to the listener.
stream = tweepy.Stream(auth, l)

# Restrict the stream to statuses matching any of these keywords.
keywords = ['clinton','trump','sanders','cruz']
stream.filter(track=keywords)

In [None]:
# Import package
import json

# Path to the file of captured tweets; each non-empty line is parsed as one
# JSON document.
tweets_data_path = 'tweets3.txt'

# Parse every line into a dict and collect the results in tweets_data.
# - The `with` block closes the file even if a line fails to parse.
# - UTF-8 is set explicitly so decoding does not depend on the platform locale.
# - Blank lines are skipped, since json.loads would raise on them.
with open(tweets_data_path, "r", encoding="utf-8") as tweets_file:
    tweets_data = [json.loads(line) for line in tweets_file if line.strip()]

# Print the keys of the first tweet dict
print(tweets_data[0].keys())

In [None]:
# Import package
import pandas as pd

# Keep only the 'text' and 'lang' fields of each tweet dict.
tweet_columns = ['text','lang']
df = pd.DataFrame(data=tweets_data, columns=tweet_columns)

# Preview the first rows.
print(df.head())

In [None]:
import re

def word_in_text(word, text):
    """Return True if *word* occurs in *text*, ignoring case.

    Both arguments are lower-cased before matching, so the result does not
    depend on the caller normalising *text* first. *word* is passed to
    ``re.search`` and is therefore interpreted as a regular expression,
    not as a literal string.
    """
    return re.search(word.lower(), text.lower()) is not None
# Per-candidate mention counters, all starting at zero.
clinton = trump = sanders = cruz = 0

# Walk the tweets once, bumping a counter for every candidate whose name
# appears in the lower-cased tweet text (True adds 1, False adds 0).
for _, tweet_row in df.iterrows():
    lowered_text = tweet_row['text'].lower()
    clinton += word_in_text('clinton', lowered_text)
    trump += word_in_text('trump', lowered_text)
    sanders += word_in_text('sanders', lowered_text)
    cruz += word_in_text('cruz', lowered_text)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Enable seaborn's default styling and its single-letter colour codes.
sns.set(color_codes=True)
cd = ['clinton', 'trump', 'sanders', 'cruz']

# One bar per candidate. x and y are passed by keyword because seaborn 0.12+
# no longer accepts them positionally; the keyword form works on older
# releases too.
ax = sns.barplot(x=cd, y=[clinton, trump, sanders, cruz])
ax.set(ylabel='count')
plt.show()