# **Web Scraping:** <br>
Web scraping is a technique used to extract data from websites through an automated process. It is a useful technique when you want to work with data that is only available in a web page.

In [0]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

The class **WikiTableDataParser** is a web scraper that provides the following methods.


1.   get_webpage(url) - get the response from the given web page (url)
2.   parse_url(url, tableClass) - process the response of the given url and return every table that has the class tableClass as a list of (index, dataframe) pairs
3.   parse_html_table(table) - return the data present in a single HTML table as a dataframe
4.   save_as_csv_local - save the dataframe as a csv file on the local drive (current working directory)
5.   get_folder_list_drive - list the files and folders in the root of Google Drive along with their ids
6.   save_as_csv_drive - save the dataframe as a csv file in Google Drive
7.   save_as_csv_tkinter - save the dataframe as a csv file using a tkinter save dialog (UI)




In [0]:
#The WikiTableDataParser class fetches the data present in the tables of a given URL.
#We need to give the url along with the class name of the table in the web page.
#We can save the data from the web page to a csv file in the local directory or in google drive.
#We can also use tkinter to save the data in the local directory.

import requests
import pandas as pd
from bs4 import BeautifulSoup
from contextlib import closing
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from oauth2client.client import GoogleCredentials

class WikiTableDataParser:
    """Scrape HTML tables from a web page into pandas DataFrames and export them as CSV.

    Typical use: call ``parse_url(url, tableClass)`` to get a list of
    ``(index, DataFrame)`` pairs, pick the frame you need, then hand it to one
    of the ``save_as_csv_*`` methods.

    The ``save_as_csv_*`` and ``get_folder_list_drive`` methods do not raise:
    on failure they return the caught exception object instead.
    """

    def get_webpage(self, url):
        """Fetch ``url`` with an HTTP GET (5 second timeout).

        Returns the raw response body when ``is_good_response`` accepts the
        response, otherwise ``None``. Request failures are passed to
        ``log_error`` and also yield ``None``.
        """
        try:
            with closing(requests.get(url, timeout=5)) as resp:
                if self.is_good_response(resp):
                    return resp.content
                return None
        # RequestException lives in requests.exceptions; the bare name was never
        # imported, so a failed request used to surface as a NameError.
        except requests.exceptions.RequestException as e:
            self.log_error('Error during requests to {0} : {1}'.format(url, str(e)))
            return None

    def is_good_response(self, resp):
        """Return True if ``resp`` has status 200 and an HTML Content-Type, else False."""
        # .get() so that a response without a Content-Type header is rejected
        # instead of raising KeyError.
        content_type = resp.headers.get('Content-Type')
        return (resp.status_code == 200
                and content_type is not None
                and 'html' in content_type.lower())

    def log_error(self, e):
        """Report an error message; this implementation simply prints it."""
        print(e)

    def parse_url(self, url, tableClass):
        """Download ``url`` and parse every ``<table>`` whose class matches ``tableClass``.

        Returns a list of ``(index, DataFrame)`` tuples, one per matching table,
        in document order.

        Raises:
            Exception: if the page could not be fetched as HTML, or if no table
                with the given class is found.
        """
        response = self.get_webpage(url)
        if response is None:
            raise Exception("Invalid URL")
        soup = BeautifulSoup(response, 'html.parser')
        tables = soup.find_all('table', {"class": tableClass})
        if not tables:
            raise Exception("No tables found with clas : "+tableClass)
        return [(index, self.parse_html_table(table))
                for index, table in enumerate(tables)]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Every ``<tr>`` that contains at least one ``<td>`` becomes a row. The
        column count is taken from the first such row. Column titles come from
        the first ``<tr>`` that contains ``<th>`` cells; without any, columns
        are numbered from 0. Columns whose values all convert to float are
        converted; the others keep their text.

        Raises:
            Exception: if header cells were found but their count differs from
                the number of columns.
        """
        column_names = []
        data_rows = []

        # Single pass: collect the data rows and the first header row.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                data_rows.append(td_tags)

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        n_rows = len(data_rows)
        n_columns = len(data_rows[0]) if data_rows else 0

        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        df = pd.DataFrame(columns=columns, index=range(0, n_rows))

        # Positional assignment leaves cells of shorter rows empty (NaN).
        for row_marker, cells in enumerate(data_rows):
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()

        # Convert to float where every value in the column allows it.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

    def save_as_csv_local(self, df, filename):
        """Write ``df`` to ``<filename>.csv`` (no index column, with header).

        Returns a confirmation string, or the caught exception on failure.
        """
        try:
            df.to_csv(filename+".csv", index=False, header=True)
            return "Created in the working directory with filename : "+filename+".csv"
        except Exception as e:
            return e

    def _get_drive(self):
        """Authenticate the Colab user and return a GoogleDrive client."""
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        return GoogleDrive(gauth)

    def get_folder_list_drive(self):
        """Print and return the non-trashed entries in the Drive root.

        Each entry is printed with its title and id. Returns the list of
        entries, or the caught exception on failure.
        """
        try:
            drive = self._get_drive()
            file_list = drive.ListFile({'q': "'root' in parents and trashed=false"}).GetList()
            for file1 in file_list:
                print('title: %s, id: %s' % (file1['title'], file1['id']))
            return file_list
        except Exception as e:
            return e

    def save_as_csv_drive(self, df, filename, folderId):
        """Write ``df`` to ``<filename>.csv`` locally, then upload it into the Drive folder ``folderId``.

        Returns a confirmation string, or the caught exception on failure.
        """
        try:
            drive = self._get_drive()
            # Write the frame that was passed in; the previous code referenced an
            # unrelated name (fantasy_df) and therefore always failed.
            df.to_csv(filename+".csv", index=False, header=True)
            file = drive.CreateFile({'parents': [{u'id': folderId}]})
            file.SetContentFile(filename+".csv")
            file.Upload()
            return "Created in the drive with filename : "+filename+".csv"
        except Exception as e:
            return e

    def save_as_csv_tkinter(self, df, filename):
        """Open a small Tk window with an 'Export CSV' button that saves ``df`` via a save dialog.

        ``filename`` is offered as the initial file name in the dialog. The
        call blocks in the Tk main loop until the window is closed. Returns a
        message string, or the caught exception on failure (for example when
        no display is available).
        """
        try:
            import tkinter as tk
            from tkinter import filedialog

            # The window must be created here; no module-level `root` exists.
            root = tk.Tk()
            canvas1 = tk.Canvas(root, width=300, height=300, bg='lightsteelblue2', relief='raised')
            canvas1.pack()

            def exportCSV():
                # `df` is captured from the enclosing call (a `global df` here
                # would look up a module-level name instead of the argument).
                export_file_path = filedialog.asksaveasfilename(
                    defaultextension='.csv', initialfile=filename)
                # An empty result means the dialog was cancelled.
                if export_file_path:
                    df.to_csv(export_file_path, index=False, header=True)

            saveAsButton_CSV = tk.Button(root, text='Export CSV', command=exportCSV,
                                         bg='green', fg='white', font=('helvetica', 12, 'bold'))
            canvas1.create_window(150, 150, window=saveAsButton_CSV)
            root.mainloop()
            return "Created in with filename : "+filename+".csv"
        except Exception as e:
            return e
    

Create an object of the WikiTableDataParser class and call the parse_url function with the url and the class name of the table. <br>
The parse_url function returns a list of (index, dataframe) pairs. Loop over the list to preview every table and find the one we need.<br>




In [0]:
# Page to scrape and the CSS class of the tables we want from it.
url = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
#url = "https://en.wikipedia.org/wiki/List_of_bus_routes_in_London"
#url = "https://www.fantasypros.com/nfl/reports/leaders/qb.php?year=2015"
hp = WikiTableDataParser()
table_list = hp.parse_url(url, "wikitable sortable plainrowheaders")
#table_list = hp.parse_url(url, "table")

print("total no of tables in the page",len(table_list))

# parse_url already returns (index, DataFrame) pairs, so unpack them directly
# instead of enumerating a second time.
for index, table_df in table_list:
    print("index : ", index)
    # display() renders the preview explicitly, so it does not depend on the
    # ast_node_interactivity setting.
    display(table_df.head())

After finding the index of the required table, fetch that dataframe from table_list using **table_list[index][1]**

In [0]:
# Each entry of table_list is an (index, DataFrame) pair; [0] picks the first table, [1] its DataFrame.
population_df = table_list[0][1]# general form: table_list[index][1]
# Summarise the columns, non-null counts and dtypes of the selected table.
population_df.info()

save_as_csv_local() - pass the dataframe and the filename you want to save it under. The file will be saved in the working directory.

In [0]:
# Writes mycsvfile2.csv; the method appends the ".csv" extension itself.
hp.save_as_csv_local(population_df,'mycsvfile2')

The get_folder_list_drive() function lists the files and folders in the root of our Google Drive along with their ids.

In [0]:
# Prints "title, id" for each entry in the Drive root; use one of the ids as the target folder when uploading.
hp.get_folder_list_drive()

save_as_csv_drive() - pass the dataframe, the filename you want to save it under, and the id of the Google Drive folder in which to save the file.

In [0]:
# NOTE(review): FolderID is not defined in the visible cells; assign it a folder id
# (for example one printed by get_folder_list_drive()) before running this cell.
hp.save_as_csv_drive(population_df,'mycsvfile2',FolderID)#folder id in which you want to save the file

save_as_csv_tkinter() - pass the dataframe and the filename you want to save it under; the file is saved through a tkinter dialog (UI).

In [0]:
# Export through the tkinter UI. NOTE(review): tkinter needs a graphical display, so this
# presumably only works on a local kernel, not on a hosted runtime; confirm before relying on it.
hp.save_as_csv_tkinter(population_df,'mycsvfile2')