## Wikipedia Table Extraction Code

In [None]:
import os
import requests
import urllib
import math
import copy
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup


class nfl_wiki_table(object):
    """Download a web page and turn its wiki tables into pandas DataFrames.

    The page is fetched and parsed once, in the constructor; ``read()`` then
    converts every ``<table>`` selected by the attribute filter
    ``{"class": "navbox plainrowheaders wikitable"}`` into a DataFrame.
    """

    def __init__(self, url, timeout=30):
        """Fetch *url* and parse the response body as HTML.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server, passed through to
                ``requests.get``.
        """
        self.url = url
        # Without a timeout a stalled connection would block forever.
        self.r = requests.get(self.url, timeout=timeout)
        self.url_soup = BeautifulSoup(self.r.text, 'html.parser')

    def read(self):
        """Convert every selected table on the page into a DataFrame.

        Returns:
            The list of DataFrames (also stored on ``self.tables``), one per
            selected table, in the order ``find_all`` yields them. Each frame
            uses integer row and column labels. The text of a cell is stored
            at the top-left position the cell occupies; positions that are
            only covered by a rowspan/colspan stay NaN.
        """
        self.tables = []
        self.tables_html = self.url_soup.find_all("table", {"class": "navbox plainrowheaders wikitable"})

        for table_html in self.tables_html:
            self.tables.append(self._parse_table(table_html))

        return self.tables

    @staticmethod
    def _cell_span(cell, attribute):
        """Return the cell's ``rowspan``/``colspan`` as an int of at least 1."""
        try:
            return max(int(cell.get(attribute)), 1)
        except (TypeError, ValueError):
            # Attribute missing or not a number: the cell spans a single slot.
            return 1

    @classmethod
    def _parse_table(cls, table_html):
        """Lay out one ``<table>`` element on a grid and return it as a DataFrame."""
        placed = {}    # (row, column) -> cell text
        pending = {}   # column -> rows (current one included) still covered from above
        n_rows = 0
        n_cols = 0

        for row in table_html.find_all("tr"):
            cells = row.find_all(["td", "th"])

            # Spans opened by earlier rows lose one row of coverage here, even
            # when this <tr> is blank: a blank row still uses up a rowspan.
            next_pending = {
                column: remaining - 1
                for column, remaining in pending.items()
                if remaining > 1
            }

            column = 0
            for cell in cells:
                # Skip columns still occupied by a rowspan from a previous row.
                while pending.get(column, 0) > 0:
                    column += 1

                placed[(n_rows, column)] = cell.get_text()
                n_cols = max(n_cols, column + 1)

                colspan = cls._cell_span(cell, "colspan")
                rowspan = cls._cell_span(cell, "rowspan")
                if rowspan > 1:
                    # Reserve every column the cell spans, not just the first.
                    for covered in range(column, column + colspan):
                        next_pending[covered] = rowspan - 1

                column += colspan

            pending = next_pending

            # Blank rows do not produce a row in the output frame.
            if cells:
                n_rows += 1

        df = pd.DataFrame(index=range(0, n_rows), columns=range(0, n_cols))
        for (row_index, column_index), text in placed.items():
            df.iat[row_index, column_index] = text
        return df


## Steps
### a. Calling of the function
### b. Creating the Pandas dataframe
### c. Creating a csv for some manipulation

In [None]:
# Download the page, take the first extracted table and dump it to disk.
url = "https://en.wikipedia.org/wiki/National_Football_League"
nfl = nfl_wiki_table(url)
tables = nfl.read()
wikitable = tables[0]
# No pandas header or index is written: the file holds only the table cells.
wikitable.to_csv("NFL.csv", header=False, index=False)

In [None]:
# Read the table back; the first line of the file becomes the column labels.
# NOTE(review): the file is decoded as ISO-8859-1 here - confirm this matches
# the encoding it was written with.
df = pd.read_csv(
    "NFL.csv",
    encoding="ISO-8859-1",
)

## 1. Getting All locations 34 N and 84 W

In [None]:
# Make every Coordinates value a string; missing values end up as '0'.
df['Coordinates'] = (
    df['Coordinates']
    .fillna(0)
    .astype(str)
)

In [None]:
# Cut the text between the last ' / ' and the last '?' out of every
# Coordinates value (plain string slicing, no regex involved).
# NOTE(review): presumably that slice is the decimal "latitude; longitude"
# pair and '?' is what follows it in the loaded data - confirm against the CSV.
start_marker_1 = ' / '
end_marker_1 = '?'
cordinateList = []
for raw in df['Coordinates']:
    if raw != '0':
        begin = raw.rindex(start_marker_1) + len(start_marker_1)
        stop = raw.rindex(end_marker_1)
        cordinateList.append(raw[begin:stop])
    else:
        # '0' is the placeholder used for rows without coordinates.
        cordinateList.append(raw)

se = pd.Series(cordinateList)
df['CleanCordinates'] = se.to_frame()

# Split "lat;lon" into two columns, then swap the ';' for '.' in the
# combined column.
lat_lon = df['CleanCordinates'].str.split(';', expand=True)
df[['Latitude', 'Longitude']] = lat_lon
df['CleanCordinates'] = df['CleanCordinates'].str.replace(';', '.')

In [None]:
# Convert both coordinate columns to float so they can be compared
# numerically; missing values become 0.0.
for coord_column in ('Latitude', 'Longitude'):
    df[coord_column] = df[coord_column].fillna(0).astype(float)

In [None]:
# Output: rows with Latitude above 37 and Longitude above -84.
# NOTE(review): the section heading says 34 N while the threshold used here
# is 37 - confirm which value is intended.
lat_mask = df['Latitude'] > 37
lon_mask = df['Longitude'] > -84
df[lat_mask & lon_mask]

## 2. All teams based out of South Division

In [None]:
# Rows whose Division cell is exactly "South".
# NOTE(review): the table extraction stores a spanned cell's text only in its
# first row, so if Division cells use rowspan the remaining teams of a
# division hold NaN here and are not matched - verify against the data.
is_south = df['Division[55]'].eq("South")
df[is_south]

## 3. Stadiums with capacity between 50K and 80K

In [None]:
# Strip the thousands separators so capacities can be compared as integers;
# missing capacities become 0.
df['Capacity'] = df['Capacity'].str.replace(',', '').fillna(0).astype(int)
# ``between`` includes both bounds by default. The boolean form
# ``inclusive=True`` was deprecated in pandas 1.3 and is rejected by
# pandas 2.x, so the argument is left out.
df[df['Capacity'].between(50000, 80000)]

## 4. Image Links in the page

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the page and print the ``src`` attribute of every <img> tag
# (``None`` is printed for tags without one).
# A timeout keeps a stalled connection from blocking forever.
r = requests.get("https://en.wikipedia.org/wiki/National_Football_League", timeout=30)
c = r.content
soup = BeautifulSoup(c, "lxml")
# ``find_all`` is the current name of the deprecated ``findAll`` alias.
for img_tag in soup.find_all("img"):
    print(img_tag.get('src'))