### Perform Necessary Installations

In [None]:
!pip install bs4
!pip install requests
!pip install pandas
!pip numpy

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re
import numpy as np

### Define Helper Functions
We will use these throughout the notebook, as they make our code reusable, extensible, and readable.

In [3]:
def flatten_comprehension(matrix):
    """Flatten one level of nesting.

    Returns a new list holding every item of every row of ``matrix``,
    in row order and then item order.
    """
    flattened = []
    for row in matrix:
        for item in row:
            flattened.append(item)
    return flattened


def extract_rows_from_tables(tables):
    """Collect the ``<tr>`` elements of every table.

    Returns a list with one entry per table, each entry being whatever
    ``table.find_all('tr')`` returned. The result is therefore nested
    (rows grouped by table), not flat.
    """
    return [table.find_all('tr') for table in tables]

def extract_raw_data(column_data,recurse_thru_a = False, find_tags = 'td', extract_provider_links = False, pattern =''):
    """Convert table rows into lists of stripped cell texts.

    Parameters:
        column_data: iterable of row elements exposing ``find_all``.
        recurse_thru_a: when ``True``, each output row is prefixed with the
            link of its first cell (the ``href`` of the cell's first ``<a>``,
            or the cell's stripped text when it has no ``<a>``); ``None`` is
            used for rows without cells.
        find_tags: tag name(s) passed to ``row.find_all`` to select the cells.
        extract_provider_links: when ``True``, every cell link that matches
            ``pattern`` is appended after the cell texts.
        pattern: regular expression used to select links.

    Returns:
        A list with one list per input row.
    """
    def link_or_text(cell):
        # href of the cell's first anchor, falling back to the stripped cell text.
        anchor = cell.find('a')
        return anchor['href'] if anchor else cell.text.strip()

    extracted = []
    for row in column_data:
        cells = row.find_all(find_tags)
        record = []

        if recurse_thru_a == True:
            links = [link_or_text(cell) for cell in cells]
            record.append(links[0] if links else None)

        record.extend([cell.text.strip() for cell in cells])

        if extract_provider_links == True:
            for candidate in [link_or_text(cell) for cell in cells]:
                if re.findall(pattern, candidate):
                    record.append(candidate)

        extracted.append(record)
    return extracted
def extract_providers_link(column_data, find_tags = 'td',  pattern = ''):
    """Return the rows that contain a link matching ``pattern``.

    For every row, each cell is reduced to a "link": the ``href`` of its
    first ``<a>``, or the stripped cell text when the cell has no ``<a>``.
    For every link that matches ``pattern`` one output row is produced,
    consisting of the stripped texts of all cells followed by that link.

    Parameters:
        column_data: iterable of row elements exposing ``find_all``.
        find_tags: tag name(s) passed to ``row.find_all`` to select the cells.
        pattern: regular expression searched for in each link.

    Returns:
        A list of independent lists ``[cell texts..., matching link]``.
    """
    matcher = re.compile(pattern)
    raw_data = []
    for row in column_data:
        cells = row.find_all(find_tags)
        cell_texts = [cell.text.strip() for cell in cells]

        for cell in cells:
            anchor = cell.find('a')
            candidate = anchor['href'] if anchor else cell.text.strip()
            if matcher.search(candidate):
                # Build a fresh list per match. The original extended and
                # re-appended one shared list, so a row with two matching
                # links produced two references to the same over-long row.
                raw_data.append(cell_texts + [candidate])
    return raw_data
def append_data_to_df(df, raw_data):
    """Append each record of ``raw_data`` to ``df`` as a new row.

    Every record is written at the label equal to the frame's current
    number of rows. ``df`` is modified in place and also returned.
    """
    for record in raw_data:
        df.loc[df.shape[0]] = record
    return df

def preprocess_extracted_sats(cont_sats_extracted, row_names, attr_index = 0):
    """Fill a missing attribute in short rows from the row directly above.

    A row counts as short when it has at most ``len(row_names) - 1`` items.
    For each short row, the value at position ``attr_index`` of the
    preceding row is inserted at ``attr_index``. Rows are processed top to
    bottom, so a value can propagate through a run of short rows.

    Parameters:
        cont_sats_extracted: list of row lists; modified in place.
        row_names: column names; only the length is used.
        attr_index: position of the attribute to inherit.

    Returns:
        The same ``cont_sats_extracted`` list.
    """
    expected_length = len(row_names)
    for index, sat in enumerate(cont_sats_extracted):
        if len(sat) > expected_length - 1:
            continue
        # The first row has no predecessor. The original indexed
        # ``[index - 1]`` here, which wrapped around to the last row.
        if index == 0:
            continue
        previous = cont_sats_extracted[index - 1]
        # Skip when the previous row cannot supply the attribute,
        # rather than raising IndexError.
        if len(previous) <= attr_index:
            continue
        sat.insert(attr_index, previous[attr_index])
    return cont_sats_extracted

def append_region_to_extracted_sats(cont_sats_extracted, continent):
    """Tag every row with its continent.

    Each entry of ``cont_sats_extracted`` is replaced by a new list made of
    the old row followed by ``continent``. The outer list is updated in
    place and returned.
    """
    for position in range(len(cont_sats_extracted)):
        cont_sats_extracted[position] = cont_sats_extracted[position] + [continent]
    return cont_sats_extracted

In [694]:
def extract_raw_data_alt(column_data,recurse_thru_a = False, find_tags = 'td', extract_provider_links = False, pattern =''):
    """Like ``extract_raw_data``, plus four extra values appended to every row.

    Each output row contains, in order:
      1. (only if ``recurse_thru_a``) the link of the first cell, or ``None``;
      2. the stripped text of every cell;
      3. ``freq``, ``beam``, ``eirp`` parsed from the ``<br>`` separators of
         the first ``<font>`` in the first cell of the most recent 10-cell
         row; rows with a different cell count reuse the values of the last
         10-cell row seen, or ``None`` before any was seen;
      4. the text of the first ``<font>`` of cell 8 (10-cell rows) or cell 6
         (all other rows) with ``<br/>`` replaced by commas, or ``None`` when
         that cell or font does not exist;
      5. (only if ``extract_provider_links``) every cell link matching
         ``pattern``.

    Parameters and return shape otherwise match ``extract_raw_data``.
    """
    def link_or_text(cell):
        # href of the cell's first anchor, falling back to the stripped cell text.
        anchor = cell.find('a')
        return anchor['href'] if anchor else cell.text.strip()

    def first_font_text(cells, position):
        # Text of the first <font> in cells[position] with <br/> turned into
        # commas; None when the cell or the font is missing.
        if position >= len(cells):
            return None
        fonts = cells[position].find_all('font')
        if not fonts:
            return None
        return bs(str(fonts[0]).replace('<br/>',',')).text

    # Initialised once, outside the loop: rows that are not 10 cells wide
    # inherit the values of the preceding 10-cell row. The original left
    # these unbound until the first 10-cell row, raising UnboundLocalError
    # when the input started with any other row.
    freq = beam = eirp = None

    raw_data = []
    for row in column_data:
        columns = row.find_all(find_tags)
        individual_row_data = []

        if recurse_thru_a == True:
            links = [link_or_text(cell) for cell in columns]
            individual_row_data.append(links[0] if links else None)

        individual_row_data += [cell.text.strip() for cell in columns]

        has_ten_cells = len(columns) == 10
        if has_ten_cells:
            freq = beam = eirp = None
            fonts = columns[0].find_all('font')
            # Rows without a <font> in the first cell keep None values
            # instead of raising IndexError.
            if fonts:
                for i, line_break in enumerate(fonts[0].find_all('br')):
                    if i == 0:
                        freq = line_break.previous_sibling.get_text(strip=True)
                    elif i == 1:
                        beam = line_break.previous_sibling.get_text(strip=True)
                    else:
                        eirp = line_break.next_sibling.get_text(strip=True) if line_break.next_sibling else None
        individual_row_data += [freq, beam, eirp]

        individual_row_data.append(first_font_text(columns, 8 if has_ten_cells else 6))

        if extract_provider_links == True:
            for candidate in [link_or_text(cell) for cell in columns]:
                if re.findall(pattern, candidate):
                    individual_row_data.append(candidate)

        raw_data.append(individual_row_data)
    return raw_data

## Extract Satellites for All Regions

### List Continents and concatenate with their url

In [None]:
# Regional index pages to scrape, and the site root they hang off.
continents = ['asia', 'europe', 'atlantic', 'america']
baseurl = 'https://www.lyngsat.com/'

# One index-page URL per continent, in the same order as `continents`.
endpoints = []
for c in continents:
    endpoints += [baseurl + c + '.html']
endpoints

### Scrape each Endpoint

In [None]:
# Download and parse each continent's index page.
# `satellites` is a list of single-key dicts {continent: parsed page},
# in the same order as `continents`.
satellites = []
for c, endpoint in zip(continents, endpoints):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(endpoint, timeout=30)
    # Fail here on an HTTP error instead of parsing an error page and
    # hitting an unrelated IndexError in a later cell.
    page.raise_for_status()
    soup = bs(page.text, 'html')

    satellites.append({f'{c}':soup})

In [None]:
# Build one dataframe of satellites across all continents.
# 'Region' is not scraped; it is appended per continent below.
row_names = ['Position','Name','Frequency','Launch_Date', 'Region']
cont_sats_df = pd.DataFrame(columns = row_names)

for index, cont in enumerate(continents):
    # `satellites[index]` is the single-key dict {continent: parsed page} built by the scraping cell.
    cont_sats_raw = satellites[index][cont]
    # The table at position 11 on the page is used.
    # NOTE(review): presumably the satellite listing — the index is tied to the live page layout; confirm.
    table = cont_sats_raw.find_all('table')[11]
    
    column_data = table.find_all('tr')
    
    cont_sats_extracted = extract_raw_data(column_data)
    
    # Short rows inherit the first value (position 0) of the row above them.
    cont_sats_extracted = preprocess_extracted_sats(cont_sats_extracted, row_names)
    cont_sats_extracted = append_region_to_extracted_sats(cont_sats_extracted, cont)    

    cont_sats_df = append_data_to_df(cont_sats_df, cont_sats_extracted)
    
# Switch the first (position) with the second column (name)
cont_sats_df = cont_sats_df.iloc[:, [1, 0] + list(range(2, len(cont_sats_df.columns)))]


In [None]:
cont_sats_df

### Get Launching Details

In [None]:
# NOTE: relies on `baseurl` being the site root ('https://www.lyngsat.com/').
# A later cell rebinds `baseurl` to the tracker URL, so run the notebook in order.
launch_endpoint = baseurl + "launches/index.html"

# Bounded wait and explicit HTTP error, instead of hanging or parsing an error page.
page = requests.get(launch_endpoint, timeout=30)
page.raise_for_status()
soup = bs(page.text, 'html')



In [None]:
# Rows of the table at position 15 on the launches page.
# NOTE(review): presumably the launch listing — the index depends on the live page layout; confirm.
table = soup.find_all('table')[15]
column_data = table.find_all('tr')

sats_extracted = extract_raw_data(column_data)

# Keep only the cells at positions 2 and 3 of every row; they are used below as satellite name and rocket.
sats_extracted = [[entry[2], entry[3]] for entry in sats_extracted]
row_names = ['Sat_Name', 'Rocket']
rockets_df = pd.DataFrame(columns = row_names)

rockets_df = append_data_to_df(rockets_df, sats_extracted)

# Left join: every satellite row is kept; rocket columns are empty where no launch row has the same name.
merged_df = pd.merge(cont_sats_df, rockets_df, left_on = 'Name', right_on = 'Sat_Name', how='left')

merged_df

In [None]:
# Split 'Position' at the degree sign into the part before it (longitude)
# and the part after it (direction), then drop the composite column.
# The guard makes the cell safe to re-run: once 'Position' is gone there is nothing left to do.
if 'Position' in cont_sats_df.columns:
    # n=1 splits at the first degree sign only, so the result never has more than two columns.
    cont_sats_df[['Position_Longitude', 'Position_Direction']] = cont_sats_df['Position'].str.split('\u00b0', n=1, expand=True)
    cont_sats_df = cont_sats_df.drop(columns='Position')

cont_sats_df


In [None]:
# Persist the satellites table (without the index column).
# NOTE(review): absolute, machine-specific path — this cell only runs where this directory exists.
cont_sats_df.to_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/Satellites.csv',index=False)

In [None]:
# Same continents as before, but now pointing at the tracker pages.
# Note that this rebinds the notebook-wide `baseurl`.
continents = ['asia', 'europe', 'atlantic', 'america']
baseurl = 'https://www.lyngsat.com/tracker/'

# One tracker-page URL per continent, in the same order as `continents`.
endpoints = []
for c in continents:
    endpoints += [baseurl + c + '.html']
endpoints

In [None]:
# Download and parse each continent's tracker page.
# `sat_rockets` is a list of single-key dicts {continent: parsed page},
# in the same order as `continents`.
sat_rockets = []
for c, endpoint in zip(continents, endpoints):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(endpoint, timeout=30)
    # Fail here on an HTTP error instead of parsing an error page.
    page.raise_for_status()
    soup = bs(page.text, 'html')

    sat_rockets.append({f'{c}':soup})

In [None]:
sat_rockets

In [None]:
# Rebuild `cont_sats_df` from the tracker pages, this time with a leading 'URL' column.
# NOTE: this replaces the `cont_sats_df` built earlier in the notebook.
row_names = ['URL','Position','Name','Frequency', 'Launch Date', 'Region']
cont_sats_df = pd.DataFrame(columns = row_names)

for index, cont in enumerate(continents):

    # `sat_rockets[index]` is the single-key dict {continent: parsed tracker page}.
    cont_sats_rockets = sat_rockets[index][cont]

    # NOTE(review): table position 11 is tied to the live page layout; confirm.
    table = cont_sats_rockets.find_all('table')[11]
    
    column_data = table.find_all('tr')
    
    # recurse_thru_a=True prefixes every row with the link of its first cell (stored in 'URL').
    cont_sats_extracted = extract_raw_data(column_data, True)

#     cont_sats_extracted = preprocess_extracted_sats(cont_sats_extracted, row_names)
    cont_sats_extracted = append_region_to_extracted_sats(cont_sats_extracted, cont)    
    
    cont_sats_df = append_data_to_df(cont_sats_df, cont_sats_extracted)
    
    
# The position/name column swap from the earlier cell is intentionally not applied here:
# cont_sats_df = cont_sats_df.iloc[:, [1, 0] + list(range(2, len(cont_sats_df.columns)))]
cont_sats_df


In [None]:
cont_sats_df


In [None]:
cont_sats_df['URL']

In [None]:
# For every satellite, open its page and look for the text "launched with <rocket> <year>".
# `rocket_list` collects single-key dicts {satellite name: rocket}, which a later cell flattens.
rocket_list = []

# Raw string: the original plain string contained the invalid escape sequence '\d'.
launch_pattern = re.compile(r"launched with (.+) \d\d\d\d")
total_satellites = len(cont_sats_df.index)

# Iterate the two columns together instead of looking each value up by label.
for index, (url, sat_name) in enumerate(zip(cont_sats_df['URL'], cont_sats_df['Name'])):
    # A timeout keeps a stalled connection from hanging the whole loop.
    page = requests.get(url, timeout=30)
    soup = bs(page.text, 'html')

    # All <font> elements of the page, serialised and joined into one searchable string.
    entry = ' '.join(map(str, soup.find_all('font')))

    x = launch_pattern.search(entry)

    print(f'Satellite name: {sat_name}')
    if x:
        rocket_list.append({f'{sat_name}': x.group(1)})
    else:
        print(f'Couldnt find a launching rocket for {sat_name} with entry {entry}')

    print(f'Processed {index + 1} Satellites out of {total_satellites}')
    

In [None]:
rocket_list

In [None]:
# Turn the single-key {satellite: rocket} dicts into (satellite, rocket) pairs ...
list_of_lists = [tuple(d.items())[0] for d in rocket_list]
row_names = ['Sat_name', 'L_Rocket']

# ... and load them into a two-column dataframe.
rockets_df = pd.DataFrame(data=list_of_lists, columns=row_names)
list_of_lists

# new_rockets_df

In [None]:
# Reload the previously saved results from disk.
# NOTE(review): absolute, machine-specific paths; Satellites_Rockets.csv is not written by any
# active code in the visible part of this notebook, so it must already exist.
sat_df = pd.read_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/Satellites.csv')
# rockets_df = pd.read_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/Satellites_Rockets.csv')

# The in-notebook merge is disabled; the merged table is loaded from disk instead.
# merged_df = pd.merge(sat_df, rockets_df, left_on = 'Name', right_on = 'Sat_name', how = 'left')
# merged_df = merged_df.drop(columns=['Sat_name'])
merged_df = pd.read_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/Satellites_Rockets.csv')

## Scrape Channels and Providers
For each satellite, we:
1. Scrape the Channels/ Providers records, treating them as one.
    a. Identify the Providers and merge with the original dataframe
    b. Save that
2. Assign Providers to Channels
3. Clean the Dataframes
    a. Split the System/SR/FEC columns
    b. Extract the languages
    c. Extract the Frequency

We begin by defining *helper functions* & *attributes*

In [692]:
def get_table(url):
    """Download ``url`` and return its data tables.

    Returns every ``<table>`` whose attributes are ``border=""``,
    ``cellpadding="0"``, ``cellspacing="0"`` and ``width="720"``. The result
    is empty when the page has no such table. The HTTP status is deliberately
    not checked, so an error page simply yields no tables.
    """
    # A timeout keeps a stalled connection from hanging a long scraping loop.
    page = requests.get(url, timeout=30)
    soup = bs(page.text, 'html')

    table = soup.find_all('table',{'border':"", 'cellpadding':"0", 'cellspacing':"0", 'width':"720"} )

    return table


In [530]:
# Point `baseurl` back at the site root (an earlier cell set it to the tracker URL).
baseurl = 'https://www.lyngsat.com/'
# Contains the names of our Satellites
# NOTE(review): absolute, machine-specific path.
merged_df = pd.read_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/Satellites_Rockets.csv')

In [531]:

# Prepare satellite names for incorporation into URLs

# Character substitutions that turn a satellite name into its URL form,
# applied in this order to every name.
_URL_NAME_REPLACEMENTS = ((" ", "-"), ("'", ""), ("ü", "u"), ("/", "-"), ("Ä", "A"))

# One pass over the names instead of five separate list rebuilds.
raw_sat_names = []
for link in merged_df['Name']:
    for old, new in _URL_NAME_REPLACEMENTS:
        link = link.replace(old, new)
    raw_sat_names.append(link)

# Keep only the part of each name before the first "-(".
# Raw string: the original '-\(' is an invalid escape sequence in a plain string literal.
sat_url_names = [re.split(r'-\(', link)[0] for link in raw_sat_names]
# baseurl

  sat_url_names = [re.split('-\(',link)[0] for link in raw_sat_names]


In [None]:
sat_url_names

### Scrape Channels & Providers

In [727]:
## ALT SINGLE CHANNEL
## SCRAPE CHANNELS + PROVIDERS for one hard-coded satellite
# Single-satellite version of the scraping loop, with `index` and `sat_name` fixed by hand.

# for index, sat_name in enumerate(sat_url_names):

index = 1
sat_name = 'Intelsat-18'
url = f'{baseurl}{sat_name}.html'
print(f'{index + 1} Processing satellite {sat_name} with url {url}')

# Scrape endpoint, retrieving all data tables 
sat_table = get_table(url)

# Retrieve all tr's from the returned tables
# Returns a list of lists (/table)
column_data = extract_rows_from_tables(sat_table)

# For every table with more than two rows, drop its first two rows and its last row.
for i, table in enumerate(column_data):
    if(len(table) > 2):
        _temp = column_data[i]
        _temp = _temp[2:len(_temp)-1]
        column_data[i] = _temp

# Flattens the list. Now, we have a list of tr tags
column_data = flatten_comprehension(column_data)

# Clean and extract the data values from the tags
channels_extracted = extract_raw_data_alt(column_data)
# Rows that link to a TV or radio channel page; the link is the last item of each row.
chans_links = extract_providers_link(column_data,pattern='.*tvchannels|.*radiochannels')
chan_links_df = pd.DataFrame(data=chans_links)
# Keep columns 1 and 8, which are renamed to channel name and channel URL below.
chan_links_df = chan_links_df.iloc[:, [1,8]]
chan_links_df.rename(columns ={1: "Channel", 8: "Channel_URL"}, inplace=True)
# Rows that link to a provider page.
networks_extracted = extract_providers_link(column_data, pattern='.*providers')



# Define the columns for our Main Dataframe
row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'freq_alt', 'beam_alt', 'eirp_alt','encryption_alt']

# Define the columns for our Networks Dataframe
netw_row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'Provider_URL']

# Construct the Network Dataframe
netw_df = pd.DataFrame(networks_extracted, columns = netw_row_names)

# Add Frequency/Beam to our channels (as inherited from the preceding element)
channels_extracted = preprocess_extracted_sats(channels_extracted, row_names)

# Add System/SR/FEC to our channels (as inherited from the preceding element)
channels_extracted = preprocess_extracted_sats(channels_extracted, row_names, 1)

# Construct the Main Dataframe
sat_df = pd.DataFrame(channels_extracted, columns = row_names)    

# Refine the Networks Dataframe to remove clutter/ redundant attributes
netw_df = netw_df[['Provider/Channel', 'Provider_URL']]

# Merge the Main and Network Dataframes
merged_df = pd.merge(left=sat_df, right=netw_df, how='outer',left_on='Provider/Channel', right_on='Provider/Channel')

# Adjust the filename to be safe (for saving the file on Windows)
safe_sat_name = sat_name.replace('/', '-') # Replace '/' with '-'
# Attach the channel URLs by matching the channel name.
merged_df = pd.merge(left=merged_df, right=chan_links_df, left_on='Provider/Channel', right_on='Channel',how='outer')
# Save the merged Dataframe
merged_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v4/{index + 1}_{safe_sat_name}_channels.csv',index=False)


# temp_df = 
# print(temp_df)


2 Processing satellite Intelsat-18 with url https://www.lyngsat.com/Intelsat-18.html


In [731]:
## ALT
## SCRAPE CHANNELS + PROVIDERS for every satellite
# Writes one CSV per satellite into providers/v5/.

# The column layouts are the same for every satellite, so they are defined once.
# Columns of the main dataframe (cell texts plus the four values added by extract_raw_data_alt).
row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'freq_alt', 'beam_alt', 'eirp_alt','encryption_alt']

# Columns of the networks dataframe (cell texts plus the provider link).
netw_row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'Provider_URL']

for index, sat_name in enumerate(sat_url_names):

    url = f'{baseurl}{sat_name}.html'
    print(f'{index + 1} Processing satellite {sat_name} with url {url}')

    # Scrape endpoint, retrieving all data tables
    sat_table = get_table(url)

    # Retrieve all tr's from the returned tables (one list of rows per table)
    column_data = extract_rows_from_tables(sat_table)

    # For every table with more than two rows, drop its first two rows and its last row.
    for i, table in enumerate(column_data):
        if len(table) > 2:
            column_data[i] = table[2:len(table)-1]

    # Flatten: from here on `column_data` is a single list of tr tags
    column_data = flatten_comprehension(column_data)

    # Clean and extract the data values from the tags
    channels_extracted = extract_raw_data_alt(column_data)

    # Rows that link to a provider page
    networks_extracted = extract_providers_link(column_data, pattern='.*providers')

    # Construct the Network Dataframe
    netw_df = pd.DataFrame(networks_extracted, columns = netw_row_names)

    # Add Frequency/Beam to our channels (as inherited from the preceding element)
    channels_extracted = preprocess_extracted_sats(channels_extracted, row_names)

    # Add System/SR/FEC to our channels (as inherited from the preceding element)
    channels_extracted = preprocess_extracted_sats(channels_extracted, row_names, 1)

    # Construct the Main Dataframe
    sat_df = pd.DataFrame(channels_extracted, columns = row_names)

    # Refine the Networks Dataframe to remove clutter/ redundant attributes
    netw_df = netw_df[['Provider/Channel', 'Provider_URL']]

    # Merge the Main and Network Dataframes
    merged_df = pd.merge(left=sat_df, right=netw_df, how='outer',left_on='Provider/Channel', right_on='Provider/Channel')

    # Adjust the filename to be safe (for saving the file on Windows): '/' becomes '-'
    safe_sat_name = sat_name.replace('/', '-')

    # Attaching channel URLs is best-effort: satellites without channel links
    # (too few columns for `.iloc`) are still saved, just without the URL columns.
    # `except Exception` replaces the original bare `except:`, so Ctrl-C still
    # interrupts the loop and the actual reason is reported.
    try:
        chans_links = extract_providers_link(column_data,pattern='.*tvchannels|.*radiochannels')
        chan_links_df = pd.DataFrame(data=chans_links)
        # Keep columns 1 and 8 as channel name and channel URL.
        chan_links_df = chan_links_df.iloc[:, [1,8]]
        chan_links_df = chan_links_df.rename(columns ={1: "Channel", 8: "Channel_URL"})
        merged_df = pd.merge(left=merged_df, right=chan_links_df, left_on='Provider/Channel', right_on='Channel',how='outer')
    except Exception as exc:
        print(f'Problems with satellite: {safe_sat_name} ({exc!r})')

    # Save the merged Dataframe
    merged_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/{index + 1}_{safe_sat_name}_channels.csv',index=False)



1 Processing satellite NSS-9 with url https://www.lyngsat.com/NSS-9.html
2 Processing satellite Intelsat-18 with url https://www.lyngsat.com/Intelsat-18.html
3 Processing satellite Eutelsat-172B with url https://www.lyngsat.com/Eutelsat-172B.html
4 Processing satellite Horizons-3e with url https://www.lyngsat.com/Horizons-3e.html
5 Processing satellite Intelsat-19 with url https://www.lyngsat.com/Intelsat-19.html
6 Processing satellite ChinaSat-19 with url https://www.lyngsat.com/ChinaSat-19.html
Problems with satellite: ChinaSat-19
7 Processing satellite Superbird-B3 with url https://www.lyngsat.com/Superbird-B3.html
8 Processing satellite Optus-D2 with url https://www.lyngsat.com/Optus-D2.html
9 Processing satellite ABS-6 with url https://www.lyngsat.com/ABS-6.html
10 Processing satellite Optus-D3 with url https://www.lyngsat.com/Optus-D3.html
Problems with satellite: Optus-D3
11 Processing satellite Optus-10 with url https://www.lyngsat.com/Optus-10.html
12 Processing satellite JCSA

In [None]:
## SCRAPE CHANNELS + PROVIDERS
# Variant of the scraping loop that uses `extract_raw_data` (no extra freq/beam/eirp columns)
# and writes its results into providers/v2/.

for index, sat_name in enumerate(sat_url_names):

    url = f'{baseurl}{sat_name}.html'
    print(f'{index + 1} Processing satellite {sat_name} with url {url}')

    # Scrape endpoint, retrieving all data tables 
    sat_table = get_table(url)

    # Retrieve all tr's from the returned tables
    # Returns a list of lists (/table)
    column_data = extract_rows_from_tables(sat_table)

    # For every table with more than two rows, drop its first two rows and its last row.
    for i, table in enumerate(column_data):
        if(len(table) > 2):
            _temp = column_data[i]
            _temp = _temp[2:len(_temp)-1]
            column_data[i] = _temp

    # Flattens the list. Now, we have a list of tr tags
    column_data = flatten_comprehension(column_data)
   
    # Clean and extract the data values from the tags
    channels_extracted = extract_raw_data(column_data)
    # Rows that link to a provider page; the link is the last item of each row.
    networks_extracted = extract_providers_link(column_data, pattern='.*providers')
    


    # Define the columns for our Main Dataframe
    row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated']
    
    # Define the columns for our Networks Dataframe
    netw_row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'Provider_URL']
    
    # Construct the Network Dataframe
    netw_df = pd.DataFrame(networks_extracted, columns = netw_row_names)
    
    # Add Frequency/Beam to our channels (as inherited from the preceding element)
    channels_extracted = preprocess_extracted_sats(channels_extracted, row_names)

    # Add System/SR/FEC to our channels (as inherited from the preceding element)
    channels_extracted = preprocess_extracted_sats(channels_extracted, row_names, 1)

    # Construct the Main Dataframe
    sat_df = pd.DataFrame(channels_extracted, columns = row_names)    
    
    # Refine the Networks Dataframe to remove clutter/ redundant attributes
    netw_df = netw_df[['Provider/Channel', 'Provider_URL']]

    # Merge the Main and Network Dataframes
    merged_df = pd.merge(left=sat_df, right=netw_df, how='outer',left_on='Provider/Channel', right_on='Provider/Channel')

    # Adjust the filename to be safe (for saving the file on Windows)
    safe_sat_name = sat_name.replace('/', '-') # Replace '/' with '-'

    # Save the merged Dataframe
    merged_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v2/{index + 1}_{safe_sat_name}_channels.csv',index=False)



In [None]:
## SCRAPE A CHANNEL
# Single-satellite version of the scraping loop; useful for debugging.
# Writes its result into providers/v3/.

index = 1
sat_name = 'Intelsat-18'
# for index, sat_name in enumerate(sat_url_names):

url = f'{baseurl}{sat_name}.html'
print(f'{index + 1} Processing satellite {sat_name} with url {url}')

# Scrape endpoint, retrieving all data tables
sat_table = get_table(url)

# Retrieve all tr's from the returned tables (one list of rows per table)
column_data = extract_rows_from_tables(sat_table)

# For every table with more than two rows, drop its first two rows and its last row.
for i, table in enumerate(column_data):
    if len(table) > 2:
        column_data[i] = table[2:len(table)-1]

# Flatten: from here on `column_data` is a single list of tr tags
column_data = flatten_comprehension(column_data)

# Clean and extract the data values from the tags
channels_extracted = extract_raw_data(column_data)
networks_extracted = extract_providers_link(column_data, pattern='.*providers')

# NOTE(review): `extract_freq_beam_eirp` is not defined in the visible part of this notebook;
# on a fresh kernel this line raises NameError unless it is defined elsewhere.
freq_beam = extract_freq_beam_eirp(column_data, ['France TV', 'Anuvu', 'American Forces Network'])

# Define the columns for our Main Dataframe
row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated']

# Define the columns for our Networks Dataframe
netw_row_names = ['Freq/beam','SR/FEC', 'SID', 'Provider/Channel','undef','Compression','VPID','Audio', 'Encryption', 'Src_Updated', 'Provider_URL']

freq_beam_rows = ['Provider', 'Frequency', 'Beam', 'EIRP']

freq_beam_df = pd.DataFrame(freq_beam, columns=freq_beam_rows)
# Construct the Network Dataframe
netw_df = pd.DataFrame(networks_extracted, columns = netw_row_names)
# Refine the Networks Dataframe to remove clutter/ redundant attributes
netw_df = netw_df[['Provider/Channel', 'Provider_URL']]

# Add Frequency/Beam to our channels (as inherited from the preceding element)
channels_extracted = preprocess_extracted_sats(channels_extracted, row_names)

# Add System/SR/FEC to our channels (as inherited from the preceding element)
channels_extracted = preprocess_extracted_sats(channels_extracted, row_names, 1)

# Providers that have both a provider link and a frequency/beam/EIRP entry.
merged_netw_df = pd.merge(left=netw_df, right=freq_beam_df, how='inner', left_on='Provider/Channel', right_on='Provider')
print(merged_netw_df)
# Construct the Main Dataframe
sat_df = pd.DataFrame(channels_extracted, columns = row_names)

merged_sat_df = pd.merge(sat_df, merged_netw_df, how='outer', on='Provider/Channel')

# Rows without their own values take the value of the nearest row above.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
merged_sat_df['Frequency'] = merged_sat_df['Frequency'].ffill()
merged_sat_df['Beam'] = merged_sat_df['Beam'].ffill()
merged_sat_df['EIRP'] = merged_sat_df['EIRP'].ffill()

# Adjust the filename to be safe (for saving the file on Windows): '/' becomes '-'
safe_sat_name = sat_name.replace('/', '-')

# Save the merged Dataframe
merged_sat_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v3/{index + 1}_{safe_sat_name}_channels.csv',index=False)



In [None]:


# merged_netw_df
merged_sat_df
# merged_sat_df.to_csv('E:/AUC/23-24/Spring/Database/Project/CSV Files/aliali.csv')

### Assign Providers to Channels

In [732]:

# For every saved satellite CSV, mark provider rows and give every channel row
# the name of the closest provider row above it. Results go to v5/assigned/.
for index, sat_name in enumerate(sat_url_names):
    # '/' becomes '-' so the name is usable as a file name
    safe_sat_name = sat_name.replace('/', '-')
    my_df = pd.read_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/{index + 1}_{safe_sat_name}_channels.csv')

    print(f'{index + 1} Processing sat: {safe_sat_name}')

    # A row is a provider row when its Provider_URL holds a link.
    # astype(str) turns missing values into 'nan', which does not contain 'http'.
    my_df['is_Provider'] = my_df['Provider_URL'].astype(str).str.contains('http', regex=False, na=False)

    # Provider rows carry their own name; every other row inherits the name of the
    # nearest provider row above it. Rows before the first provider stay empty.
    # This replaces the original row-by-row `.loc` loop with a vectorised forward fill.
    my_df['Provider'] = my_df['Provider/Channel'].where(my_df['is_Provider']).ffill()

    my_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/{index + 1}_{safe_sat_name}_channels.csv',index=False)

# my_df

1 Processing sat: NSS-9
2 Processing sat: Intelsat-18
3 Processing sat: Eutelsat-172B
4 Processing sat: Horizons-3e
5 Processing sat: Intelsat-19
6 Processing sat: ChinaSat-19
7 Processing sat: Superbird-B3
8 Processing sat: Optus-D2
9 Processing sat: ABS-6
10 Processing sat: Optus-D3
11 Processing sat: Optus-10
12 Processing sat: JCSAT-2B
13 Processing sat: Optus-D1
14 Processing sat: BRIsat
15 Processing sat: JCSAT-1C
16 Processing sat: Nusantara-Satu
17 Processing sat: Express-AMU7
18 Processing sat: JCSAT-16
19 Processing sat: Superbird-C2
20 Processing sat: Apstar-9
21 Processing sat: Express-AM5
22 Processing sat: Express-AT2
23 Processing sat: Telstar-18-Vantage
24 Processing sat: Apstar-6D
25 Processing sat: Apstar-6C
26 Processing sat: JCSAT-12
27 Processing sat: JCSAT-5A
28 Processing sat: Vinasat-1
29 Processing sat: Vinasat-2
30 Processing sat: ChinaSat-6C
31 Processing sat: ChinaSat-2D
32 Processing sat: LaoSat-1
33 Processing sat: JCSAT-3A
34 Processing sat: Cosmos-2526
3

### Clean our Dataframes

##### Define Helper Functions

In [733]:


def extract_until_last_capital(s):
    """Return ``s`` cut off after its last capital letter (A-Z).

    The search is greedy, so the prefix ends at the last capital that can be
    reached without crossing a newline. When ``s`` has no capital letter it
    is returned unchanged.
    """
    found = re.search(r'(.*[A-Z])', s)
    return found.group(1) if found else s
def find_pattern_and_join(entry):
    """Extract capitalised words from ``entry`` and join them with spaces.

    A word here is one capital letter followed by one or more lowercase
    letters (for example ``Eng``). Returns an empty string when nothing matches.
    """
    words = (found.group(1) for found in re.finditer(r"([A-Z][a-z]+)", entry))
    return ' '.join(words)
def split_beam_eirp(entry, beam_eirp_index = 0):
    """Split ``entry`` wherever a digit is directly followed by a non-digit.

    Parameters:
        entry: the string to split.
        beam_eirp_index: which piece to return (0 = the part up to and
            including the first such digit).

    Returns:
        The requested piece, or ``None`` when ``entry`` has fewer pieces.
    """
    pieces = re.split(r'(?<=\d)(?=\D)', entry)
    # The original had a bare 'None' expression here (a no-op, so the function
    # fell through and returned None). The return is now explicit, and the
    # same rule is applied to every index instead of raising IndexError.
    if len(pieces) > beam_eirp_index:
        return pieces[beam_eirp_index]
    return None

In [734]:
# URLs of satellites whose CSV has no rows; filled by the cleaning loop below.
skipped_sats_url = []

#### Perform The Cleaning

In [735]:


# Clean every "assigned" CSV: drop unused columns, split the composite SR/FEC
# column, and derive a Languages column from Audio. Results go to assigned/cleaned/.
for index, sat_name in enumerate(sat_url_names):
    # '/' becomes '-' so the name is usable as a file name
    safe_sat_name = sat_name.replace('/', '-')
    hor_df = pd.read_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/{index + 1}_{safe_sat_name}_channels.csv')

    print(f'{index + 1} Processing sat: {safe_sat_name}')

    # Nothing to clean: remember the satellite URL and copy the empty file through.
    if len(hor_df.index) == 0:
        url = f'{baseurl}{sat_name}.html'
        skipped_sats_url.append(url)
        hor_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/cleaned/{index+1}_{safe_sat_name}_channels.csv', index=False)
        continue

    # Drop irrelevant columns
    hor_df.drop(columns=['undef','Src_Updated', 'Freq/beam', 'Encryption'], inplace = True)

    # Split SR/FEC by position: last 3 characters -> FEC, the digits found in
    # the 5 characters before that -> SR, first 6 characters -> SYSTEM.
    # Best-effort: a column without string values is reported and skipped.
    # `except Exception` replaces the original bare `except:`, so Ctrl-C still works.
    try:
        hor_df['FEC'] = hor_df['SR/FEC'].str[-3:]
        # Raw string: the original '(\d+)' is an invalid escape sequence in a plain string.
        hor_df['SR'] = hor_df['SR/FEC'].str[-8:-3].str.extract(r'(\d+)', expand=False)
        hor_df['SYSTEM'] = hor_df['SR/FEC'].str[0:6]
    except Exception as exc:
        print(f'PROBLEM YO ({safe_sat_name}): {exc!r}')

    # Drop the now-old composite column
    hor_df.drop('SR/FEC', axis=1, inplace=True)

    # Extract Languages: capitalised words found in the Audio text
    hor_df['Audio'] = hor_df['Audio'].astype(str)

    hor_df['Languages'] = hor_df['Audio'].apply(find_pattern_and_join)

    hor_df.drop('Audio', axis=1, inplace=True)

    hor_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/cleaned/{index+1}_{safe_sat_name}_channels.csv', index=False)

  hor_df['SR'] = hor_df['SR/FEC'].str[-8:-3].str.extract('(\d+)')


1 Processing sat: NSS-9
2 Processing sat: Intelsat-18
3 Processing sat: Eutelsat-172B
4 Processing sat: Horizons-3e
5 Processing sat: Intelsat-19
6 Processing sat: ChinaSat-19
7 Processing sat: Superbird-B3
8 Processing sat: Optus-D2
9 Processing sat: ABS-6
10 Processing sat: Optus-D3
11 Processing sat: Optus-10
12 Processing sat: JCSAT-2B
13 Processing sat: Optus-D1
14 Processing sat: BRIsat
15 Processing sat: JCSAT-1C
16 Processing sat: Nusantara-Satu
17 Processing sat: Express-AMU7
18 Processing sat: JCSAT-16
19 Processing sat: Superbird-C2
20 Processing sat: Apstar-9
21 Processing sat: Express-AM5
22 Processing sat: Express-AT2
23 Processing sat: Telstar-18-Vantage
24 Processing sat: Apstar-6D
25 Processing sat: Apstar-6C
26 Processing sat: JCSAT-12
27 Processing sat: JCSAT-5A
28 Processing sat: Vinasat-1
29 Processing sat: Vinasat-2
30 Processing sat: ChinaSat-6C
31 Processing sat: ChinaSat-2D
32 Processing sat: LaoSat-1
33 Processing sat: JCSAT-3A
34 Processing sat: Cosmos-2526
3

In [None]:
# View the URLs that were skipped from cleaning due to lack of data
skipped_sats_url

In [737]:
# View the current output for a cleaned dataframe
# `index + 1` reproduces the 1-based file prefix used when the per-satellite CSVs were written.
index = 1
safe_sat_name = 'Intelsat-18'

my_df = pd.read_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/cleaned/{index+1}_{safe_sat_name}_channels.csv')


In [738]:
my_df.head()

Unnamed: 0,SID,Provider/Channel,Compression,VPID,freq_alt,beam_alt,eirp_alt,encryption_alt,Provider_URL,Channel,Channel_URL,is_Provider,Provider,FEC,SR,SYSTEM,Languages
0,,American Forces Network,,,3753 R,tp NH1C,,"PowerVu,Verimatrix",https://www.lyngsat.com/providers/us/American-...,,,True,American Forces Network,2/3,30000.0,DVB-S2,
1,2.0,AFN Prime Atlantic,MPEG-4/HD,1260.0,3753 R,tp NH1C,,"PowerVu,Verimatrix",,AFN Prime Atlantic,https://www.lyngsat.com/tvchannels/us/AFN-Prim...,False,American Forces Network,2/3,30000.0,DVB-S2,Eng
2,3.0,AFN News,MPEG-4/HD,1560.0,3753 R,tp NH1C,,"PowerVu,Verimatrix",,AFN News,https://www.lyngsat.com/tvchannels/us/AFN-News...,False,American Forces Network,2/3,30000.0,DVB-S2,Eng
3,4.0,AFN Sports,MPEG-4/HD,1160.0,3753 R,tp NH1C,,"PowerVu,Verimatrix",,AFN Sports,https://www.lyngsat.com/tvchannels/us/AFN-Spor...,False,American Forces Network,2/3,30000.0,DVB-S2,Eng
4,5.0,AFN Prime Pacific,MPEG-4/HD,1460.0,3753 R,tp NH1C,,"PowerVu,Verimatrix",,AFN Prime Pacific,https://www.lyngsat.com/tvchannels/us/AFN-Prim...,False,American Forces Network,2/3,30000.0,DVB-S2,Eng


## Get Country for Provider

In [755]:


def get_country(url):
    """Download ``url`` and return every ``<table>`` whose ``width`` is ``"700"``.

    Despite its name, this returns the matching table elements, not a
    country value. The result is empty when the page has no such table.
    """
    # A timeout keeps a stalled connection from hanging the notebook.
    page = requests.get(url, timeout=30)
    soup = bs(page.text, 'html')

    table = soup.find_all('table',{'width':"700"} )

    return table
# Regex for a two-character, non-digit path segment enclosed in slashes, such as
# the "/us/" in ".../providers/us/American-Forces-Network.html".
# Written as a raw string so that "\D" reaches the regex engine unchanged instead
# of being treated by Python as an (invalid) string escape.
my_pattern = r"/\D\D/"
_COUNTRY_SEGMENT_RE = re.compile(my_pattern)


def extract_country(url):
    """Return the two-character country segment embedded in a URL path.

    Parameters
    ----------
    url : object
        Usually a URL string. Any other value (for example the ``NaN`` pandas
        uses for an empty cell, or ``None``) is converted with ``str`` first;
        such values contain no ``/xx/`` segment and therefore yield ``None``.

    Returns
    -------
    str or None
        The first two-character, non-digit segment found between two slashes
        (``"us"`` for ``".../providers/us/..."``), or ``None`` when the text
        contains no such segment.
    """
    match = _COUNTRY_SEGMENT_RE.search(str(url))
    if match is None:
        return None
    # The match includes the enclosing slashes ("/us/"); strip them.
    return match.group()[1:-1]


In [756]:
# Empty accumulator table with one row per (satellite, provider, country).
# NOTE(review): the first column is spelled 'Sattelite' here, while a later cell
# that rebuilds `new_df` uses 'Satellite' -- confirm which spelling downstream
# consumers expect before unifying them.
new_df = pd.DataFrame(columns = ['Sattelite', 'Provider', 'Country'])


In [774]:

## Extract Provider / Channel Country
# For every satellite, read its per-satellite CSV, derive a country column from
# the two-character path segment of each row's URL (via extract_country), and
# write the result into the output directory of the selected pass.
x_df = None

# True  -> provider pass: 'Provider_URL' -> 'Provider_Country', written to prov_country/
# False -> channel pass:  'Channel_URL'  -> 'Channel_Country',  written to chan_country/
netw_flag = True

cleaned_dir = 'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v5/assigned/cleaned'

# The pass settings do not change between satellites, so pick them once.
if netw_flag:
    new_col_name, ref_col, dir_name = 'Provider_Country', 'Provider_URL', 'prov_country'
else:
    new_col_name, ref_col, dir_name = 'Channel_Country', 'Channel_URL', 'chan_country'

for index, sat_name in enumerate(sat_url_names):
    # '/' cannot appear in a file name, so it is replaced with '-'.
    safe_sat_name = sat_name.replace('/', '-')
    file_name = f'{index + 1}_{safe_sat_name}_channels.csv'

    # NOTE: the input is always read from chan_country/, whichever pass is
    # selected. May cause problems; remember to change the chan_country dir.
    hor_df = pd.read_csv(f'{cleaned_dir}/chan_country/{file_name}')
    print(f'{index + 1} Processing sat: {safe_sat_name}')

    if len(hor_df.index) == 0:
        # Nothing to annotate; the empty table is still copied to the output dir.
        print(f'Skipping sattelite: {safe_sat_name}')
    else:
        try:
            hor_df[new_col_name] = hor_df[ref_col].apply(extract_country)
        except Exception as exc:
            # Best effort: report what went wrong and still write the table
            # (without the new column) so the output set stays complete.
            # Unlike a bare `except:`, this lets KeyboardInterrupt stop the run.
            print(f'problem yo: could not derive {new_col_name} for {safe_sat_name}: {exc!r}')

    hor_df.to_csv(f'{cleaned_dir}/{dir_name}/{file_name}', index=False)


1 Processing sat: NSS-9
url =>  nan
url =>  https://www.lyngsat.com/providers/us/American-Forces-Network.html
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
2 Processing sat: Intelsat-18
url =>  https://www.lyngsat.com/providers/us/American-Forces-Network.html
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  https://www.lyngsat.com/providers/us/Anuvu.html
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  nan
url =>  na

In [763]:
# Display the scratch variable `x_df`.
# NOTE(review): the country-extraction cell above sets `x_df = None` and never
# reassigns it, so on a fresh top-to-bottom run this shows nothing; any Series
# output stored with this cell presumably comes from an earlier experiment.
x_df
# new_df.to_csv(f'E:/AUC/23-24/Spring/Database/Project/CSV Files/providers/v4/assigned/cleaned/prov_country/prov_with_country_from_url.csv', index=False)

1     us
2     us
3     us
4     us
5     us
6     us
7     us
8     us
9     us
10    us
11    us
12    us
13    us
14    us
15    us
16    us
Name: Channel_URL, dtype: object

In [None]:

# Scrape the page of every provider listed in `my_df` and record the country
# shown there, producing one (Satellite, Provider, Country) row per provider.
# NOTE(review): `safe_sat_name` must still name the satellite `my_df` was loaded
# for; the per-satellite loop earlier in the notebook rebinds it, so re-run the
# cell that loads `my_df` before this one.
prov_urls = my_df['Provider_URL'].dropna().unique()

# prov_urls = ['https://www.lyngsat.com/tvchannels/us/AFN-Prime-Atlantic.html']
country_records = []
for url in prov_urls:
    table = get_country(url)

    # One list of <tr> tags per table, flattened into a single list of rows.
    column_data = flatten_comprehension(extract_rows_from_tables(table))
    col = extract_raw_data(column_data)

    # Require more than one row (as before) and a first row that actually has
    # a <td> cell, so col[0][0] cannot raise IndexError on header-only rows.
    if len(col) > 1 and col[0]:
        provider = my_df.loc[my_df['Provider_URL'] == url, 'Provider/Channel'].iloc[0]
        # The country is the last line of text in the first cell.
        country = col[0][0].split('\n')[-1]
        country_records.append([safe_sat_name, provider, country])

# Build the frame once instead of growing it row by row inside the loop.
new_df = pd.DataFrame(country_records, columns=['Satellite', 'Provider', 'Country'])

In [682]:
# Display the provider/country table held in `new_df`.
new_df

Unnamed: 0,Sattelite,Provider,Country
0,Intelsat-18,American Forces Network,United States
1,Intelsat-18,Anuvu,United States
2,Intelsat-18,France TV,France


'United States'