In [1]:
#make sure you have the Selenium library installed. You can install it using pip: pip install selenium
#make sure to install pip install pymongo[srv]
from splinter import Browser
import time
import pandas as pd
from bs4 import BeautifulSoup
from pymongo import MongoClient
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from urllib.parse import quote_plus
from pymongo import MongoClient
from urllib.parse import quote_plus 

# Initialize browser
browser = Browser('chrome', executable_path='C:\\chromedriver_win32\\chromedriver.exe', headless=False)

# List of NFL teams
teams = ['arizona-cardinals', 'atlanta-falcons', 'baltimore-ravens', 'buffalo-bills', 
         'carolina-panthers', 'chicago-bears', 'cincinnati-bengals', 'cleveland-browns', 
         'dallas-cowboys', 'denver-broncos', 'detroit-lions', 'green-bay-packers', 
         'houston-texans', 'indianapolis-colts', 'jacksonville-jaguars', 'kansas-city-chiefs', 
         'las-vegas-raiders', 'los-angeles-chargers', 'los-angeles-rams', 'miami-dolphins', 
         'minnesota-vikings', 'new-england-patriots', 'new-orleans-saints', 'new-york-giants', 
         'new-york-jets', 'philadelphia-eagles', 'pittsburgh-steelers', 'san-francisco-49ers', 
         'seattle-seahawks', 'tampa-bay-buccaneers', 'tennessee-titans', 'washington-football-team']
all_data = []
for year in range(2020, 2024):
    for team in teams:
        # Visit the page
        url = f"https://www.spotrac.com/nfl/{team}/cap/{year}"
        browser.visit(url)

        # Create BeautifulSoup object; parse with 'html.parser'
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Find the correct table with Active Players
        all_tables = soup.find_all('table')
        active_table = None
        for table in all_tables:
            ths = table.find_all('th')
            for th in ths:
                if 'Active Players' in th.text:
                    active_table = table
                    break
            if active_table is not None:
                break

        # Retrieve all elements that contain player salary information
        players = active_table.find_all('tr')

        # Iterate through each player
        for player in players:
            # Use Beautiful Soup's find() method to navigate and retrieve attributes
            name_tag = player.find('td', class_='player')
            if name_tag is not None:
                a_tag = name_tag.find('a')
                if a_tag is not None:
                    name = a_tag.text
                    # Find all td tags within the player's row
                    all_tds = player.find_all('td')
                    if len(all_tds) >= 12:  # There should be at least 12 td tags if the player row is valid
                        position = all_tds[1].find('span').text
                        cap_hit = all_tds[2].find('span').text.strip()
                        base_salary = all_tds[3].find('span').text.strip()
                        cap_percentage = all_tds[11].text.strip()
                        
                        player_data = {
                            "year": year,
                            "team": team,
                            "name": name,
                            "position": position,
                            "cap_hit": cap_hit,
                            "base_salary": base_salary,
                            "cap_percentage": cap_percentage
                        }
                        all_data.append(player_data)

# Convert to a DataFrame and export to a csv file
df = pd.DataFrame(all_data)

# Reset the index to start from 1 and name the index column as "index"
df.index = df.index + 1
df.index.name = "index"


# Close the browser
browser.quit()



In [2]:

def parse_win_loss_record(record):
    # Split the win-loss-tie record and extract win, loss, and tie values
    parts = record.split('-')
    if len(parts) == 3:
        win, loss, tie = parts
        return int(win), int(loss), int(tie)
    else:
        return None, None, None

def scrape_win_loss_records():
    base_url = "https://www.teamrankings.com/nfl/trends/win_trends/"
    years = range(2010, 2023)

    # Initialize lists to store the extracted data
    years_list = []
    team_names = []
    wins = []
    losses = []
    ties = []

    # Start the ChromeDriver service
    service = Service('C:\\chromedriver_win32\\chromedriver.exe')  # Replace with the actual path to chromedriver
    service.start()

    # Create a new instance of the Chrome driver
    driver = webdriver.Chrome(service=service)

    # Open the URL in Chrome
    driver.get(base_url)

    # Loop through each year
    for year in years:
        # Find the dropdown element to select the year
        dropdown = driver.find_element_by_id('range')

        # Find the option corresponding to the current year
        option = dropdown.find_element_by_xpath(f"//option[@value='yearly_{year}']")
        option.click()

        # Wait for a short duration for the page to load
        time.sleep(2)

        # Get the page source after selecting the year
        page_source = driver.page_source

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'html.parser')

        # Find the table containing the win-loss records
        table = soup.find('table', class_='tr-table')

        if table:
            # Loop through each row in the table and extract the team name and win-loss record
            for row in table.find_all('tr'):
                columns = row.find_all('td')
                if len(columns) == 5:  # Ensure we have a valid row with data
                    team_name = columns[0].text.strip()
                    win_loss_record = columns[1].text.strip()

                    # Split win-loss-tie record into separate columns
                    win, loss, tie = parse_win_loss_record(win_loss_record)

                    years_list.append(year)
                    team_names.append(team_name)
                    wins.append(win)
                    losses.append(loss)
                    ties.append(tie)

            # Wait for a short duration before fetching data for the next year
            time.sleep(1)
        else:
            print(f"No data found for the year {year}")

    # Close the ChromeDriver
    driver.quit()

    # Combine data into a list of tuples
    data = list(zip(years_list, team_names, wins, losses, ties))

    # Create a Pandas DataFrame
    df_nfl_score = pd.DataFrame(data, columns=['Year', 'Team', 'Win', 'Loss', 'Tie'])
    # Return the DataFrame
    return df_nfl_score
    

    

# Example usage:
df_nfl_score = scrape_win_loss_records()
# Mapping of names from nfl_scores to nfl_salaries
team_name_mapping = {
    'Baltimore': 'baltimore-ravens',
    'San Francisco': 'san-francisco-49ers',
    'Kansas City': 'kansas-city-chiefs',
    'Green Bay': 'green-bay-packers',
    'New Orleans': 'new-orleans-saints',
    'New England': 'new-england-patriots',
    'Seattle': 'seattle-seahawks',
    'Minnesota': 'minnesota-vikings',
    'Houston': 'houston-texans',
    'Buffalo': 'buffalo-bills',
    'Tennessee': 'tennessee-titans',
    'LA Rams': 'los-angeles-rams',
    'Philadelphia': 'philadelphia-eagles',
    'Dallas': 'dallas-cowboys',
    'Pittsburgh': 'pittsburgh-steelers',
    'Chicago': 'chicago-bears',
    'Atlanta': 'atlanta-falcons',
    'Las Vegas': 'las-vegas-raiders',
    'Tampa Bay': 'tampa-bay-buccaneers',
    'Denver': 'denver-broncos',
    'NY Jets': 'new-york-jets',
    'Indianapolis': 'indianapolis-colts',
    'Cleveland': 'cleveland-browns',
    'Jacksonville': 'jacksonville-jaguars',
    'Arizona': 'arizona-cardinals',
    'LA Chargers': 'los-angeles-chargers',
    'Miami': 'miami-dolphins',
    'Carolina': 'carolina-panthers',
    'NY Giants': 'new-york-giants',
    'Detroit': 'detroit-lions',
    'Washington': 'washington-football-team',
    'Cincinnati': 'cincinnati-bengals',
}

# Replace team names in nfl_scores DataFrame
df_nfl_score['Team'] = df_nfl_score['Team'].map(team_name_mapping)



  dropdown = driver.find_element_by_id('range')
  option = dropdown.find_element_by_xpath(f"//option[@value='yearly_{year}']")


In [4]:


username = quote_plus("binhdole")
password = quote_plus("1Nnov@t1on")

# Connection
uri = f"mongodb+srv://{username}:{password}@cluster0.dupn17e.mongodb.net/mydatabase"
client = MongoClient(uri)
db = client.mydatabase

# Convert DataFrame to dict

data_nfl_scores = df_nfl_score.to_dict('records')

# Choose the collections

collection_scores = db['nfl_scores']

# Insert the data

collection_scores.insert_many(data_nfl_scores)

<pymongo.results.InsertManyResult at 0x2674a8ef600>

In [5]:
from pymongo import MongoClient
from urllib.parse import quote_plus
import json

# MongoDB connection
username = quote_plus("binhdole")
password = quote_plus("1Nnov@t1on")
uri = f"mongodb+srv://{username}:{password}@cluster0.dupn17e.mongodb.net/mydatabase"
client = MongoClient(uri)
db = client.mydatabase

# Read data from JSON file
with open(r'C:\Users\binhd\Documents\Sports_Project\data.json') as json_file:
    data_nfl_salaries = json.load(json_file)


# Choose the collection
collection_salaries = db['nfl_salaries']

# Insert the data
collection_salaries.insert_many(data_nfl_salaries)


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (384699840.py, line 13)