# Name: Balakrishna Mupparaju
# Assignment: Project Milestone 3

In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import process
import warnings
warnings.filterwarnings("ignore")
# ===============================
# Scrape the Wikipedia S&P 500 data directly from the HTML source.
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
response = requests.get(wiki_url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'id': 'constituents'})
rows = table.find_all('tr')




In [6]:

# Gather data from the table rows (skip the header row)
tickers = []
companies = []
sectors = []
industries = []
date_added = []  # Date on which the company was added to the index

for row in rows[1:]:  # Skipping header row
    cols = row.find_all('td')
    tickers.append(cols[0].text.strip())
    companies.append(cols[1].text.strip())
    sectors.append(cols[3].text.strip())
    industries.append(cols[4].text.strip())
    date_added.append(cols[5].text.strip())

# Create DataFrame from the scraped data
wiki_data = pd.DataFrame({
    "Ticker": tickers,
    "Company_Name": companies,
    "Sector": sectors,
    "Industry": industries,
    "Date_Added": date_added
})


In [9]:

# ===============================
# Step #1: Replace Headers
# Description: Rename headers for clarity and consistency with other data sources.
wiki_data.rename(columns={
    "Sector": "Industry_Sector"  # Aligns with Kaggle and other datasets
}, inplace=True)
print("Step #1: Replaced Headers")
print(wiki_data.head(), "\n")


Step #1: Replaced Headers
  Ticker         Company_Name                 Industry_Sector  \
0    MMM                   3M        Industrial Conglomerates   
1    AOS          A. O. Smith               Building Products   
2    ABT  Abbott Laboratories           Health Care Equipment   
3   ABBV               AbbVie                   Biotechnology   
4    ACN            Accenture  IT Consulting & Other Services   

                  Industry  Date_Added  
0    Saint Paul, Minnesota  1957-03-04  
1     Milwaukee, Wisconsin  2017-07-26  
2  North Chicago, Illinois  1957-03-04  
3  North Chicago, Illinois  2012-12-31  
4          Dublin, Ireland  2011-07-06   



In [11]:

# ===============================
# Step #2: Fix Casing/Inconsistent Values
# Description: Convert text columns to Title Case to ensure consistency.
wiki_data['Company_Name'] = wiki_data['Company_Name'].str.title()
wiki_data['Industry_Sector'] = wiki_data['Industry_Sector'].str.title()
wiki_data['Industry'] = wiki_data['Industry'].str.title()
print("Step #2: Fixed casing in text columns")
print(wiki_data.head(), "\n")



Step #2: Fixed casing in text columns
  Ticker         Company_Name                 Industry_Sector  \
0    MMM                   3M        Industrial Conglomerates   
1    AOS          A. O. Smith               Building Products   
2    ABT  Abbott Laboratories           Health Care Equipment   
3   ABBV               Abbvie                   Biotechnology   
4    ACN            Accenture  It Consulting & Other Services   

                  Industry  Date_Added  
0    Saint Paul, Minnesota  1957-03-04  
1     Milwaukee, Wisconsin  2017-07-26  
2  North Chicago, Illinois  1957-03-04  
3  North Chicago, Illinois  2012-12-31  
4          Dublin, Ireland  2011-07-06   



In [13]:
# ===============================
# Step #3: Identify and Remove Duplicates
# Description: Remove duplicate entries by Ticker & Company_Name to ensure unique records.
num_duplicates = wiki_data.duplicated(subset=['Ticker', 'Company_Name']).sum()
print(f"Step #3: Found {num_duplicates} duplicate(s) before removal.")
wiki_data = wiki_data.drop_duplicates(subset=['Ticker', 'Company_Name'])
print("Duplicates removed.")
print(wiki_data.head(), "\n")



Step #3: Found 0 duplicate(s) before removal.
Duplicates removed.
  Ticker         Company_Name                 Industry_Sector  \
0    MMM                   3M        Industrial Conglomerates   
1    AOS          A. O. Smith               Building Products   
2    ABT  Abbott Laboratories           Health Care Equipment   
3   ABBV               Abbvie                   Biotechnology   
4    ACN            Accenture  It Consulting & Other Services   

                  Industry  Date_Added  
0    Saint Paul, Minnesota  1957-03-04  
1     Milwaukee, Wisconsin  2017-07-26  
2  North Chicago, Illinois  1957-03-04  
3  North Chicago, Illinois  2012-12-31  
4          Dublin, Ireland  2011-07-06   



In [15]:
# ===============================
# Step #4: Format Date Fields
# Description: Convert the 'Date_Added' column to datetime format for proper date operations.
wiki_data['Date_Added'] = pd.to_datetime(wiki_data['Date_Added'], errors='coerce')
print("Step #4: Converted 'Date_Added' to datetime format")
print(wiki_data[['Ticker', 'Date_Added']].head(), "\n")



Step #4: Converted 'Date_Added' to datetime format
  Ticker Date_Added
0    MMM 1957-03-04
1    AOS 2017-07-26
2    ABT 1957-03-04
3   ABBV 2012-12-31
4    ACN 2011-07-06 



In [17]:
# ===============================
# Step #5: Clean and Trim Text Fields / Remove Missing Critical Data
# Description: Trim extra whitespace in the 'Ticker' column, standardize it, and drop rows with missing critical values.
wiki_data['Ticker'] = wiki_data['Ticker'].str.strip().str.upper() 
wiki_data = wiki_data.dropna(subset=['Ticker'])
print("Step #5: Trimmed Ticker text and dropped rows with missing Ticker")
print(wiki_data.head(), "\n")



Step #5: Trimmed Ticker text and dropped rows with missing Ticker
  Ticker         Company_Name                 Industry_Sector  \
0    MMM                   3M        Industrial Conglomerates   
1    AOS          A. O. Smith               Building Products   
2    ABT  Abbott Laboratories           Health Care Equipment   
3   ABBV               Abbvie                   Biotechnology   
4    ACN            Accenture  It Consulting & Other Services   

                  Industry Date_Added  
0    Saint Paul, Minnesota 1957-03-04  
1     Milwaukee, Wisconsin 2017-07-26  
2  North Chicago, Illinois 1957-03-04  
3  North Chicago, Illinois 2012-12-31  
4          Dublin, Ireland 2011-07-06   



In [19]:
# ===============================
# Final Cleaned Dataset
print("Cleaned Website Dataset (Wikipedia) after all transformations:")
print(wiki_data.to_string(index=False))


Cleaned Website Dataset (Wikipedia) after all transformations:
Ticker                           Company_Name                                         Industry_Sector                                    Industry Date_Added
   MMM                                     3M                                Industrial Conglomerates                       Saint Paul, Minnesota 1957-03-04
   AOS                            A. O. Smith                                       Building Products                        Milwaukee, Wisconsin 2017-07-26
   ABT                    Abbott Laboratories                                   Health Care Equipment                     North Chicago, Illinois 1957-03-04
  ABBV                                 Abbvie                                           Biotechnology                     North Chicago, Illinois 2012-12-31
   ACN                              Accenture                          It Consulting & Other Services                             Dublin, Ireland 2011-0

# Ethical Implications
In this data wrangling process, I made several modifications to the raw website data: I replaced ambiguous headers for clarity, standardized textual values by enforcing title case, removed duplicate records to ensure uniqueness, converted date strings to proper datetime objects for accurate temporal analysis, and trimmed critical fields to eliminate extra spaces and missing values. Although the data—sourced directly from Wikipedia—is publicly available and generally reputable, legal and regulatory guidelines require proper attribution and compliance with Wikipedia’s terms of use. The primary risks in these transformations include potential mismatches during fuzzy matching or inadvertent removal of valid data if duplicates are misidentified; additionally, assumptions such as converting all text to title case may not perfectly represent all company names. The data was sourced by scraping an official and publicly accessible page, and its credibility was verified through cross-referencing with other industry sources where possible. Data was acquired ethically under the guidelines for public web scraping, and to mitigate ethical risks, I documented all transformation assumptions, applied conservative duplicate removal criteria, and ensured that no personally sensitive information was manipulated or exposed.

In [None]:
import requests
import pandas as pd

# API Endpoint for stock data
api_url = "https://your-api-endpoint.com/data"

# Fetch API data
response = requests.get(api_url)
data = response.json()

# Load JSON into DataFrame
api_df = pd.DataFrame(data)


In [None]:
api_df.rename(columns={
    "ticker": "Ticker",
    "company_name": "Company_Name",
    "sector": "Industry_Sector",
    "industry": "Industry",
    "date": "Date_Added"
}, inplace=True)

print("Step #2: Replaced headers for consistency.")


In [None]:
api_df["Company_Name"] = api_df["Company_Name"].str.title()
api_df["Industry_Sector"] = api_df["Industry_Sector"].str.title()
api_df["Industry"] = api_df["Industry"].str.title()

print("Step #3: Standardized text formatting.")


In [None]:
api_df["Date_Added"] = pd.to_datetime(api_df["Date_Added"], errors="coerce")

print("Step #4: Converted date format.")


In [None]:
# Select only relevant columns from API data
api_df = api_df[["Ticker", "Company_Name", "Industry_Sector", "Industry", "Date_Added"]]

# Merge API data with Wikipedia data (if needed)
# Ensure both sources are using a consistent structure
merged_df = api_df.copy()  # Placeholder for merging logic if needed

print("Step #5: Final API dataset synchronized with Wikipedia dataset.")
print(merged_df.head())
