In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

#Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the dataset
file_path = "CPSData.csv"  # Update the path if needed
data = pd.read_csv(file_path)

# Display basic information about the dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
print("Dataset Overview:")
data.info()

In [None]:
# Exploratory Visualization Example 1: Age Distribution
# Histogram (30 bins) of the 'Age' column of `data`, with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
exploratory_age_ax = sns.histplot(data=data, x='Age', bins=30, kde=True)
exploratory_age_ax.set_title("Age Distribution")
exploratory_age_ax.set_xlabel("Age")
exploratory_age_ax.set_ylabel("Frequency")


In [None]:
# Display the first few rows of the dataset: blank line, heading, then the preview
print("\nFirst Few Rows:", data.head(), sep="\n")

In [None]:
# Removed MetroAreaCode column
## Dropped because we did not use it and did not have the code dictionary to determine area code and make it usable
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the
# cell again is a no-op instead of raising KeyError.
data = data.drop(columns=['MetroAreaCode'], errors='ignore')

In [None]:
# Confirm that MetroAreaCode no longer appears among the column labels
remaining_columns = data.columns
print(remaining_columns)

In [None]:
# Keep only rows with Age of 15 or more, as the legal working age in some states starts at 15
## Anything younger may be self-report error and not accurate
is_working_age = data['Age'].ge(15)
data = data[is_working_age]

In [None]:
# Count the rows for every (household size, education level) combination
householdEducationGroups = data.groupby(['PeopleInHousehold', 'Education'])
groupedData = householdEducationGroups.size().reset_index(name='Count')

In [None]:
# Reshape to wide form for plotting: one row per household size, one column per
# education level; combinations without any rows become 0
pivotData = groupedData.pivot(
    index='PeopleInHousehold',
    columns='Education',
    values='Count',
)
pivotData = pivotData.fillna(0)

In [None]:
# Apply filters and modifications
filteredData = (
    data[data['Education'] != 'Professional degree']  # Exclude rows with 'Professional degree'
    .query('PeopleInHousehold < 8')                   # Include rows with household size less than 8
    .copy()  # Independent frame, so the .loc assignment below never targets a selection of `data`
)

# Update the 'Race' column to "Hispanic" wherever the 'Hispanic' column is 1
filteredData.loc[filteredData['Hispanic'] == 1, 'Race'] = 'Hispanic'

# Drop the 'Hispanic' column
filteredData = filteredData.drop(columns=['Hispanic'])

# Display the resulting filtered data
filteredData.head()

In [None]:
# Normalizing data
# Toggle: True converts each pivotData row to proportions of its row total,
# False leaves the raw counts untouched.
normalize = False

if normalize:
    household_totals = pivotData.sum(axis=1)
    pivotData = pivotData.div(household_totals, axis=0)

In [None]:
# Custom hex colors, cycled through by the education-level bars
customColors = [
    '#0c3547',
    '#10656d',
    '#598f91',
    '#93b071',
    '#ede2cc',
    '#edae93',
    '#dd6670',
    '#eeb3b8',
]

In [None]:
# Visualization: Age Distribution of Sample
# Histogram (30 bins) with a KDE curve of 'Age' in filteredData
plt.figure(figsize=(10, 6))
sample_age_ax = sns.histplot(data=filteredData, x='Age', bins=30, kde=True)
sample_age_ax.set_title("Age Distribution")
sample_age_ax.set_xlabel("Age")
sample_age_ax.set_ylabel("Frequency")

# Write the chart to a PNG file, then display it
plt.savefig("age_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
###Question 1: Demographic Insight
#How do demographic factors (e.g., age, race, education level, household size) vary across regions, and what implications does this have for employment and socioeconomic outcomes?
##Focus: Understanding regional diversity and identifying disparities or opportunities in employment.
##Visualizations: Bar charts, pie charts, or histograms showing the distribution of individuals by region, state, age, and race.

In [None]:
# Employment Status by Region: row counts per employment status, one bar color per region
plt.figure(figsize=(12, 8))
region_ax = sns.countplot(data=filteredData, x='EmploymentStatus', hue='Region')
region_ax.set_title("Employment Status by Region")
region_ax.set_xlabel("Employment Status")
region_ax.set_ylabel("Count")
region_ax.legend(title="Region")
plt.xticks(rotation=45)

# Write the chart to a PNG file, then display it
plt.savefig("employment_by_region.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pie Chart: Breakdown by Region with Different Colors
# Number of filteredData rows per region
region_counts = filteredData['Region'].value_counts()

# Total population is the sum of the per-region counts
total_population = region_counts.sum()

# One color from the 'Paired' colormap per slice
slice_count = len(region_counts)
colors = plt.cm.Paired(range(slice_count))

# Label formatter for pie wedges: percentage plus the absolute count it represents
def autopct_format(pct, all_vals):
    """Return a two-line wedge label: ``pct`` with one decimal, then the count.

    The count is ``pct`` percent of ``sum(all_vals)``, rounded to the nearest integer
    and shown in parentheses on the second line.
    """
    total = sum(all_vals)
    fraction = pct / 100.
    absolute = int(round(fraction * total))
    return f"{pct:.1f}%\n({absolute})"

# Plot the pie chart; every wedge is annotated through autopct_format
plt.figure(figsize=(8, 8))
pie_options = dict(
    labels=region_counts.index,
    autopct=lambda pct: autopct_format(pct, region_counts),
    startangle=140,
    colors=colors,  # Use the defined color palette
)
plt.pie(region_counts, **pie_options)
plt.title("Breakdown by Region")

# Subtitle with the total, centered at (0, -1.2) in data coordinates
plt.text(0, -1.2, f"Total Population: {total_population}", ha='center', va='center', fontsize=12)

# Write the chart to a PNG file, then display it
plt.savefig("distribution_by_region.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Visualization of Education Levels by State
# Rows missing either State or Education are dropped before counting
complete_state_rows = data.dropna(subset=['State', 'Education'])
education_state_data = (
    complete_state_rows.groupby(['State', 'Education']).size().unstack(fill_value=0)
)

# Stacked bar chart: one bar per state, one segment per education level
state_ax = education_state_data.plot.bar(stacked=True, figsize=(16, 10), colormap='tab10')
state_ax.set_title("Education Levels by State")
state_ax.set_xlabel("State")
state_ax.set_ylabel("Count")
state_ax.legend(title="Education Level", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=90)
plt.tight_layout()

# Write the chart to a PNG file, then display it
plt.savefig("education_by_state.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Rows per (Sex, EmploymentStatus), reshaped to one column per employment status
sex_status_sizes = filteredData.groupby(['Sex', 'EmploymentStatus']).size()
employment_gender = sex_status_sizes.unstack(fill_value=0)

# Express each row as percentages of that row's total
sex_totals = employment_gender.sum(axis=1)
employment_gender_pct = employment_gender.div(sex_totals, axis=0) * 100

# Stacked bar chart: one bar per Sex value
gender_ax = employment_gender_pct.plot.bar(stacked=True, figsize=(12, 6))
gender_ax.set_title('Employment Status by Gender', fontsize=16)
gender_ax.set_xlabel('Sex', fontsize=12)
gender_ax.set_ylabel('Percentage (%)', fontsize=12)
gender_ax.legend(title='Employment Status', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

# Write the chart to a PNG file, then display it
plt.savefig("employment_by_gender.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
###Question 2: Employment Status and Trends
#What are the key employment patterns across different demographic groups (e.g., gender, age, education level, marital status)?
##Focus: Identify differences in employment status and education level,  based on gender, age, and marital status.
##Potential Insights: Identify employment barriers or disparities among different demographic groups (e.g., gender gaps, education-driven employment outcomes).
##Visualizations: Stacked bar charts and correlation heatmaps showing patterns of employment, unemployment, and participation across age groups, education levels, and gender.

In [None]:
# Employment Status by Age: rows per (Age, EmploymentStatus), one column per
# employment status, with 0 where a combination has no rows
age_status_sizes = data.groupby(['Age', 'EmploymentStatus']).size()
age_data = age_status_sizes.unstack().fillna(0)

In [None]:
# Visualization / Employment Status by Age: stacked bars showing how the employment
# status counts in age_data vary from one age to the next (one bar per age value,
# one segment per employment status).
employment_age_ax = age_data.plot.bar(stacked=True, figsize=(14, 8))
employment_age_ax.set_title('Employment Status and Trends Across Age Groups')
employment_age_ax.set_xlabel('Age')
employment_age_ax.set_ylabel('Number of Individuals')
employment_age_ax.legend(title='Employment Status')
plt.xticks(rotation=45)
plt.tight_layout()

# Write the chart to a PNG file, then display it
plt.savefig("employment_by_age.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Employment status by marital status: rows are employment statuses, columns are the
# values of the 'Married' column, cells are row counts (0 where a combination is absent).
# NOTE: despite its name, gender_data is grouped by 'Married', not by 'Sex'.
status_marital_sizes = data.groupby(['EmploymentStatus', 'Married']).size()
gender_data = status_marital_sizes.unstack().fillna(0)

In [None]:
# Visualization / Employment Status by Marital Status: stacked bar chart with one bar
# per employment status (the index of gender_data), segmented by the values of the
# 'Married' column (the columns of gender_data).
gender_data.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Employment Status and Trends by Marital Status')
plt.xlabel('Employment Status')  # The x axis is the EmploymentStatus index, not gender
plt.ylabel('Number of Individuals')
# Legend entries are the 'Married' values, so the legend is titled accordingly
plt.legend(title='Marital Status', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=0)

# Save the chart as a PNG file
plt.savefig("employment_by_marital.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
###Question 3: Employment Status and Trends
#What are the key employment patterns across different demographic groups (e.g., gender, age, education level, marital status)?
##Focus: Identify differences in employment status and education level,  based on gender, age, and marital status.
##Potential Insights: Identify employment barriers or disparities among different demographic groups (e.g., gender gaps, education-driven employment outcomes).
##Visualizations: Stacked bar charts and correlation heatmaps showing patterns of employment, unemployment, and participation across age groups, education levels, and gender.


In [None]:
# Grouped ("segmented") bar chart: one cluster per household size, one bar per education level
fig, ax = plt.subplots(figsize=(14, 8))
education_levels = pivotData.columns
n_levels = len(education_levels)
bar_width = 0.8 / n_levels  # The bars of one cluster together span 0.8 of an x unit
x = np.arange(len(pivotData.index))  # Left-most bar position of each cluster

for i, col in enumerate(education_levels):
    bar_color = customColors[i % len(customColors)]  # Cycle through the palette
    ax.bar(x + i * bar_width, pivotData[col], bar_width, label=col, color=bar_color)

# Titles and axis labels
ax.set_title('Household Size by Education Level', fontsize=16)
ax.set_xlabel('Household Size', fontsize=12)
ax.set_ylabel('Number of People', fontsize=12)

# Put each tick under the middle of its cluster and label it with the household size
cluster_centers = x + bar_width * (n_levels - 1) / 2
ax.set_xticks(cluster_centers)
ax.set_xticklabels(pivotData.index, fontsize=10)

# Legend outside the axes, dashed horizontal grid lines
ax.legend(title='Education Level', fontsize=10, loc='upper left', bbox_to_anchor=(1.05, 1))
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Save (300 dpi, tight bounding box) and display the plot
plt.tight_layout()
plt.savefig('HouseholdSizebyEducationLevel.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Step 2: Household-size summary per (Region, Race): mean, median and number of rows
household_stats = {
    'AvgHouseholdSize': ('PeopleInHousehold', 'mean'),
    'MedianHouseholdSize': ('PeopleInHousehold', 'median'),
    'Count': ('PeopleInHousehold', 'size'),
}
grouped_data = (
    filteredData
    .groupby(['Region', 'Race'])
    .agg(**household_stats)
    .reset_index()
)

# Preview the grouped data
grouped_data.head()

In [None]:
# Custom hex color for each region, looked up by region name
regionColors = dict(
    Midwest='#02a5ff',
    South='#ff3300',
    Northeast='#66cc33',
    West='#ffcc00',
)
# Create the scatter plot: one series per region, race on x, mean household size on y
plt.figure(figsize=(12, 6))
for region in grouped_data['Region'].unique():
    in_region = grouped_data['Region'] == region
    plt.scatter(
        grouped_data.loc[in_region, 'Race'],
        grouped_data.loc[in_region, 'AvgHouseholdSize'],
        label=region,
        color=regionColors.get(region),  # None (matplotlib's default) for unlisted regions
        alpha=0.7,
    )

scatter_ax = plt.gca()
scatter_ax.set_title('Average Household Size by Race and Region')
scatter_ax.set_xlabel('Race')
scatter_ax.set_ylabel('Average Household Size')
scatter_ax.legend(title='Region')
plt.xticks(rotation=45)
plt.tight_layout()

# Write the chart to a PNG file, then display it
plt.savefig("avghouse_by_race_region.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
###Question 4: Industry Representation and Employment 
#What industries dominate employment across different demographic groups, and which groups are underrepresented in certain industries?
##Focus: Highlight industries where specific demographic groups are over or underrepresented.
##Visualizations: Industry participation heatmaps or box plots illustrating demographic representation across industries.

In [None]:
# Industry Representation and Employment Analysis
# Creating a DataFrame grouped by demographic groups and industries
# Creating age groups
def age_group(age):
    """Return the age-bracket label for ``age``.

    Ages below 20 map to 'Below 20', ages from 20 up to (not including) 60 map to
    their decade label ('20-29' through '50-59'), and every other value maps to
    '60 and above'.
    """
    # (exclusive upper bound, label) pairs, checked in ascending order
    brackets = (
        (20, 'Below 20'),
        (30, '20-29'),
        (40, '30-39'),
        (50, '40-49'),
        (60, '50-59'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '60 and above'

# Adding AgeGroup column to the DataFrame.
# assign() returns a new frame rather than writing a column into `data`, which is a
# filtered selection by this point; direct column assignment on such a selection can
# trigger pandas' SettingWithCopyWarning.
data = data.assign(AgeGroup=data['Age'].apply(age_group))

# Row counts for every (Industry, Sex, Race, AgeGroup) combination
industry_representation = data.groupby(['Industry', 'Sex', 'Race', 'AgeGroup']).size().reset_index(name='Count')

In [None]:
# Visualization 1: Industry Representation by Gender using a Bar Chart
# Explanation 1: This bar chart helps us understand which industries are dominated by males or females, highlighting over or underrepresentation.

# Rows per (Industry, Sex), one column per Sex value, 0 where a combination is absent.
# DataFrame.plot opens its own figure, so the size is given through its figsize argument
# (no separate plt.figure call is needed).
gender_counts = data.groupby(['Industry', 'Sex']).size().unstack().fillna(0)
gender_counts.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Industry Representation by Gender')
plt.xlabel('Industry')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.tight_layout()

# Save the chart as a PNG file. The name describes this chart and differs from
# "distribution_by_region.png", which belongs to the region pie chart.
plt.savefig("industry_by_gender.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
# Visualization 2: Industry Representation by Race using a Bar Chart
# Explanation 2: This bar chart helps us visualize representation across different races in various industries, helping identify racial disparities.

# Rows per (Industry, Race), one column per Race value, 0 where a combination is absent.
# DataFrame.plot opens its own figure, so no plt.figure call precedes it; calling
# plt.figure first would leave an extra, empty figure in the output.
race_counts = data.groupby(['Industry', 'Race']).size().unstack().fillna(0)
race_counts.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Industry Representation by Race')
plt.xlabel('Industry')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.tight_layout()

# Save the chart as a PNG file
plt.savefig("industry_by_race.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
# Visualization 3: Box Plot of Age by Industry using Matplotlib
# The industry labels are computed once. Rows without an industry are left out:
# a missing value never compares equal to itself, so it cannot select any rows
# and would only contribute an empty box.
industries = data['Industry'].dropna().unique()
ages_by_industry = [
    data.loc[data['Industry'] == industry, 'Age'].dropna().to_numpy()
    for industry in industries
]

plt.figure(figsize=(14, 8))
box_positions = list(range(len(industries)))  # One x position per industry, starting at 0
plt.boxplot(ages_by_industry, positions=box_positions, widths=0.6)
plt.xticks(box_positions, industries, rotation=45, ha='right', fontsize=9)
plt.xlabel('Industry')
plt.ylabel('Age')
plt.title('Age Distribution by Industry')
plt.tight_layout()

# Save the chart as a PNG file
plt.savefig("age_by_industry.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
# Create age bins. With right=False every bin is [lower, upper), so the labels name
# the ages each bin actually contains (for example [18, 30) holds ages 18-29).
# The open-ended last edge keeps ages of 100 and over in '60+' instead of NaN.
bins = [0, 18, 30, 45, 60, np.inf]
labels = ['<18', '18-29', '30-44', '45-59', '60+']
# assign() returns a new frame rather than writing a column into `data`, which is a
# filtered selection by this point (avoids pandas' SettingWithCopyWarning).
data = data.assign(AgeGroup=pd.cut(data['Age'], bins=bins, labels=labels, right=False))

# Group by demographics and employment status; one column per employment status.
# AgeGroup is categorical: observed=True restricts the result to combinations that
# occur in the data, so no all-zero rows are created (those would become NaN
# percentages below).
grouped = (
    data.groupby(['Sex', 'Education', 'Married', 'AgeGroup', 'EmploymentStatus'], observed=True)
    .size()
    .unstack(fill_value=0)
)

# Calculate percentages within groups
group_percentages = grouped.div(grouped.sum(axis=1), axis=0) * 100


In [None]:
# Preview the within-group percentages. group_percentages is computed in the cell
# that builds `grouped`, so it is not recalculated here.
group_percentages.head()

In [None]:
# Aggregate by education and employment status: rows per (Education, EmploymentStatus),
# one column per employment status, 0 where a combination is absent
grouped_by_education = data.groupby(['Education', 'EmploymentStatus']).size().unstack(fill_value=0)

# Calculate percentages (each education level's row sums to 100)
education_percentages = grouped_by_education.div(grouped_by_education.sum(axis=1), axis=0) * 100

# Visualize as a stacked bar chart (matplotlib.pyplot is imported as plt in the
# notebook's first cell, so it is not imported again here)
education_percentages.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='viridis')
plt.title("Employment Status by Education Level")
plt.ylabel("Percentage")
plt.xlabel("Education Level")
plt.legend(title="Employment Status", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

# Save the chart as a PNG file
plt.savefig("employment_by_education.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
### Question 5: Barriers and Opportunities
#Are there correlations between household size and higher education attainment, especially in states with larger households?
##Focus: Investigate if larger households correlate with lower levels of higher education attainment and the systemic or cultural factors that may influence this trend
##Potential Insights: Understand how educational opportunities and systemic access impact family dynamics and educational outcomes.

In [None]:
# Rows per (Industry, Race), reshaped to one column per race with 0 where absent
industry_race_sizes = filteredData.groupby(['Industry', 'Race']).size()
industry_representation = industry_race_sizes.unstack(fill_value=0)

# Express each industry's row as percentages of that industry's total
industry_totals = industry_representation.sum(axis=1)
industry_representation_pct = industry_representation.div(industry_totals, axis=0) * 100

# Preview the representation percentages
industry_representation_pct.head()

In [None]:
# Rows per (Citizenship, Education), reshaped to one column per education level
citizenship_education_sizes = filteredData.groupby(['Citizenship', 'Education']).size()
education_citizenship = citizenship_education_sizes.unstack(fill_value=0)

# Express each citizenship group's row as percentages of that group's total
citizenship_totals = education_citizenship.sum(axis=1)
education_citizenship_pct = education_citizenship.div(citizenship_totals, axis=0) * 100

# Print the percentage table
print(education_citizenship_pct)

# Stacked bar chart: one bar per citizenship group, one segment per education level
citizenship_ax = education_citizenship_pct.plot.bar(stacked=True, figsize=(12, 6))
citizenship_ax.set_title('Education Distribution by Citizenship', fontsize=16)
citizenship_ax.set_xlabel('Citizenship', fontsize=12)
citizenship_ax.set_ylabel('Percentage (%)', fontsize=12)
citizenship_ax.legend(title='Education Level', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

# Write the chart to a PNG file, then display it
plt.savefig("education_by_citizenship.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Melt data for plotting: one row per (Industry, Demographic) with its percentage
industry_melted = industry_representation_pct.reset_index().melt(
    id_vars='Industry', var_name='Demographic', value_name='Representation'
)

# One array of representation percentages per demographic for the box plot.
# Deliberately not named `data`: that name holds the CPS DataFrame used by the
# other cells, and rebinding it to a list would break them on re-run.
demographics = industry_melted['Demographic'].unique()
representation_by_demographic = [
    industry_melted.loc[industry_melted['Demographic'] == demo, 'Representation'].to_numpy()
    for demo in demographics
]

# Plot box plot
plt.figure(figsize=(12, 6))
plt.boxplot(representation_by_demographic, patch_artist=True, boxprops=dict(facecolor="#66b3ff"))

# Add labels
plt.title('Representation Variability Across Industries by Demographics', fontsize=16)
plt.xlabel('Demographics', fontsize=12)
plt.ylabel('Representation (%)', fontsize=12)
# Boxes sit at positions 1..N by default; the tick labels are set here instead of
# through boxplot's `labels` keyword, which newer Matplotlib releases deprecate.
plt.xticks(range(1, len(demographics) + 1), demographics, rotation=45)

# Save the chart as a PNG file
plt.savefig("representation_across_industry.png", dpi=300, bbox_inches='tight')

# Display the chart
plt.show()

In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt

# Census API key, read from the CENSUS_API_KEY environment variable so that the
# secret is not stored in the notebook. When the variable is unset, requests are
# sent without a key.
# NOTE(review): the key that used to be hard-coded here should be treated as leaked
# and revoked; also confirm that keyless requests are acceptable for this workload.
api_key = os.environ.get("CENSUS_API_KEY")

# Base URLs for the 2013, 2015, 2020 and 2021 ACS 5-Year Data
base_urls = {
    2013: 'https://api.census.gov/data/2013/acs/acs5',
    2015: 'https://api.census.gov/data/2015/acs/acs5',
    2020: 'https://api.census.gov/data/2020/acs/acs5',
    2021: 'https://api.census.gov/data/2021/acs/acs5',
}

# Variables for different degrees, all from ACS table B15003 (educational attainment).
# NOTE(review): codes follow the B15003 variable list; verify them against
# https://api.census.gov/data/2021/acs/acs5/variables.html before publishing results.
degree_variables = {
    'High School': 'B15003_017E',  # Regular high school diploma
    'Doctorate': 'B15003_025E',    # Doctorate degree
    'Masters': 'B15003_023E',      # Master's degree
    'Bachelors': 'B15003_022E'     # Bachelor's degree
}

# One single-row DataFrame (Degree, Year, Count) per successful request
all_data = []

# Loop through degrees and years to fetch data
for degree, variable in degree_variables.items():
    for year, base_url in base_urls.items():
        # Let requests build and encode the query string; the key is sent only when set
        params = {'get': f'{variable},NAME', 'for': 'state:*'}
        if api_key:
            params['key'] = api_key

        try:
            # The timeout keeps a stalled connection from hanging the notebook
            response = requests.get(base_url, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Request failed for {degree} in {year}: {exc}")
            continue

        if response.status_code != 200:
            print(f"API Error for {degree} in {year}: {response.status_code}")
            print(response.text)
            continue

        # The first row of the JSON payload is the header; the rest are data rows.
        # Kept in `payload` (not `data`) so the CPS DataFrame is not overwritten.
        payload = response.json()
        state_data = pd.DataFrame(payload[1:], columns=['Count', 'State', 'StateFIPS'])

        # Counts arrive as strings; values that cannot be parsed become NaN
        state_data['Count'] = pd.to_numeric(state_data['Count'], errors='coerce')

        # Total over every row returned for `state:*` for this degree and year
        all_data.append(
            pd.DataFrame({'Degree': [degree], 'Year': [year], 'Count': [state_data['Count'].sum()]})
        )

# Combine all data into one DataFrame; fail with a clear message if nothing was fetched
if not all_data:
    raise RuntimeError("No Census data could be retrieved; see the errors printed above.")
combined_data = pd.concat(all_data, ignore_index=True)

# Total count per (Degree, Year). matplotlib.pyplot is imported as plt in the
# notebook's first cell, so it is not imported again here.
aggregated_data = combined_data.groupby(['Degree', 'Year'])['Count'].sum().reset_index()

# Pivot data for grouped bar chart: one row per year, one column per degree
bar_data = aggregated_data.pivot(index='Year', columns='Degree', values='Count')

# Plot grouped bar chart
bar_data.plot(kind='bar', figsize=(10, 6), alpha=0.8)

# Enhance visualization
plt.title("Educational Attainment by Degree Over Time", fontsize=16)
plt.xlabel("Year", fontsize=12)
plt.ylabel("Count of Degree Holders", fontsize=12)
plt.xticks(rotation=0)  # Keeps year labels horizontal for better readability
plt.legend(title="Degree Type", fontsize=10)
plt.tight_layout()

# Save the chart as a PNG file (300 dpi, tight bounding box)
plt.savefig('EducationalAttainmentbyDegreeOverTime.png', dpi=300, bbox_inches='tight')

# Display the chart
plt.show()