### Importing Pandas and Display.

In [None]:

import pandas as pd
from IPython.display import display 

### The next 3 cells each load one CSV file and print the same summary: shape, first 5 rows, data types, missing values, and descriptive statistics.

In [None]:
# Load the CODE extract and walk through a five-part profile of the frame.
df_code = pd.read_csv('Module6_claims copy/clean/example_2/STONYBRK_20240531_CODE.csv')

# Part 1: dimensions as (rows, columns)
print("Shape:", df_code.shape)

# Parts 2-5 all follow the same "heading, then view" pattern, so they are
# driven from one table: (heading text, how to render it, what to render).
# display() is used for DataFrames (rich table), print() for Series.
for heading, show, view in (
    ("\nFirst 5 rows:", display, df_code.head()),
    ("\nColumn names and data types:", print, df_code.dtypes),
    ("\nMissing values per column:", print, df_code.isnull().sum()),
    ("\nDescriptive statistics for numeric columns:", display, df_code.describe()),
):
    print(heading)
    show(view)


In [None]:
# Load the HEADER extract and walk through a five-part profile of the frame.
df_header = pd.read_csv('Module6_claims copy/clean/example_2/STONYBRK_20240531_HEADER.csv')

# Part 1: dimensions as (rows, columns)
print("Shape:", df_header.shape)

# Parts 2-5 all follow the same "heading, then view" pattern, so they are
# driven from one table: (heading text, how to render it, what to render).
# display() is used for DataFrames (rich table), print() for Series.
for heading, show, view in (
    ("\nFirst 5 rows:", display, df_header.head()),
    ("\nColumn names and data types:", print, df_header.dtypes),
    ("\nMissing values per column:", print, df_header.isnull().sum()),
    ("\nDescriptive statistics for numeric columns:", display, df_header.describe()),
):
    print(heading)
    show(view)

In [None]:
# Load the LINE extract and walk through a five-part profile of the frame.
df_line = pd.read_csv('Module6_claims copy/clean/example_2/STONYBRK_20240531_LINE.csv')

# Part 1: dimensions as (rows, columns)
print("Shape:", df_line.shape)

# Parts 2-5 all follow the same "heading, then view" pattern, so they are
# driven from one table: (heading text, how to render it, what to render).
# display() is used for DataFrames (rich table), print() for Series.
for heading, show, view in (
    ("\nFirst 5 rows:", display, df_line.head()),
    ("\nColumn names and data types:", print, df_line.dtypes),
    ("\nMissing values per column:", print, df_line.isnull().sum()),
    ("\nDescriptive statistics for numeric columns:", display, df_line.describe()),
):
    print(heading)
    show(view)

### This cell shows the shape and column headings of each dataset. It must be run before the rest of the notebook, because the later cells use the `dfs` list it creates.

In [None]:
import pandas as pd

# The three extracts, in the order the rest of the notebook indexes them:
# dfs[0] is CODE, dfs[1] is HEADER, dfs[2] is LINE.
paths = [
    "Module6_claims copy/clean/example_2/STONYBRK_20240531_CODE.csv",
    "Module6_claims copy/clean/example_2/STONYBRK_20240531_HEADER.csv",
    "Module6_claims copy/clean/example_2/STONYBRK_20240531_LINE.csv",
]
dfs = list(map(pd.read_csv, paths))

# Report each frame's dimensions and column index under a 1-based banner.
for i, df in enumerate(dfs, start=1):
    print(f"\n=== Dataset {i} ===", df.shape, df.columns, sep="\n")

### Billing Providers by Name
## This code block uses Dataset 2 and combines the columns "BillingProvFirstName" and "BillingProvLastName" into the new column "BillingProviderName". Then it groups the rows by BillingProviderName and BillingProviderNPI, counts the claims in each group, sorts the groups by claim count from greatest to least, and keeps the top 5.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df2 = dfs[1]   # second frame in dfs

# Blank out missing name parts so the concatenation never yields NaN, then
# trim the stray space left behind when one of the two parts was empty.
first_names = df2["BillingProvFirstName"].fillna("")
last_names = df2["BillingProvLastName"].fillna("")
# Note: this writes the new column onto df2 itself, i.e. onto dfs[1].
df2["BillingProviderName"] = (first_names + " " + last_names).str.strip()

# One row per (name, NPI) pair with the number of rows that pair has in df2.
provider_counts = df2.groupby(["BillingProviderName", "BillingProviderNPI"]).size()
provider_counts = provider_counts.reset_index(name="claim_count")

# Largest counts first; keep only the five leading pairs.
top5 = provider_counts.sort_values("claim_count", ascending=False).head(5)

print(top5)

### Bar Chart of Top 5 Billing Providers

In [None]:
# Bar chart of the top-5 provider claim counts, drawn on an explicit
# figure/axes pair rather than through pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(10,5))

# Two-line tick label per bar: the provider name, then its NPI underneath.
provider_names = top5["BillingProviderName"]
npi_text = top5["BillingProviderNPI"].astype(str)
labels = provider_names + "\nNPI: " + npi_text

ax.bar(labels, top5["claim_count"], color="skyblue")
ax.set_ylabel("Number of Claims")
ax.set_title("Top 5 Billing Providers by Claim Count")

# Slant the tick labels and right-align them so the long names do not collide.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

fig.tight_layout()
plt.show()

### Top 5 Primary Payers by Claim Volume
## This uses df2 = dfs[1] (Dataset 2). It selects the "PrimaryPayerName" column, counts the number of claims for each payer, and sorts the counts from greatest to least. Then it outputs the top 5.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df2 = dfs[1]   # second frame in dfs

# Tally rows per primary payer. value_counts() already orders its result from
# the most frequent payer down, so no separate sort is needed. The index is
# named before it is turned into a column so both output columns get their
# final labels in one step.
payer_name_tally = df2["PrimaryPayerName"].value_counts()
payer_counts = (
    payer_name_tally
    .rename_axis("PrimaryPayerName")
    .reset_index(name="claim_count")
)

# The five most frequent payers.
top5_payers = payer_counts.head(5)
print(top5_payers)

### Percentage of Total Claim

## Takes the claim count calculated previously, divides it by the total number of claims, and multiplies the result by 100. Then it rounds to the second decimal place.

In [None]:
# Denominator: every claim counted in payer_counts (all payers, not only the
# top five), so each percentage is a share of the whole table.
total_claims = payer_counts["claim_count"].sum()

# top5_payers was produced by payer_counts.head(5), i.e. it is a slice of
# payer_counts. Writing a column into it with top5_payers["percentage"] = ...
# triggers pandas' SettingWithCopyWarning. assign() returns a new DataFrame
# with the extra column instead, so rebinding the name is warning-free and the
# cell can be re-run safely (the column is simply recomputed).
top5_payers = top5_payers.assign(
    percentage=(top5_payers["claim_count"] / total_claims * 100).round(2)
)

print(top5_payers)


### Pie Chart showing Payer Distribution

In [None]:
# Pie chart of the top-5 payers, drawn on an explicit figure/axes pair.
# The wedge percentages come from autopct, so they are shares of the five
# plotted payers only.
fig, ax = plt.subplots(figsize=(8,8))

wedge_sizes = top5_payers["claim_count"]
wedge_labels = top5_payers["PrimaryPayerName"]
ax.pie(wedge_sizes, labels=wedge_labels, autopct="%1.1f%%", startangle=90)

ax.set_title("Primary Payer Distribution (Top 5)")
plt.show()

### Most Frequent Code Distributions


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df1 = dfs[0]   # first frame in dfs (loaded from the CODE CSV by the dfs cell)

