# Pandas Selection

In [None]:
import pandas as pd

### Step 2. This is the data given as a dictionary

In [None]:
# Example data for a fictional army. Every key becomes a column and position i
# of every list describes the same unit (twelve units in total).
raw_data = dict(
    # Three regiments, four consecutive units each
    regiment=[name for name in ("Nighthawks", "Dragoons", "Scouts") for _ in range(4)],
    # Within each regiment: two units of the 1st company, then two of the 2nd
    company=["1st", "1st", "2nd", "2nd"] * 3,
    deaths=[523, 52, 25, 616, 43, 234, 523, 62, 62, 73, 37, 35],
    battles=[5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
    size=[1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
    veterans=[1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
    readiness=[1, 2, 3, 3, 2, 1, 2, 3, 2, 1, 2, 3],
    armored=[1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
    deserters=[4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    # State names are kept exactly as given in the dataset (including "Louisana")
    origin=[
        "Arizona", "California", "Texas", "Florida",
        "Maine", "Iowa", "Alaska", "Washington",
        "Oregon", "Wyoming", "Louisana", "Georgia",
    ],
)

### Step 3. Create a dataframe and assign it to a variable called army. 

#### Don't forget to include the column names in the order presented in the dictionary ('regiment', 'company', 'deaths'...) so that the column index order is consistent with the solutions. If omitted, older pandas versions (before 0.23, or running on Python older than 3.6) will order the columns alphabetically; newer versions keep the dictionary's insertion order.

In [None]:
# Build the frame with the column order spelled out, so it matches the
# dictionary (and the solutions) even where a dict's key order is not carried
# over to the columns.
army = pd.DataFrame(
    raw_data,
    columns=[
        "regiment",
        "company",
        "deaths",
        "battles",
        "size",
        "veterans",
        "readiness",
        "armored",
        "deserters",
        "origin",
    ],
)

### Step 4. Set the 'origin' column as the index of the dataframe

In [None]:
# Promote "origin" to the row index. Reassigning (rather than mutating in
# place) and checking that the column is still present lets this cell be
# re-run without raising a KeyError once "origin" has already become the index.
if "origin" in army.columns:
    army = army.set_index("origin")

### Step 5. Print only the column veterans

In [None]:
# Single column, selected by its label -> returns a Series indexed by origin
army["veterans"]

### Step 6. Print the columns 'veterans' and 'deaths'

In [None]:
# All rows, two columns picked by label -> returns a DataFrame
army.loc[:, ["veterans", "deaths"]]

### Step 7. Print the names of all the columns.

In [None]:
# Column names live in .columns; .index would return the row labels (the origins)
army.columns

### Step 8. Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska

In [None]:
# Label-based selection: rows by origin, columns by name
rows_wanted = ["Maine", "Alaska"]
cols_wanted = ["deaths", "size", "deserters"]
army.loc[rows_wanted, cols_wanted]

### Step 9. Select the rows 3 to 7 and the columns 3 to 6

In [None]:
# .iloc is zero-based and excludes the stop position, so the 3rd through 7th
# rows are positions 2..6 (slice 2:7) and the 3rd through 6th columns are
# positions 2..5 (slice 2:6).
# NOTE(review): this reads "rows 3 to 7 / columns 3 to 6" as ordinal and
# inclusive, the same way the later steps speak of the "3rd column".
army.iloc[2:7, 2:6]

### Step 10. Select every row after the fourth row and all columns

In [None]:
# Positions 0..3 are the first four rows, so "after the fourth row" starts at
# position 4; no column selector means all columns are kept.
army.iloc[4:]

### Step 11. Select every row up to the 4th row and all columns

In [None]:
# The stop position is excluded, so :4 keeps positions 0..3, i.e. the rows up
# to and including the 4th; no column selector means all columns are kept.
army.iloc[:4]

### Step 12. Select the 3rd column up to the 7th column

In [None]:
# All rows; the 3rd column sits at position 2 and the 7th at position 6, and
# the slice stop is excluded, hence 2:7.
army.iloc[:, 2:7]

### Step 13. Select rows where df.deaths is greater than 50

In [None]:
# Boolean mask on the deaths column keeps only the rows where it is True
army.loc[army["deaths"] > 50]

### Step 14. Select rows where df.deaths is greater than 500 or less than 50

In [None]:
# Same two-sided condition written as a query expression: keep rows whose
# deaths are above 500 or below 50
army.query("deaths > 500 or deaths < 50")

### Step 15. Select all the regiments not named "Dragoons"

In [None]:
# .ne() is the method form of !=; rows whose regiment is "Dragoons" are dropped
army.loc[army["regiment"].ne("Dragoons")]

### Step 16. Select the rows called Texas and Arizona

In [None]:
# Rows by index label, in the order listed; ":" keeps every column
army.loc[["Texas", "Arizona"], :]

### Step 17. Select the third cell in the row named Arizona

In [None]:
# Label-based version: the third column's name is looked up at position 2 and
# then used together with the row label. Indexing the row Series with a bare
# integer would rely on pandas' deprecated positional fallback, and 3 would be
# the fourth cell rather than the third.
army.loc["Arizona", army.columns[2]]

In [None]:
# Position-based version: pick the row by label, then its third cell.
# Positions are zero-based, so the third cell is .iloc[2].
army.loc["Arizona"].iloc[2]

### Step 18. Select the third cell down in the column named deaths

In [None]:
# Third cell from the top of the deaths column is position 2. A bare integer
# key on this label-indexed Series would depend on the deprecated positional
# fallback, and -3 would count from the bottom instead of the top.
army["deaths"].iloc[2]

In [None]:
# Same cell addressed purely by position on the frame: row position 2 (third
# from the top; a negative position would count from the bottom) and the
# integer position of the "deaths" column.
army.iloc[2, army.columns.get_loc("deaths")]

In [None]:
# Show the whole frame (twelve rows, indexed by origin) as a reference for the
# selections above and the diagram below
army

## Sankey diagram
Let's look at the documentation to make a Sankey Diagram

<https://plotly.com/python/sankey-diagram/>

In [None]:
import plotly.graph_objects as go


In [None]:
# Before building the origin -> regiment -> company flows, check the frame's
# shape and which distinct labels exist at each of the three levels.
print("Data shape:", army.shape)
print("\nUnique values:")
for heading, labels in (
    ("Origins:", army.index.unique()),
    ("Regiments:", army.regiment.unique()),
    ("Companies:", army.company.unique()),
):
    print(heading, labels)


In [None]:
# Build the inputs for a three-level Sankey diagram (origin -> regiment ->
# company): one flat list of node labels plus parallel lists holding, for each
# link, the source node position, the target node position and the link value
# (the summed "size" of the units on that path).

# Distinct labels at each level, in order of first appearance
origins = army.index.unique().tolist()
regiments = army.regiment.unique().tolist()
companies = army.company.unique().tolist()

# Flat label list; a node's position in this list is its id in the links
all_nodes = origins + regiments + companies

# Label -> position across all levels. A label that occurs at more than one
# level can keep only one position in this dict, so the links below are built
# from the per-level lookups instead.
node_indices = {node: i for i, node in enumerate(all_nodes)}

# Per-level label -> position lookups, offset by the levels that precede them
level_positions = {
    "origin": {label: i for i, label in enumerate(origins)},
    "regiment": {label: len(origins) + i for i, label in enumerate(regiments)},
    "company": {
        label: len(origins) + len(regiments) + i for i, label in enumerate(companies)
    },
}

# One row per unit with the three level labels as ordinary columns, so both
# stages of the flow can be aggregated the same way. "size" is read with
# brackets because army.size is the DataFrame attribute, not the column.
levels = pd.DataFrame(
    {
        "origin": army.index.to_numpy(),
        "regiment": army["regiment"].to_numpy(),
        "company": army["company"].to_numpy(),
        "size": army["size"].to_numpy(),
    }
)

sources = []
targets = []
values = []

# Stage 1: origin -> regiment, stage 2: regiment -> company
for source_level, target_level in (("origin", "regiment"), ("regiment", "company")):
    # One groupby gives the summed size of every (source, target) pair that
    # actually occurs, instead of filtering the frame once per possible pair.
    pair_sizes = levels.groupby([source_level, target_level], sort=False)["size"].sum()
    for (source_label, target_label), size_sum in pair_sizes.items():
        # Links without any volume are left out
        if size_sum > 0:
            sources.append(level_positions[source_level][source_label])
            targets.append(level_positions[target_level][target_label])
            values.append(size_sum)

print(f"Number of nodes: {len(all_nodes)}")
print(f"Number of flows: {len(sources)}")
print(f"Total flow value: {sum(values)}")


In [None]:
# Draw the Sankey diagram from the node labels and link lists prepared above.

# Node appearance; the label order has to match the integer positions used in
# the link sources and targets
sankey_nodes = {
    "pad": 15,
    "thickness": 20,
    "line": {"color": "black", "width": 0.5},
    "label": all_nodes,
    "color": "lightblue",
}

# One link per (source, target, value) triple, drawn in translucent blue
sankey_links = {
    "source": sources,
    "target": targets,
    "value": values,
    "color": "rgba(0,0,255,0.2)",
}

fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])

# Title and fixed canvas size
fig.update_layout(
    title_text="Army Distribution by Size: Origin → Regiment → Company",
    font_size=12,
    width=1000,
    height=600,
)

fig.show()
