# Fictional Army - Filtering and Sorting

### Import libraries

In [1]:
import numpy as np
import pandas as pd

### Step 1. This is the data given as a dictionary. Create a dataframe and assign it to a variable called army. 

In [2]:
# Create an example dataframe about a fictional army
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, None, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, None, 3, None, 2, None, 2, None, None, 1, 2, None],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

In [3]:
army = pd.DataFrame(raw_data)
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1.0,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,2.0,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,2.0,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,,1,31,Washington
8,Scouts,1st,,4,973,48,,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,1.0,0,3,Wyoming


### Step 2. Check missing values

In [9]:
army["regiment"].isnull().sum()

0

In [5]:
army.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   regiment   12 non-null     object 
 1   company    12 non-null     object 
 2   deaths     11 non-null     float64
 3   battles    12 non-null     int64  
 4   size       12 non-null     int64  
 5   veterans   12 non-null     int64  
 6   readiness  6 non-null      float64
 7   armored    12 non-null     int64  
 8   deserters  12 non-null     int64  
 9   origin     12 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.1+ KB


Vemos que hay unos pocos valores nulos en los features "readiness" y "deaths".

### Step 3. Drop features, if there are more than 30% missing values.

In [138]:
def percent_missing(df):
    percent_nan = 100 * df.isnull().sum() / len(df)
    percent_nan = percent_nan[percent_nan > 0].sort_values()

    return percent_nan

In [139]:
percent_missing(army)

deaths        8.333333
readiness    50.000000
dtype: float64

In [130]:
# Eliminamos los features con más de un 30% de missing values:

def remove_null_features(df, feature):
    return df.drop([feature], axis=1)

In [140]:
army = remove_null_features(army, "readiness")
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,1,31,Washington
8,Scouts,1st,,4,973,48,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,0,3,Wyoming


### Step 4. Fill missing values with the mean of their regiment.

In [None]:
army["deaths"] = army.groupby("regiment")["deaths"].transform(lambda val: val.fillna(val.mean()))
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,1,31,Washington
8,Scouts,1st,48.333333,4,973,48,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,0,3,Wyoming


In [143]:
# Comprobamos valores nulos:

percent_missing(army)

Series([], dtype: float64)

No quedan valores nulos.

### Step 5. Set the 'origin' colum as the index of the dataframe

In [15]:
army = army.set_index("origin")
army

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1.0,1,4
California,Nighthawks,1st,52.0,42,957,5,,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,,1,2
Maine,Dragoons,1st,43.0,4,1592,73,2.0,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,2.0,0,24
Washington,Dragoons,2nd,62.0,3,849,48,,1,31
Oregon,Scouts,1st,,4,973,48,,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,1.0,0,3


### Step 6. Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska

In [17]:
army.loc[["Maine", "Alaska"]][["deaths", "size", "deserters"]]

Unnamed: 0_level_0,deaths,size,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maine,43.0,1592,3
Alaska,523.0,987,24


In [119]:
army

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3


### Step 7. Select the rows 3 to 7 and the columns 3 to 6

In [147]:
army.iloc[2:7, 2:6]

Unnamed: 0_level_0,deaths,battles,size,veterans
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Texas,25.0,2,1099,62
Florida,616.0,2,1400,26
Maine,43.0,4,1592,73
Iowa,234.0,7,1006,37
Alaska,523.0,8,987,949


### Step 8. Select every row after the fourth row and all columns

In [149]:
army.iloc[3:, :]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 9. Select every row up to the 4th row and all columns

In [150]:
army.iloc[:4, :]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2


### Step 10. Select the 3rd column up to the 7th column

In [152]:
army.iloc[:, 2:7]

Unnamed: 0_level_0,deaths,battles,size,veterans,armored
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Arizona,523.0,5,1045,1,1
California,52.0,42,957,5,0
Texas,25.0,2,1099,62,1
Florida,616.0,2,1400,26,1
Maine,43.0,4,1592,73,0
Iowa,234.0,7,1006,37,1
Alaska,523.0,8,987,949,0
Washington,62.0,3,849,48,1
Oregon,48.333333,4,973,48,0
Wyoming,73.0,7,1005,435,0


### Step 11. Select rows where df.deaths is greater than 50

In [153]:
army[army["deaths"] > 50]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Wyoming,Scouts,1st,73.0,7,1005,435,0,3


### Step 12. Select rows where df.deaths is greater than 500 or less than 50

In [154]:
army[(army["deaths"] > 500) | (army["deaths"] < 50)]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 13. Select all the regiments not named "Dragoons"

In [155]:
army[army["regiment"] != "Dragoons"]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 14. Select the rows called Texas and Arizona

In [156]:
army.loc[["Texas", "Arizona"]]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4


### Step 15. Select the third cell in the row named Arizona

In [162]:
army.loc["Arizona"].iloc[2]

523.0

### Step 16. Select the third cell down in the column named deaths

In [163]:
army["deaths"].iloc[2]

25.0