# Fictional Army - Filtering and Sorting

### Import libraries

In [1]:
import numpy as np
import pandas as pd

### Step 1. This is the data given as a dictionary. Create a dataframe and assign it to a variable called army. 

In [2]:
# Create an example dataframe about a fictional army
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'],
            'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'],
            'deaths': [523, 52, 25, 616, 43, 234, 523, 62, None, 73, 37, 35],
            'battles': [5, 42, 2, 2, 4, 7, 8, 3, 4, 7, 8, 9],
            'size': [1045, 957, 1099, 1400, 1592, 1006, 987, 849, 973, 1005, 1099, 1523],
            'veterans': [1, 5, 62, 26, 73, 37, 949, 48, 48, 435, 63, 345],
            'readiness': [1, None, 3, None, 2, None, 2, None, None, 1, 2, None],
            'armored': [1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
            'deserters': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
            'origin': ['Arizona', 'California', 'Texas', 'Florida', 'Maine', 'Iowa', 'Alaska', 'Washington', 'Oregon', 'Wyoming', 'Louisana', 'Georgia']}

In [3]:
army = pd.DataFrame(raw_data)
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1.0,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,3.0,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,2.0,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,2.0,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,,1,31,Washington
8,Scouts,1st,,4,973,48,,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,1.0,0,3,Wyoming


In [4]:
army.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   regiment   12 non-null     object 
 1   company    12 non-null     object 
 2   deaths     11 non-null     float64
 3   battles    12 non-null     int64  
 4   size       12 non-null     int64  
 5   veterans   12 non-null     int64  
 6   readiness  6 non-null      float64
 7   armored    12 non-null     int64  
 8   deserters  12 non-null     int64  
 9   origin     12 non-null     object 
dtypes: float64(2), int64(5), object(3)
memory usage: 1.1+ KB


### Step 2. Check missing values

In [7]:
army.isnull()

Unnamed: 0,regiment,company,deaths,battles,size,veterans,readiness,armored,deserters,origin
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,True,False,False,False
6,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,True,False,False,False
8,False,False,True,False,False,False,True,False,False,False
9,False,False,False,False,False,False,False,False,False,False


### Step 3. Drop features, if there are more than 30% missing values.

In [8]:
len(army.columns)*0.70

7.0

In [9]:
army.dropna(axis='columns', thresh=len(army)*0.70, inplace=True)

In [10]:
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,1,31,Washington
8,Scouts,1st,,4,973,48,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,0,3,Wyoming


### Step 4. Fill missing values with the mean of their regiment.

In [12]:
group_means = army.groupby('regiment')['deaths'].mean()
group_means

regiment
Dragoons      215.500000
Nighthawks    304.000000
Scouts         48.333333
Name: deaths, dtype: float64

In [18]:
army[army['deaths'].isnull()]['regiment'] # Para saber dónde hay NaN

Series([], Name: regiment, dtype: object)

In [14]:
scouts_mean = army[army['regiment'] == 'Scouts']['deaths'].mean()
scouts_mean

np.float64(48.333333333333336)

In [19]:
army.loc[(army['deaths'].isna()), 'deaths'] = scouts_mean

Con el .map para que recorra y el fillna, para que solo coja los valores NAN

In [15]:
army['deaths'].fillna(army['regiment'].map(group_means), inplace= True) # Con el .map

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  army['deaths'].fillna(army['regiment'].map(group_means), inplace= True) # Con el .map


In [16]:
army

Unnamed: 0,regiment,company,deaths,battles,size,veterans,armored,deserters,origin
0,Nighthawks,1st,523.0,5,1045,1,1,4,Arizona
1,Nighthawks,1st,52.0,42,957,5,0,24,California
2,Nighthawks,2nd,25.0,2,1099,62,1,31,Texas
3,Nighthawks,2nd,616.0,2,1400,26,1,2,Florida
4,Dragoons,1st,43.0,4,1592,73,0,3,Maine
5,Dragoons,1st,234.0,7,1006,37,1,4,Iowa
6,Dragoons,2nd,523.0,8,987,949,0,24,Alaska
7,Dragoons,2nd,62.0,3,849,48,1,31,Washington
8,Scouts,1st,48.333333,4,973,48,0,2,Oregon
9,Scouts,1st,73.0,7,1005,435,0,3,Wyoming


In [None]:
army.loc[army['deaths'].isna(), 'deaths'] #Primera mascara por muerte

In [None]:
army.loc[army['deaths'].isna(), 'group_means'] #La convertimos en mascara de media

In [None]:
army.loc[army['deaths'].isna(), 'deaths'] = army.loc[army['deaths'].isna(), 'group_means'] # Se juntan las dos

### Step 5. Set the 'origin' colum as the index of the dataframe

In [21]:
army.set_index('origin', inplace=True)
army

KeyError: "None of ['origin'] are in the columns"

### Step 6. Select the 'deaths', 'size' and 'deserters' columns from Maine and Alaska

In [23]:
army.loc[['Maine','Alaska']][['deaths','size','deserters']]

Unnamed: 0_level_0,deaths,size,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Maine,43.0,1592,3
Alaska,523.0,987,24


### Step 7. Select the rows 3 to 7 and the columns 3 to 6

In [25]:
army.iloc[3:7, 3:6]

Unnamed: 0_level_0,battles,size,veterans
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,2,1400,26
Maine,4,1592,73
Iowa,7,1006,37
Alaska,8,987,949


### Step 8. Select every row after the fourth row and all columns

In [27]:
army.iloc[4:, :]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 9. Select every row up to the 4th row and all columns

In [28]:
army.iloc[:4, :]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2


### Step 10. Select the 3rd column up to the 7th column

In [29]:
army.iloc[ 0:, 3:7]

Unnamed: 0_level_0,battles,size,veterans,armored
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arizona,5,1045,1,1
California,42,957,5,0
Texas,2,1099,62,1
Florida,2,1400,26,1
Maine,4,1592,73,0
Iowa,7,1006,37,1
Alaska,8,987,949,0
Washington,3,849,48,1
Oregon,4,973,48,0
Wyoming,7,1005,435,0


### Step 11. Select rows where df.deaths is greater than 50

In [30]:
deaths_more50 = army[army['deaths'] > 50]
deaths_more50

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Wyoming,Scouts,1st,73.0,7,1005,435,0,3


### Step 12. Select rows where df.deaths is greater than 500 or less than 50

In [38]:
army[(army['deaths'] > 500) | (army['deaths'] < 50)]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 13. Select all the regiments not named "Dragoons"

In [220]:
army[army['regiment'] != "Dragoons"]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3
Louisana,Scouts,2nd,37.0,8,1099,63,1,2
Georgia,Scouts,2nd,35.0,9,1523,345,1,3


### Step 14. Select the rows called Texas and Arizona

In [214]:
army.loc[['Texas','Arizona']]

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4


### Step 15. Select the third cell in the row named Arizona

In [35]:
army.loc['Arizona'].iloc[3]

np.int64(5)

In [36]:
army

Unnamed: 0_level_0,regiment,company,deaths,battles,size,veterans,armored,deserters
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arizona,Nighthawks,1st,523.0,5,1045,1,1,4
California,Nighthawks,1st,52.0,42,957,5,0,24
Texas,Nighthawks,2nd,25.0,2,1099,62,1,31
Florida,Nighthawks,2nd,616.0,2,1400,26,1,2
Maine,Dragoons,1st,43.0,4,1592,73,0,3
Iowa,Dragoons,1st,234.0,7,1006,37,1,4
Alaska,Dragoons,2nd,523.0,8,987,949,0,24
Washington,Dragoons,2nd,62.0,3,849,48,1,31
Oregon,Scouts,1st,48.333333,4,973,48,0,2
Wyoming,Scouts,1st,73.0,7,1005,435,0,3


### Step 16. Select the third cell down in the column named deaths

In [37]:
third_death = army['deaths'].iloc[2]
third_death

np.float64(25.0)