# Dataframes

1) Sorting & Subsetting (Columns & Rows), including the isin() method

2) Creating New Columns

In [8]:
import pandas as pd

# Read the homelessness CSV, using its first column as the row index.
csv_path = './Data/homelessness.csv'
homelessness = pd.read_csv(csv_path, index_col=0)

# Print the first rows as a quick check that the load worked.
print(homelessness.head())

               region       state  individuals  family_members  state_pop
0  East South Central     Alabama       2570.0           864.0    4887681
1             Pacific      Alaska       1434.0           582.0     735139
2            Mountain     Arizona       7259.0          2606.0    7158024
3  West South Central    Arkansas       2280.0           432.0    3009733
4             Pacific  California     109008.0         20964.0   39461588


### describe()

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numeric_summary = homelessness.describe()
numeric_summary

Unnamed: 0,individuals,family_members,state_pop
count,51.0,51.0,51.0
mean,7225.784314,3504.882353,6405637.0
std,15991.025083,7805.411811,7327258.0
min,434.0,75.0,577601.0
25%,1446.5,592.0,1777414.0
50%,3082.0,1482.0,4461153.0
75%,6781.5,3196.0,7340946.0
max,109008.0,52070.0,39461590.0


### .values (attribute)

In [54]:
# `.values` is an attribute (no parentheses); it exposes the rows as a 2-D NumPy array.
preview_rows = homelessness.head()
preview_rows.values

array([['East South Central', 'Alabama', 2570.0, 864.0, 4887681, 25.7],
       ['Pacific', 'Alaska', 1434.0, 582.0, 735139, 14.34],
       ['Mountain', 'Arizona', 7259.0, 2606.0, 7158024, 72.59],
       ['West South Central', 'Arkansas', 2280.0, 432.0, 3009733, 22.8],
       ['Pacific', 'California', 109008.0, 20964.0, 39461588, 1090.08]],
      dtype=object)

### Columns

In [56]:
# Column labels of the previewed rows.
head_rows = homelessness.head()
head_rows.columns

Index(['region', 'state', 'individuals', 'family_members', 'state_pop',
       'individuals/100'],
      dtype='object')

### Index

In [55]:
# Row labels of the first five rows.
top_five = homelessness.head()
top_five.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

## Sorting

In [57]:
# Sort on a single column, largest counts first, then preview the top rows
by_individuals_desc = homelessness.sort_values(by="individuals", ascending=False)
by_individuals_desc.head()

Unnamed: 0,region,state,individuals,family_members,state_pop,individuals/100
4,Pacific,California,109008.0,20964.0,39461588,1090.08
32,Mid-Atlantic,New York,39827.0,52070.0,19530351,398.27
9,South Atlantic,Florida,21443.0,9587.0,21244317,214.43
43,West South Central,Texas,19199.0,6111.0,28628666,191.99
47,Pacific,Washington,16424.0,5880.0,7523869,164.24


### Sorting with Multiple Variables

In [30]:
# Sort on two keys: individuals ascending, then family_members descending to break ties
sort_keys = ["individuals", "family_members"]
sort_directions = [True, False]
homelessness.sort_values(by=sort_keys, ascending=sort_directions).head()

Unnamed: 0,region,state,individuals,family_members,state_pop
50,Mountain,Wyoming,434.0,205.0,577601
34,West North Central,North Dakota,467.0,75.0,758080
7,South Atlantic,Delaware,708.0,374.0,965479
39,New England,Rhode Island,747.0,354.0,1058287
45,New England,Vermont,780.0,511.0,624358


### Subsetting multiple columns

In [29]:
# Select several columns at once by passing a list of column names
homelessness.loc[:, ["state", "individuals"]].head()

Unnamed: 0,state,individuals
0,Alabama,2570.0
1,Alaska,1434.0
2,Arizona,7259.0
3,Arkansas,2280.0
4,California,109008.0


In [32]:
# Same column selection, with the column names held in a variable first
subset = ["state", "individuals"]
state_individuals = homelessness[subset]
state_individuals.head()

Unnamed: 0,state,individuals
0,Alabama,2570.0
1,Alaska,1434.0
2,Arizona,7259.0
3,Arkansas,2280.0
4,California,109008.0


### Subsetting rows

In [35]:
# Keep only the rows whose individuals count exceeds 10,000
over_10k = homelessness["individuals"] > 10000
homelessness[over_10k]

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588
9,South Atlantic,Florida,21443.0,9587.0,21244317
32,Mid-Atlantic,New York,39827.0,52070.0,19530351
37,Pacific,Oregon,11139.0,3337.0,4181886
43,West South Central,Texas,19199.0,6111.0,28628666
47,Pacific,Washington,16424.0,5880.0,7523869


### Subsetting based on text data

In [39]:
# Filter rows by an exact string match on the state column
is_california = homelessness["state"] == "California"
homelessness[is_california]

Unnamed: 0,region,state,individuals,family_members,state_pop
4,Pacific,California,109008.0,20964.0,39461588


### Subsetting based on dates

In [41]:
# The homelessness dataset has no date column, so date-based subsetting cannot be demonstrated here.

### Subsetting based on multiple conditions

In [43]:
# Combine two boolean conditions with & so that both must hold for a row to be kept
in_new_york = homelessness["state"] == "New York"
in_mid_atlantic = homelessness["region"] == "Mid-Atlantic"
homelessness[in_new_york & in_mid_atlantic]

Unnamed: 0,region,state,individuals,family_members,state_pop
32,Mid-Atlantic,New York,39827.0,52070.0,19530351


### Subsetting using .isin()  [Better Alternative!!!]

In [48]:
# Build a boolean mask of rows whose state appears in the list, then filter with it
states_of_interest = ["New York", "California"]
person = homelessness["state"].isin(states_of_interest)
homelessness.loc[person]

Unnamed: 0,region,state,individuals,family_members,state_pop,individuals/100
4,Pacific,California,109008.0,20964.0,39461588,1090.08
32,Mid-Atlantic,New York,39827.0,52070.0,19530351,398.27


### New columns

### Adding a new column

In [51]:
# Derive a new column by dividing the individuals count by 100.
# This adds the column to `homelessness` itself, so every cell run afterwards sees it.
individuals_per_100 = homelessness["individuals"] / 100
homelessness["individuals/100"] = individuals_per_100
print(homelessness.head())

               region       state  individuals  family_members  state_pop  \
0  East South Central     Alabama       2570.0           864.0    4887681   
1             Pacific      Alaska       1434.0           582.0     735139   
2            Mountain     Arizona       7259.0          2606.0    7158024   
3  West South Central    Arkansas       2280.0           432.0    3009733   
4             Pacific  California     109008.0         20964.0   39461588   

   individuals/100  
0            25.70  
1            14.34  
2            72.59  
3            22.80  
4          1090.08  


### Multiple Manipulation

In [53]:
# Chain of manipulations: filter rows, sort the result, then pick two columns to show
above_threshold = homelessness["individuals"] > 1500.0
homelessness_1 = homelessness[above_threshold]
homelessness_sorted = homelessness_1.sort_values(by="individuals", ascending=False)
columns_to_show = ["individuals", "state"]
homelessness_sorted[columns_to_show].head()

Unnamed: 0,individuals,state
4,109008.0,California
32,39827.0,New York
9,21443.0,Florida
43,19199.0,Texas
47,16424.0,Washington
