# Data Manipulation with pandas
Run the hidden code cell below to import the data used in this course.

In [2]:
# Import the course packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read each of the four course CSV files from the local "datasets" folder
# into its own DataFrame.
avocado = pd.read_csv(filepath_or_buffer="datasets/avocado.csv")
homelessness = pd.read_csv(filepath_or_buffer="datasets/homelessness.csv")
temperatures = pd.read_csv(filepath_or_buffer="datasets/temperatures.csv")
walmart = pd.read_csv(filepath_or_buffer="datasets/walmart.csv")

# Attributes in Pandas

###  1. shape: This attribute holds a tuple representing the dimensions of the DataFrame: (number of rows, number of columns). Because it is an attribute and not a method, it is accessed without parentheses.

In [3]:
# shape is an attribute, so it takes no parentheses; it holds (rows, columns).
print(homelessness.shape)

(51, 5)


# Methods in Pandas

1. info(): This method prints a summary of the DataFrame, including the number of rows, the columns and their data types, memory usage, and the number of non-null values in each column.

In [4]:
# info() writes its summary to the output itself and returns None, so it is
# called on its own: wrapping it in print() adds a stray "None" line at the end.
homelessness.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   region          51 non-null     object 
 1   state           51 non-null     object 
 2   individuals     51 non-null     float64
 3   family_members  51 non-null     float64
 4   state_pop       51 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 2.1+ KB
None


2. head(): This method returns the first n rows of the DataFrame (the default is n = 5).

In [5]:
# head() with no argument returns the first 5 rows.
print(homelessness.head())

               region       state  individuals  family_members  state_pop
0  East South Central     Alabama       2570.0           864.0    4887681
1             Pacific      Alaska       1434.0           582.0     735139
2            Mountain     Arizona       7259.0          2606.0    7158024
3  West South Central    Arkansas       2280.0           432.0    3009733
4             Pacific  California     109008.0         20964.0   39461588


3. describe(): This method returns summary statistics for the numerical columns in the DataFrame.

In [6]:
# describe() reports count, mean, std, min, quartiles and max for the
# numeric columns; the text columns (region, state) are left out.
print(homelessness.describe())

         individuals  family_members     state_pop
count      51.000000       51.000000  5.100000e+01
mean     7225.784314     3504.882353  6.405637e+06
std     15991.025083     7805.411811  7.327258e+06
min       434.000000       75.000000  5.776010e+05
25%      1446.500000      592.000000  1.777414e+06
50%      3082.000000     1482.000000  4.461153e+06
75%      6781.500000     3196.000000  7.340946e+06
max    109008.000000    52070.000000  3.946159e+07


# Parts of a DataFrame

1. values: A two-dimensional NumPy array (rows × columns) holding the data values.

In [8]:
# values exposes the data as a 2-D array, one inner row per DataFrame row.
# Note that this prints every row, not just a preview.
print(homelessness.values)

[['East South Central' 'Alabama' 2570.0 864.0 4887681]
 ['Pacific' 'Alaska' 1434.0 582.0 735139]
 ['Mountain' 'Arizona' 7259.0 2606.0 7158024]
 ['West South Central' 'Arkansas' 2280.0 432.0 3009733]
 ['Pacific' 'California' 109008.0 20964.0 39461588]
 ['Mountain' 'Colorado' 7607.0 3250.0 5691287]
 ['New England' 'Connecticut' 2280.0 1696.0 3571520]
 ['South Atlantic' 'Delaware' 708.0 374.0 965479]
 ['South Atlantic' 'District of Columbia' 3770.0 3134.0 701547]
 ['South Atlantic' 'Florida' 21443.0 9587.0 21244317]
 ['South Atlantic' 'Georgia' 6943.0 2556.0 10511131]
 ['Pacific' 'Hawaii' 4131.0 2399.0 1420593]
 ['Mountain' 'Idaho' 1297.0 715.0 1750536]
 ['East North Central' 'Illinois' 6752.0 3891.0 12723071]
 ['East North Central' 'Indiana' 3776.0 1482.0 6695497]
 ['West North Central' 'Iowa' 1711.0 1038.0 3148618]
 ['West North Central' 'Kansas' 1443.0 773.0 2911359]
 ['East South Central' 'Kentucky' 2735.0 953.0 4461153]
 ['West South Central' 'Louisiana' 2540.0 519.0 4659690]
 ['New 

2. columns: An index of all the columns.

In [9]:
# columns holds the column labels as an Index object.
print(homelessness.columns)

Index(['region', 'state', 'individuals', 'family_members', 'state_pop'], dtype='object')


3. index: An index of all the rows.

In [12]:
# index holds the row labels; for this DataFrame it is a RangeIndex.
print(homelessness.index)

RangeIndex(start=0, stop=51, step=1)


# Sorting rows

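Sort the rows by one or more columns. Pass a list of column names, and pass a matching list of booleans to `ascending` to set the sort direction of each column (`True` for ascending, `False` for descending):

df.sort_values(["column_name1", "column_name2"], ascending=[True, False])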

In [19]:
# Order rows by region (A to Z) and, within each region, by state population
# from largest to smallest; then preview the first five rows.
family = homelessness.sort_values(
    by=["region", "state_pop"],
    ascending=[True, False],
)
print(family.head(5))

                region      state  individuals  family_members  state_pop
13  East North Central   Illinois       6752.0          3891.0   12723071
35  East North Central       Ohio       6929.0          3320.0   11676341
22  East North Central   Michigan       5209.0          3142.0    9984072
14  East North Central    Indiana       3776.0          1482.0    6695497
49  East North Central  Wisconsin       2740.0          2167.0    5807406


# Subsetting columns

Subsetting simply means selecting part of the data, such as particular columns or particular rows.

## Selecting specific columns

In [21]:
# Passing a list of column names inside the brackets keeps just those
# columns, in the order listed.
selected_columns = ["state", "region"]
family = homelessness[selected_columns]
print(family.head(5))

        state              region
0     Alabama  East South Central
1      Alaska             Pacific
2     Arizona            Mountain
3    Arkansas  West South Central
4  California             Pacific


## Filtering rows on the basis of some conditions

- Comparing a column with a value returns a Series of dtype `bool`, with one True/False entry per row.

In [25]:
# The comparison is evaluated row by row and produces a Series of
# True/False flags rather than a filtered DataFrame.
data_gt_10k = homelessness["individuals"] > 10_000
print(data_gt_10k.head(5))

0    False
1    False
2    False
3    False
4     True
Name: individuals, dtype: bool


- To retrieve the matching rows themselves, pass that boolean condition inside the square brackets of the DataFrame.

In [26]:
# Build the True/False mask first, then index the DataFrame with it to keep
# only the rows where the mask is True.
over_10k_mask = homelessness["individuals"] > 10_000
data_gt_10k = homelessness[over_10k_mask]
print(data_gt_10k.head(5))

                region       state  individuals  family_members  state_pop
4              Pacific  California     109008.0         20964.0   39461588
9       South Atlantic     Florida      21443.0          9587.0   21244317
32        Mid-Atlantic    New York      39827.0         52070.0   19530351
37             Pacific      Oregon      11139.0          3337.0    4181886
43  West South Central       Texas      19199.0          6111.0   28628666


### Question:

Filter for rows where `family_members` is less than 1000 and `region` is "Pacific". Each condition must be wrapped in parentheses, and the two are combined with `&`.

In [28]:
# Name each condition separately, then combine them with & so that a row is
# kept only when both conditions are True.
few_family_members = homelessness["family_members"] < 1000
in_pacific = homelessness["region"] == "Pacific"
infor = homelessness[few_family_members & in_pacific]
print(infor)

    region   state  individuals  family_members  state_pop
1  Pacific  Alaska       1434.0           582.0     735139
