# Getting Started with pandas

pandas makes data cleaning and analysis fast and convenient in Python. It is designed for working with tabular or heterogeneous data.

Conventions:

```python
import numpy as np
import pandas as pd
```

## Index

## Introduction to pandas Data Structures

### Series

A Series is a one-dimensional array-like object containing a sequence of values and an associated array of data labels, called its index.

In [10]:
import pandas as pd

# A Series built from a plain list gets a default integer index (0..n-1).
obj = pd.Series([6, 2, -3, 9])
print(obj)

# Passing `index=` attaches custom labels instead (here 10, 20, 30, 40).
obj2 = pd.Series([5, 3, -6, 8], index=range(10, 50, 10))
# NOTE(review): this prints `obj` a second time; `obj2` may have been intended.
print(f"\n{obj}")
print(f"{obj2.index}")

# With a custom index, square brackets look values up by label.
value_at_30 = obj2[30]
print(f"Object n30: {value_at_30}")

# NumPy-like operations: comparing against a scalar gives a boolean Series,
# and indexing with that boolean Series keeps only the rows marked True.
is_positive = obj2 > 0
positive_rows = obj2[is_positive]
print(f"Greater than 0: \n{is_positive}"
      f"\nAnd filtering rows: \n{positive_rows}")

# `in` tests membership against the index labels.
has_label_55 = 55 in obj2
print(f"\nIs index 55 in obj2? \n{has_label_55}")

0    6
1    2
2   -3
3    9
dtype: int64

0    6
1    2
2   -3
3    9
dtype: int64
RangeIndex(start=10, stop=50, step=10)
Object n30: -6
Greater than 0: 
10     True
20     True
30    False
40     True
dtype: bool
And filtering rows: 
10    5
20    3
40    8
dtype: int64

Is index 55 in obj2? 
False


In [15]:
import pandas as pd

# Named `sdata` rather than `dict` so the builtin `dict` type is not shadowed.
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
# Dictionary to pandas Series: the keys become the index labels.
obj3 = pd.Series(sdata)

print(f"{obj3}")
# pandas Series back to a dictionary
print(f"\n{obj3.to_dict()}")

# Specifying the index explicitly (this introduces missing data)
states = ["California", "Ohio", "Oregon", "Texas"]
obj4 = pd.Series(sdata, index=states)
print(f"\n{obj4}")
# "California" has no entry in `sdata`, so its value is NaN (Not a Number).
# "Utah" has data but is excluded from `obj4` because it is not in `states`.

print(f"\nLooking for missing data: \n{pd.isna(obj4)}")
print(f"\nLooking for NOT missing data: \n{pd.notna(obj4)}")
# Getting only the rows with missing data
print(f"\nFiltering missing data: \n{obj4[obj4.isna()]}")

# Arithmetic between two Series aligns them by index label automatically;
# a label that is missing from either operand yields NaN in the result.
print(f"Obj3 + Obj4: \n{obj3 + obj4}")

# Both the Series itself and its index have a `name` attribute.
obj4.name = "population"
obj4.index.name = "state"
print(f"Obj4 with name attribute: \n{obj4}")

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

Looking for missing data: 
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

Looking for NOT missing data: 
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

Filtering missing data: 
California   NaN
dtype: float64
Obj3 + Obj4: 
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
Obj4 with name attribute: 
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


### DataFrame

A DataFrame is a table of data with an ordered, named collection of columns, each of which can hold a different value type; it has both a row index and a column index. You can create a DataFrame from a dictionary of equal-length lists or NumPy arrays.

When assigning new values to a column of a DataFrame, a list or array must match the length of the DataFrame. If we assign a Series, its labels will be realigned to the DataFrame's index, inserting missing values at any index labels not present in the Series.

In [21]:
import pandas as pd
from pandas import DataFrame

# Column-oriented input: each key is a column name, each list its values.
data = {
    "state": [
        "Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada",
        "Oregon", "Oregon", "Texas", "Texas",
    ],
    "year": [2000, 2001, 2002, 2001, 2002, 2003, 2002, 2003, 2004, 2005],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 2.4, 3.1, 3.3, 2.9],
}

# `columns=` fixes the column order when the DataFrame is built.
column_order = ["year", "state", "pop"]
df = DataFrame(data, columns=column_order)
print(f"{df}")

# For a large DataFrame, `head()` shows only the first 5 rows...
first_rows = df.head()
print(f"\n{first_rows}")
# ...and `tail()` shows only the last 5 rows.
last_rows = df.tail()
print(f"\n{last_rows}")

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
6  2002  Oregon  2.4
7  2003  Oregon  3.1
8  2004   Texas  3.3
9  2005   Texas  2.9

   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9

   year   state  pop
5  2003  Nevada  3.2
6  2002  Oregon  2.4
7  2003  Oregon  3.1
8  2004   Texas  3.3
9  2005   Texas  2.9


In [3]:
import pandas as pd

data = {
    "state": [
        "Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada",
        "Oregon", "Oregon", "Texas", "Texas",
    ],
    "year": [2000, 2001, 2002, 2001, 2002, 2003, 2002, 2003, 2004, 2005],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 2.4, 3.1, 3.3, 2.9],
}

# "debt" is not a key of `data`, so that column is created holding only
# missing values.
df2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
print(f"Columns df2: {df2.columns}")

# A column can be retrieved dict-style or as an attribute.
state_column = df2["state"]
year_column = df2.year
print(f"States: \n{state_column}"
      f"\nPrinting years: \n{year_column}")

# `iloc` selects a row by integer position.
third_row = df2.iloc[2]
print(f"\nRetrieving by position: \n{third_row}")

Columns df2: Index(['year', 'state', 'pop', 'debt'], dtype='object')
States: 
0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
6    Oregon
7    Oregon
8     Texas
9     Texas
Name: state, dtype: object
Printing years: 
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
6    2002
7    2003
8    2004
9    2005
Name: year, dtype: int64

Retrieving by position: 
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object


In [16]:
import pandas as pd
import numpy as np

data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada", "Oregon",
              "Oregon", "Texas", "Texas"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003, 2002, 2003, 2004, 2005],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2, 2.4, 3.1, 3.3, 2.9],
}
df2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])

# Overwriting the 'debt' column with an array of the same length as df2
df2["debt"] = np.arange(1.0, 11.0)

# Assigning to a column that does not exist creates it: here a boolean
# 'eastern' column that is True only for the 'Ohio' rows.
df2["eastern"] = df2["state"] == "Ohio"
print(f"{df2}")

# Removing the 'eastern' column again
del df2["eastern"]
print(f"\nDeleting 'eastern' column: {df2.columns}")

# NOTE: without Copy-on-Write, the column returned from indexing a DataFrame
# is a view, not a copy, so in-place modifications to that Series are
# reflected in the DataFrame. With Copy-on-Write enabled (the default
# starting with pandas 3.0) such modifications never propagate back; assign
# to the DataFrame column explicitly instead. In either mode, the Series
# `copy` method gives an independent copy of the column.

   year   state  pop  debt  eastern
0  2000    Ohio  1.5   1.0     True
1  2001    Ohio  1.7   2.0     True
2  2002    Ohio  3.6   3.0     True
3  2001  Nevada  2.4   4.0    False
4  2002  Nevada  2.9   5.0    False
5  2003  Nevada  3.2   6.0    False
6  2002  Oregon  2.4   7.0    False
7  2003  Oregon  3.1   8.0    False
8  2004   Texas  3.3   9.0    False
9  2005   Texas  2.9  10.0    False

Deleting 'eastern' column: Index(['year', 'state', 'pop', 'debt'], dtype='object')


p.138(152)