In [1]:
import pandas as pd
import numpy as np

In [None]:
# Optional: uncomment the next line to upgrade pandas before running the notebook.
# !pip3 install pandas --upgrade 

In [3]:
pd.__version__

'1.0.0'

# What is Pandas?

Pandas can be thought of as an enhanced version of NumPy arrays: rows and columns can be identified with labels instead of just simple integer indices.

There are **3** main pandas elements we **need** to understand.
1. Pandas Series
2. Pandas DataFrame
3. Index

# The Pandas Series

A pandas series is a one-dimensional (**1-D**) indexed array.

In [6]:
pd

<module 'pandas' from '/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/__init__.py'>

In [7]:
# Give the empty Series an explicit dtype: a bare pd.Series() emits a
# DeprecationWarning on pandas 1.x and defaults to dtype object in later releases.
pd.Series(dtype='float64')

  pd.Series()


Series([], dtype: float64)

In [8]:
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [10]:
np.array([1,2,3])[1]

2

In [11]:
pd.Series([1,2,3])[1]

2

In [12]:
pd.Series(['a','b','c'])

0    a
1    b
2    c
dtype: object

In [13]:
pd.Series(['a', 2, 3])

0    a
1    2
2    3
dtype: object

In [16]:
type(pd.Series(['a', 2, 3])[0])

str

In [19]:
# Six integers with the default integer index; reused by the cells below.
data = pd.Series([10, 23, 3, 43, 25, 136])

In [23]:
data

0     10
1     23
2      3
3     43
4     25
5    136
dtype: int64

## Values and Indexes

In [25]:
type(data)

pandas.core.series.Series

In [26]:
data

0     10
1     23
2      3
3     43
4     25
5    136
dtype: int64

In [27]:
data.values

array([ 10,  23,   3,  43,  25, 136])

In [28]:
data.index

RangeIndex(start=0, stop=6, step=1)

In [30]:
data.index.values

array([0, 1, 2, 3, 4, 5])

In [31]:
type(data.values)

numpy.ndarray

In [32]:
data.index

RangeIndex(start=0, stop=6, step=1)

In [34]:
type(data.index)

pandas.core.indexes.range.RangeIndex

## Accessing elements 

Elements of a Series can be accessed just like those of a NumPy array.

In [35]:
data

0     10
1     23
2      3
3     43
4     25
5    136
dtype: int64

In [36]:
data[0]

10

In [37]:
data[4:]

4     25
5    136
dtype: int64

In short: a pandas Series can be thought of as a 1-D NumPy array.

### What is the difference then? Numpy array vs Pandas Series

Mostly the index notation.

NumPy arrays only have the **implicit** index given by each element's position. By using an **explicit** index, pandas Series are much more flexible. For example:

## Index labels don't need to be numbers

In [38]:
pd.Series([1,2,3])

0    1
1    2
2    3
dtype: int64

In [44]:
# Hand-picked integer labels: they are neither contiguous nor ordered,
# so a lookup such as x[10] goes by label, not by position.
x = pd.Series([1, 2, 3], index=[0, 10, 2])
x

0     1
10    2
2     3
dtype: int64

In [47]:
x[10]

2

In [50]:
# Arbitrary strings work as labels too.
data = pd.Series(
    [1, 2, 3, 4],
    index=['ajaiofj', 'b', 'caksok', 'd'],
)

In [58]:
data.index.values

array(['ajaiofj', 'b', 'caksok', 'd'], dtype=object)

### Then how can I access this pandas series?

In [53]:
data['ajaiofj']

1

## Index labels don't need to be in sequence

In [61]:
# Integer labels with gaps and no particular order.
data = pd.Series(
    [1, 2, 3, 4],
    index=[1, 7, 4313, 19],
)

In [62]:
data

1       1
7       2
4313    3
19      4
dtype: int64

### One can think of a pandas series, then, as a form of dictionary

Let's create a pandas series from a dict.

In [64]:
data.keys()

Int64Index([1, 7, 4313, 19], dtype='int64')

In [66]:
# Plain mapping of name -> value; the keys become the Series labels in the next cell.
my_dict = dict(FERNANDA=0, ANDRE=10)

In [68]:
pd.Series(my_dict)

FERNANDA     0
ANDRE       10
dtype: int64

In [69]:
my_dict = {0: 0, 1:10}
pd.Series(my_dict)

0     0
1    10
dtype: int64

In [226]:
# NOTE: the key 'FERNANDA' is written twice. A dict keeps a single entry per key,
# so the later value (-10) replaces the earlier 0 and the Series ends up with two rows.
my_dict = {'FERNANDA': 0, 'ANDRE':20, 'FERNANDA': -10}
data = pd.Series(my_dict)

In [227]:
# Positional access: go through .iloc so the integer is never treated as a label.
# The index here holds strings, and plain [0] on such a Series relies on a
# deprecated positional fallback.
pd.Series(my_dict).iloc[0]

-10

# What about > 1-D?


# Pandas DataFrame


Pandas DataFrames can be thought of as a generalization of **2-D** NumPy arrays. However, again, they bring flexibility to both the row indices and the column names.

In [228]:
pd.DataFrame()

In [229]:
type(pd.DataFrame())

pandas.core.frame.DataFrame

## Pandas DataFrame as a set of Pandas Series

In [230]:
data

FERNANDA   -10
ANDRE       20
dtype: int64

In [231]:
# A second name -> value mapping, wrapped in its own Series.
another_dict = dict(FERNANDA=-20, ANDRE=22)
data_2 = pd.Series(another_dict)

# Create dataframe as a collection of Series

In [232]:
# Each Series becomes one column; rows are matched on the shared labels.
pd.DataFrame(dict(a=data, b=data_2))

Unnamed: 0,a,b
FERNANDA,-10,-20
ANDRE,20,22


In [99]:
pd.DataFrame(data=[1,2,3])

Unnamed: 0,0
0,1
1,2
2,3


In [106]:
pd.DataFrame(data=[1,2,3], columns=['ironhack'])

Unnamed: 0,ironhack
0,1
1,2
2,3


In [110]:
np.array([1,2,3])

array([1, 2, 3])

In [109]:
np.array([[1,2,3],[-5,-6,-7]])

array([[ 1,  2,  3],
       [-5, -6, -7]])

In [111]:
pd.DataFrame(data=[[1,2,3],[-5,-6,-7]], columns=['ironhack','digital_house','lewagon'])

Unnamed: 0,ironhack,digital_house,lewagon
0,1,2,3
1,-5,-6,-7


In [118]:
type(np.nan)

float

In [122]:
# One list per column; the NaN entry makes the 'ironhack' column floating point.
df = pd.DataFrame(
    {
        'ironhack': [np.nan, -5],
        'digital_house': [2, -6],
        'lewagon': [3, -7],
        'outra': ['a', 'b'],
    }
)
df

Unnamed: 0,ironhack,digital_house,lewagon,outra
0,,2,3,a
1,-5.0,-6,-7,b


In [125]:
pd.DataFrame(data={'ironhack':[np.nan, -5], 'digital_house':[2,-6]}, index=['FERNANDA', 'ANDRE'])

Unnamed: 0,ironhack,digital_house
FERNANDA,,2
ANDRE,-5.0,-6


### Column names and indexes

In [134]:
df

Unnamed: 0,ironhack,digital_house,lewagon,outra
0,,2,3,a
1,-5.0,-6,-7,b


In [127]:
df['digital_house']

0    2
1   -6
Name: digital_house, dtype: int64

In [135]:
type(df['digital_house'])

pandas.core.series.Series

In [142]:
df[['digital_house','lewagon']]

Unnamed: 0,digital_house,lewagon
0,2,3
1,-6,-7


In [144]:
df[['ironhack']]

Unnamed: 0,ironhack
0,
1,-5.0


# Creating dataframes

In [147]:
pd.DataFrame(data=[[1,2,3,4],[3,4,6,7]])

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,3,4,6,7


## From a list

In [148]:
# `columns` must be a collection; a bare string raises TypeError.
# The error is the point of this cell, so catch it and show the message
# instead of letting it stop a top-to-bottom run of the notebook.
try:
    pd.DataFrame(data=[1, 2, 3, 4], columns='ironhack')
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Index(...) must be called with a collection of some kind, 'ironhack' was passed

In [149]:
# A one-element list of names is the accepted way to label a single column.
data = pd.DataFrame([1, 2, 3, 4], columns=['ironhack'])
data

Unnamed: 0,ironhack
0,1
1,2
2,3
3,4


In [150]:
data['ironhack']

0    1
1    2
2    3
3    4
Name: ironhack, dtype: int64

## From a Pandas Series

In [153]:
pd.DataFrame([pd.Series([1,2,4,5]), pd.Series([3,5,1,3])])

Unnamed: 0,0,1,2,3
0,1,2,4,5
1,3,5,1,3


## From a dictionary

In [155]:
# Keys are the column names, lists are the column values.
df = pd.DataFrame(
    {
        'ironhack': [1, 3, 5, 6, 3, 2],
        'digital': [1, 3, 2, 4, 3, 1],
    }
)

In [157]:
df[['ironhack']]

Unnamed: 0,ironhack
0,1
1,3
2,5
3,6
4,3
5,2


## From a dictionary composed of pandas Series

In [163]:
# Two Series with the same default index, one per future column.
serie_1, serie_2 = pd.Series([1, 2, 3]), pd.Series([3, 4, 6])

pd.DataFrame(dict(a=serie_1, b=serie_2))

Unnamed: 0,a,b
0,1,3
1,2,4
2,3,6


In [176]:
pd.DataFrame({'a': [1,35,67,3], 'b':[1,3,4,6]})

Unnamed: 0,a,b
0,1,1
1,35,3
2,67,4
3,3,6


## From a dictionary composed of lists

In [179]:
pd.DataFrame({'ironhack_students': ['Rodrigo','Rodrigo','Rodrigo'],
              'linkedin_picture_grade':[10, 10, 0]})

Unnamed: 0,ironhack_students,linkedin_picture_grade
0,Rodrigo,10
1,Rodrigo,10
2,Rodrigo,0


## From a numpy array

In [180]:
# Draw from a seeded generator so the 5x2 sample is identical on every run.
# Re-run the cells below to refresh their displayed values.
rng = np.random.default_rng(42)
a = rng.random(size=(5, 2))
a

array([[0.31536718, 0.10037219],
       [0.170803  , 0.62645801],
       [0.91071227, 0.65703016],
       [0.61966536, 0.64871687],
       [0.28300328, 0.70575633]])

In [181]:
pd.DataFrame(a)

Unnamed: 0,0,1
0,0.315367,0.100372
1,0.170803,0.626458
2,0.910712,0.65703
3,0.619665,0.648717
4,0.283003,0.705756


In [182]:
pd.DataFrame(a, columns=['a', 'b'])

Unnamed: 0,a,b
0,0.315367,0.100372
1,0.170803,0.626458
2,0.910712,0.65703
3,0.619665,0.648717
4,0.283003,0.705756


In [192]:
# Each Series is one row; its labels turn into the column names.
x = pd.DataFrame(
    [
        pd.Series([1, 2], index=['Fernanda', 'Andre']),
        pd.Series([4, 7], index=['Fernanda', 'Andre']),
    ]
)

In [197]:
x.transpose()

Unnamed: 0,0,1
Fernanda,1,4
Andre,2,7


In [203]:
x.mean()

Fernanda    2.5
Andre       4.5
dtype: float64

In [200]:
x.std()

Fernanda    2.121320
Andre       3.535534
dtype: float64

In [201]:
x.describe()

Unnamed: 0,Fernanda,Andre
count,2.0,2.0
mean,2.5,4.5
std,2.12132,3.535534
min,1.0,2.0
25%,1.75,3.25
50%,2.5,4.5
75%,3.25,5.75
max,4.0,7.0


In [206]:
x['Andre']


0    2
1    7
Name: Andre, dtype: int64

In [207]:
x.Andre

0    2
1    7
Name: Andre, dtype: int64

In [208]:
type(x)

pandas.core.frame.DataFrame

In [210]:
x.mean(axis=1)

0    1.5
1    5.5
dtype: float64

# Pandas Index

In [211]:
pd.Index([1,2,3])

Int64Index([1, 2, 3], dtype='int64')

In [212]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [213]:
data.index.values

array([0, 1, 2, 3])

In [223]:
# Build a 10-row frame whose row labels are random integers in [0, 1000),
# then keep the rows whose value lies strictly between 2 and 6.
# The generator is seeded so the labels are the same on every run.
rng = np.random.default_rng(0)
numbers = pd.DataFrame(
    list(range(10)),
    index=rng.integers(0, 1000, size=10),
    columns=['colname'],
)
filtered = numbers.query('colname > 2 and colname < 6')
filtered

Unnamed: 0,colname
658,3
329,4
807,5


In [224]:
pd.__version__

'1.0.0'

In [None]:
# Two names bound to the same integer literal; the next cells compare them
# with `==` (equal values) and with `is` (same object).
# NOTE: this rebinds `a`, which held a NumPy array earlier in the notebook.
a = 256
b = 256

In [None]:
a == b

In [None]:
a is b

In [None]:
# Same experiment with a larger integer literal.
# NOTE(review): whether `a is b` holds here presumably depends on how the
# interpreter caches/compiles the literals — the cell has no recorded output; confirm by running it.
a = 258
b = 258

In [None]:
a is b

In [None]:
# Identity experiment repeated with a short, identifier-like string literal.
a = 'python'
b = 'python'

In [None]:
a is b

In [None]:
# Identity experiment repeated with a string literal that contains a space.
# NOTE(review): the outcome of `a is b` is an interpreter detail and is not recorded here — confirm by running.
a = 'python 3'
b = 'python 3'

In [None]:
a is b