In [1]:
import numpy as np
import pandas as pd

# The fundamental data structures are Series and DataFrame

In [2]:
# Seed NumPy's global RNG so the four sample values are identical on every
# Restart & Run All. (The outputs recorded in this notebook came from an
# unseeded run, so the displayed numbers will differ after re-execution.)
np.random.seed(42)
s = pd.Series(np.random.randn(4), index=['a', 'f', 't', 'y'])

In [3]:
s

a    1.447413
f    0.788151
t   -0.051960
y    0.239071
dtype: float64

In [4]:
s['a']

1.4474134996302044

## The data argument can be a dict, an ndarray, or even a scalar, etc.

In [8]:
# Plain dict used as Series input: three integer values plus one string value.
data = dict(a=3, b=4, c=5, f='something_else')

In [9]:
data

{'a': 3, 'b': 4, 'c': 5, 'f': 'something_else'}

In [10]:
# A dict's keys become the Series index automatically, so no explicit
# index argument is needed to get one entry per key.
s_dict = pd.Series(data)

In [11]:
s_dict

a                 3
b                 4
c                 5
f    something_else
dtype: object

## Dict keys that are not listed in `index` are dropped from the Series.
## An index label that has no matching dict key gets NaN as its value.

In [16]:
s_dict_with_nan = pd.Series(data, index = ['a','b', 'j'])

In [17]:
s_dict_with_nan

a    3.0
b    4.0
j    NaN
dtype: float64

## Works differently with scalars

In [25]:
# A scalar is repeated for every label of the supplied index.
s_scalar = pd.Series(data=3, index=range(0, 5))

This works the same way even if you pass a list with one element, like `[3]` (at least in the pandas version used here), but it fails if you pass `[3, 4]`, because the index expects 5 values and not 2.

In [26]:
s_scalar

0    3
1    3
2    3
3    3
4    3
dtype: int64

## A Series behaves much like a NumPy ndarray. I do not have a lot of experience with NumPy, so I can't comment on its full capabilities, but as far as I know you can apply vectorized operations to get better performance, and slice a Series the same way you slice an ndarray.

In [32]:
s

a    1.447413
f    0.788151
t   -0.051960
y    0.239071
dtype: float64

## Note: putting the Python data type in a variable name (e.g. `s_dict`) is not good practice, so try not to do it.

In [29]:
s[s>s.median()]

a    1.447413
f    0.788151
dtype: float64

In [31]:
# Select by position explicitly with .iloc. Plain [] with integer keys on a
# label-indexed Series relies on a positional fallback that newer pandas
# versions deprecate, so the intent is spelled out here.
s.iloc[[3, 0, 1]]

y    0.239071
a    1.447413
f    0.788151
dtype: float64

In [35]:
np.exp(s)

a    4.252102
f    2.199325
t    0.949367
y    1.270069
dtype: float64

In [36]:
s.values

array([ 1.4474135 ,  0.78815063, -0.05195973,  0.23907112])

In [37]:
s.keys

<bound method Series.keys of a    1.447413
f    0.788151
t   -0.051960
y    0.239071
dtype: float64>

In [38]:
s.keys()

Index(['a', 'f', 't', 'y'], dtype='object')

In [39]:
s.index


Index(['a', 'f', 't', 'y'], dtype='object')

In [43]:
# 'g' is not one of the index labels, so the lookup below raises KeyError;
# the handler only reports that it happened.
try:
    some_random_var = s['g']
except KeyError:
    print('Caught key Error')
    

Caught key Error


In [46]:
s.f # Can also access elements this way

0.78815063277592512

In [47]:
s.a

1.4474134996302044

In [48]:
s

a    1.447413
f    0.788151
t   -0.051960
y    0.239071
dtype: float64

### Vectorized operations - the beginning of the end for MATLAB (RIP)

In [49]:
s+s          

a    2.894827
f    1.576301
t   -0.103919
y    0.478142
dtype: float64

In [50]:
s*2

a    2.894827
f    1.576301
t   -0.103919
y    0.478142
dtype: float64

In [51]:
s*3

a    4.342240
f    2.364452
t   -0.155879
y    0.717213
dtype: float64

In [52]:
s/2

a    0.723707
f    0.394075
t   -0.025980
y    0.119536
dtype: float64

# Moving to Pandas DataFrame

### According to the definition from https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dsintro

<font size=4>__DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. Like Series, DataFrame accepts many different kinds of input:__</font>

- Dict of 1D ndarrays, lists, dicts, or Series
- 2-D numpy.ndarray
- Structured or record ndarray
- A Series
- Another DataFrame 

<font size=4>__Along with the data, you can optionally pass index (row labels) and columns (column labels) arguments.__ </font>

In [68]:
# Two Series with only partly overlapping labels ('a' and 'b' are shared),
# used as the column inputs for the DataFrame built in the next cell.
data = dict(
    one=pd.Series([1, 2, 3, 4], index=list('abcd')),
    two=pd.Series([3, 4, 5, 56, 6], index=list('abfey')),
)

In [69]:
df = pd.DataFrame(data)

In [70]:
df


Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0
c,3.0,
d,4.0,
e,,56.0
f,,5.0
y,,6.0


As you can see, it merged the two indexes together (their sorted union) and filled the missing values with NaN.

In [76]:
df_1 = pd.DataFrame(data, index=['a','b','c','f'])

In [77]:
df_1

Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0
c,3.0,
f,,5.0


In [79]:
df_2 = pd.DataFrame(data, columns=['two'])

In [80]:
df_2

Unnamed: 0,two
a,3
b,4
f,5
e,56
y,6


In [81]:
df

Unnamed: 0,one,two
a,1.0,3.0
b,2.0,4.0
c,3.0,
d,4.0,
e,,56.0
f,,5.0
y,,6.0


In [84]:
df.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'y'], dtype='object')

In [85]:
df.columns

Index(['one', 'two'], dtype='object')

In [103]:
# Column-oriented input: each key is a column name and each list holds
# that column's values (both lists have four entries).
data_for_dict = dict(
    one=[1, 2, 4, 5],
    two=[2., 5.4, 4., 5],
)

In [104]:
df_from_dict = pd.DataFrame(data_for_dict)

In [106]:
df_from_dict['one']

array([1, 2, 4, 5], dtype=int64)

In [124]:
data = np.ones((2,8))
data

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])

In [125]:
# An ndarray has no dict-like keys() method, so this call raises
# AttributeError. Catch and print it so the demonstration does not stop
# a Restart & Run All, like the other error demos in this notebook.
try:
    data.keys()
except AttributeError as err:
    print(err)

AttributeError: 'numpy.ndarray' object has no attribute 'keys'

In [111]:
s = pd.Series([3,4,5,5])

In [112]:
s

0    3
1    4
2    5
3    5
dtype: int64

In [119]:
s.values

array([3, 4, 5, 5], dtype=int64)

In [117]:
s.keys()

RangeIndex(start=0, stop=4, step=1)

__A difference to note here is that a Series has a `values` attribute and a `keys()` method, whereas an ndarray has neither. Good to know.__

In [126]:
rows = [[1,2,3,43],[3,5,6,6],[66,6,6]]

In [127]:
df_rows = pd.DataFrame(rows)

In [128]:
df_rows

Unnamed: 0,0,1,2,3
0,1,2,3,43.0
1,3,5,6,6.0
2,66,6,6,


In [130]:
df_rows_with_index = pd.DataFrame(rows, index=['first', 'second','third'])

In [131]:
df_rows_with_index    # The number of index labels must always match the number of rows; otherwise a shape error is raised

Unnamed: 0,0,1,2,3
first,1,2,3,43.0
second,3,5,6,6.0
third,66,6,6,


In [159]:

# The two column lists have different lengths (5 vs 4), which the dict
# constructor rejects with a ValueError; report the error and its message.
try:
    df_test = pd.DataFrame(dict(one=[2, 23, 4, 5, 56], two=[1, 3, 4, 4]))
except ValueError as err:
    print('Value error raised', err, sep='\n')

Value error raised
arrays must all be same length


In [160]:
df_test_2 = pd.DataFrame([[1,2,3,4],[2,3,3]])

In [161]:
df_test_2

Unnamed: 0,0,1,2,3
0,1,2,3,4.0
1,2,3,3,


In [162]:
# Row-oriented input with a short second row, plus explicit column labels.
df_test_3 = pd.DataFrame(
    data=[[1, 2, 3, 4], [2, 3, 3]],
    columns='one two three four'.split(),
)

In [163]:
df_test_3

Unnamed: 0,one,two,three,four
0,1,2,3,4.0
1,2,3,3,


As we can see from the test DataFrames above, when we pass a dictionary as data to DataFrame, the arrays need to be of the same length. But if we pass a list of rows, the shorter rows are padded with NaN.
<br>
Strange; I still need to find out the reasoning behind this.


In [169]:
# df_test_4 is not created by any cell shown in this notebook, so on a fresh
# kernel this display would raise NameError. Rebuild it from the ragged rows
# visible in the recorded output (the short second row is padded with NaN).
df_test_4 = pd.DataFrame([[3, 5, 5, 6], [5, 5, 6]])
df_test_4

Unnamed: 0,0,1,2,3
0,3,5,5,6.0
1,5,5,6,
