# Introduction to Data Structures.

In [1]:
import numpy as np

import pandas as pd

## Series

In [2]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d","e"])
s

a   -2.348531
b    0.881603
c   -0.147338
d   -0.360273
e    0.390224
dtype: float64

In [3]:
# Build a Series from a dict: the keys become the index labels, kept in
# insertion order, and the values become the data.
d = dict(b=1, a=0, c=2)
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [5]:
#changing the series index.

d = {"a": 0.0, "b": 1.0, "c": 2.0}

pd.Series(d)

pd.Series(d, index=["b", "c", "d", "a"])

b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

## Series is ndarray-like

In [8]:
s.iloc[0]

-2.3485314236258703

In [9]:
s.iloc[:3]

a   -2.348531
b    0.881603
c   -0.147338
dtype: float64

In [10]:
s[s > s.median()]

b    0.881603
e    0.390224
dtype: float64

In [11]:
s.iloc[[4, 3, 1]]

e    0.390224
d   -0.360273
b    0.881603
dtype: float64

In [12]:
np.exp(s)

a    0.095509
b    2.414766
c    0.863002
d    0.697486
e    1.477312
dtype: float64

In [13]:
s.array

<PandasArray>
[ -2.3485314236258703,   0.8816025798258389, -0.14733835744763515,
  -0.3602725080299774,  0.39022406072431487]
Length: 5, dtype: float64

In [15]:
s["e"]

0.39022406072431487

In [16]:
s["e"] = 12
s["e"]

12.0

In [17]:
"e" in s

True

In [18]:
"f" in s

False

### The Series name attribute

In [19]:
s = pd.Series(np.random.randn(5), name="something")
s

0    1.436688
1    0.944048
2   -0.308283
3   -0.880457
4   -1.485777
Name: something, dtype: float64

In [20]:
s2 = s.rename("different name")
s2.name

'different name'

## DataFrame

In [21]:
# Dict of Series: the resulting row index is the union of the Series
# indexes, so column "one" has no value (NaN) for row "d".
one = pd.Series([1.0, 2.0, 3.0], index=list("abc"))
two = pd.Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
d = {"one": one, "two": two}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [23]:
pd.DataFrame(d, index=["d", "b", "a"])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [24]:
pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


### From Dict of ndarrays / lists

In [25]:
d = {
    "one": [1.0, 2.0, 3.0, 4.0],
    "two": [4.0, 3.0, 2.0, 1.0]
}
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [26]:
pd.DataFrame(d, index = ["a", "b", "c", "d"])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


### From structured or record array

In [28]:
# A structured (record) array is handled identically to a dict of arrays:
# each named field becomes a column.
# "S10" is a fixed-width 10-byte string field. The older "a10" spelling is an
# alias for the same dtype that NumPy 2.0 deprecated, so "S10" is used here.
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])

# Fill both records at once; the str values are stored as bytes in field "C".
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]

pd.DataFrame(data)

Unnamed: 0,A,B,C
0,1,2.0,b'Hello'
1,2,3.0,b'World'


In [29]:
pd.DataFrame(data, index=["first", "second"])

Unnamed: 0,A,B,C
first,1,2.0,b'Hello'
second,2,3.0,b'World'


In [32]:
pd.DataFrame(data, columns=["C", "A", "B"])

Unnamed: 0,C,A,B
0,b'Hello',1,2.0
1,b'World',2,3.0


In [33]:
#from a list of dicts.

data2 = [{"a":1, "b":2}, {"a":5, "b":10, "c":20}]

pd.DataFrame(data2)

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [34]:
pd.DataFrame(data2, index = ["first", "second"])

Unnamed: 0,a,b,c
first,1,2,
second,5,10,20.0


In [36]:
pd.DataFrame(data2, columns=["a", "b"])

Unnamed: 0,a,b
0,1,2
1,5,10


In [37]:
#from a dictionary of tuples.

pd.DataFrame(
    {
        ("a", "b"): {("A", "B"): 1, ("A", "C"): 2},
        ("a", "a"): {("A", "C"): 3, ("A", "B"): 4},
        ("a", "c"): {("A", "B"): 5, ("A", "C"): 6},
        ("b", "a"): {("A", "C"): 7, ("A", "B"): 8},
        ("b", "b"): {("A", "D"): 9, ("A", "B"): 10},
    }
)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [38]:
# From a list of namedtuples.
# The namedtuple field names supply the column labels; the remaining rows may
# be namedtuples or plain tuples.
from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])

pd.DataFrame(
    [
        Point(0, 0),
        Point(0, 3),
        (2, 3),
    ]
)

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


In [41]:
# The typename passed to namedtuple matches the variable it is bound to, so
# reprs read "Point3D(...)" rather than "point3D(...)".
Point3D = namedtuple("Point3D", "x y z")

# Point has only two fields, so the last row has no "z" value (NaN).
pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)])

Unnamed: 0,x,y,z
0,0,0,0.0
1,0,3,5.0
2,2,3,


In [47]:
#From a list of dataclasses.
#All values in the list should be dataclasses; mixing types in the list would result in a TypeError.

from dataclasses import make_dataclass

Point = make_dataclass("Point", [("x", int), ("y", int)])

pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])


#Missing data.
#To construct a DataFrame with missing data, use np.nan to represent the missing values.
#Alternatively, a numpy.MaskedArray can be passed as the data argument.

Unnamed: 0,x,y
0,0,0
1,0,3
2,2,3


### Alternate Constructors

In [48]:
#DataFrame.from_dict() takes a dict of dicts or a dict of array-like sequences and returns a DataFrame.

pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]))

Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [49]:
# passing the 'orient="index"', the keys will be the row labels

pd.DataFrame.from_dict(
    dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]),
    orient="index",
    columns=["one", "two", "three"],
)

Unnamed: 0,one,two,three
A,1,2,3
B,4,5,6


In [50]:
#DataFrame.from_records() takes a list of tuples or an ndarray with a structured dtype.
#The resulting index may be a specific field of the structured dtype.

data

array([(1, 2., b'Hello'), (2, 3., b'World')],
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [51]:
pd.DataFrame.from_records(data, index="C")

Unnamed: 0_level_0,A,B
C,Unnamed: 1_level_1,Unnamed: 2_level_1
b'Hello',1,2.0
b'World',2,3.0


## Column selection, addition, deletion.

In [52]:
df["one"]

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [53]:
df["three"] = df["one"] * df["two"]

df["flag"] = df["one"] > 2

df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [54]:
#columns can be deleted or popped like with a dict.

del df["two"]

three = df.pop("three")

df

Unnamed: 0,one,flag
a,1.0,False
b,2.0,False
c,3.0,True
d,,False


In [55]:
#when inserting a scalar value, it will naturally be propagated to fill the column.

df["foo"] = "bar"

df

Unnamed: 0,one,flag,foo
a,1.0,False,bar
b,2.0,False,bar
c,3.0,True,bar
d,,False,bar


In [56]:
#when inserting a Series that does not have the same index as the DataFrame, it will be conformed to the DataFrame's index:
df["one_trunc"] = df["one"][:2]

df

Unnamed: 0,one,flag,foo,one_trunc
a,1.0,False,bar,1.0
b,2.0,False,bar,2.0
c,3.0,True,bar,
d,,False,bar,


In [57]:
#inserting a column at a particular location in the columns.
df.insert(1, "bar", df["one"])

df

Unnamed: 0,one,bar,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,2.0,False,bar,2.0
c,3.0,3.0,True,bar,
d,,,False,bar,


### Assigning new columns in method chains

In [None]:
# NOTE(review): assumes data/iris.data has a header row with "SepalLength"
# and "SepalWidth" columns — confirm against the file.
iris = pd.read_csv("data/iris.data")

# assign() always returns a copy of the data; it does not overwrite the
# original frame, so `iris` itself gains no new column here.
iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head()

In [60]:
dfa = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))

# Keyword order matters: the callable for D can read column C because C is
# assigned earlier within the same assign() call.
dfa.assign(
    C=lambda frame: frame["A"] + frame["B"],
    D=lambda frame: frame["A"] + frame["C"],
)

Unnamed: 0,A,B,C,D
0,1,4,5,6
1,2,5,7,9
2,3,6,9,12


### Data Alignment and Arithmetic

In [61]:
df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])

df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"])

df + df2

Unnamed: 0,A,B,C,D
0,1.137137,0.246449,-2.86561,
1,2.121082,-1.749484,-1.309332,
2,-0.890385,1.425191,3.626987,
3,-2.549004,0.608747,0.777934,
4,1.045384,0.492224,1.437955,
5,-0.380074,-0.95569,-0.510472,
6,0.0789,-0.818439,1.32481,
7,,,,
8,,,,
9,,,,


In [62]:
df - df.iloc[0]

Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,1.44899,-1.905221,1.030711,-1.146167
2,-1.365968,0.628691,3.417168,-0.206943
3,-2.372246,-1.333023,0.535925,-0.47334
4,0.180878,-1.328163,2.519658,-1.37083
5,-1.305837,-1.639347,0.268555,-0.834529
6,-1.042764,-1.042652,0.325732,-0.496415
7,-1.150808,-0.334523,1.452367,-0.731226
8,0.952467,-2.35844,0.497821,-1.158092
9,0.789616,-0.493567,1.732738,-2.911254


In [63]:
#arithmetic operations with scalars operate element-wise.

df * 5 + 2

Unnamed: 0,A,B,C,D
0,3.549534,5.799237,-1.8583,5.748434
1,10.794482,-3.726868,3.295254,0.017599
2,-3.280304,8.942694,15.227541,4.713716
3,-8.311698,-0.86588,0.821325,3.381734
4,4.453921,-0.841577,10.739988,-1.105718
5,-2.979651,-2.397498,-0.515528,1.575788
6,-1.664288,0.585977,-0.229638,3.266357
7,-2.204508,4.12662,5.403535,2.092304
8,8.31187,-5.992961,0.630805,-0.042025
9,7.497613,3.331402,6.805389,-8.807834


In [64]:
1 / df

Unnamed: 0,A,B,C,D
0,3.226778,1.316054,-1.295907,1.33389
1,0.568538,-0.873078,3.860246,-2.522195
2,-0.946915,0.720182,0.377999,1.842492
3,-0.484886,-1.744665,-4.242051,3.618641
4,2.037555,-1.759586,0.572083,-1.609934
5,-1.004086,-1.13701,-1.987655,-11.786548
6,-1.364522,-3.536009,-2.242517,3.948335
7,-1.1892,2.351149,1.469061,54.168741
8,0.792158,-0.62555,-3.65178,-2.44855
9,0.909486,3.755439,1.040499,-0.462627


In [65]:
df ** 4

Unnamed: 0,A,B,C,D
0,0.009224,0.333354,0.354572,0.315878
1,9.571082,1.721031,0.004503,0.02471073
2,1.243815,3.717338,48.981999,0.08677164
3,18.090093,0.107932,0.003088,0.005832005
4,0.058018,0.104318,9.336051,0.1488566
5,0.98382,0.598332,0.064067,5.181475e-05
6,0.288455,0.006397,0.039542,0.004114756
7,0.500012,0.032725,0.214704,1.161462e-07
8,2.539523,6.530567,0.005623,0.02782043
9,1.46156,0.005028,0.853167,21.83105


In [66]:
# Two boolean frames, written with explicit True/False values, used to
# demonstrate the element-wise logical operators.
df1 = pd.DataFrame(
    {
        "a": [True, False, True],
        "b": [False, True, True],
    }
)

df2 = pd.DataFrame(
    {
        "a": [False, True, True],
        "b": [True, True, False],
    }
)

df1 & df2

Unnamed: 0,a,b
0,False,False
1,False,True
2,True,False


In [67]:
df1 | df2

Unnamed: 0,a,b
0,True,True
1,True,True
2,True,True


In [68]:
df1 ^ df2

Unnamed: 0,a,b
0,True,True
1,True,False
2,False,True


In [69]:
-df1

Unnamed: 0,a,b
0,False,True
1,True,False
2,False,False


### Transposing

In [70]:
df[:5]

Unnamed: 0,A,B,C,D
0,0.309907,0.759847,-0.77166,0.749687
1,1.758896,-1.145374,0.259051,-0.39648
2,-1.056061,1.388539,2.645508,0.542743
3,-2.06234,-0.573176,-0.235735,0.276347
4,0.490784,-0.568315,1.747998,-0.621144


In [71]:
df[:5].T

Unnamed: 0,0,1,2,3,4
A,0.309907,1.758896,-1.056061,-2.06234,0.490784
B,0.759847,-1.145374,1.388539,-0.573176,-0.568315
C,-0.77166,0.259051,2.645508,-0.235735,1.747998
D,0.749687,-0.39648,0.542743,0.276347,-0.621144


### DataFrame interoperability with Numpy Functions

In [72]:
np.exp(df)

Unnamed: 0,A,B,C,D
0,1.363298,2.13795,0.462245,2.116337
1,5.806026,0.318105,1.2957,0.672684
2,0.347823,4.008988,14.090605,1.720721
3,0.127156,0.563732,0.78999,1.318305
4,1.633597,0.566479,5.743091,0.53733
5,0.36938,0.414991,0.60465,0.918657
6,0.480534,0.753667,0.64023,1.288233
7,0.431321,1.530086,1.975274,1.018632
8,3.533801,0.202181,0.760454,0.66471
9,3.002732,1.305101,2.614513,0.115145


In [73]:
np.asarray(df)

array([[ 0.30990671,  0.75984734, -0.77166003,  0.74968674],
       [ 1.75889638, -1.14537353,  0.25905084, -0.39648012],
       [-1.05606085,  1.38853881,  2.64550828,  0.54274327],
       [-2.0623397 , -0.57317597, -0.23573502,  0.27634682],
       [ 0.49078426, -0.56831539,  1.7479975 , -0.62114361],
       [-0.99593016, -0.87949952, -0.50310552, -0.08484248],
       [-0.73285763, -0.28280469, -0.44592753,  0.25327132],
       [-0.84090154,  0.42532402,  0.68070691,  0.01846083],
       [ 1.26237399, -1.59859229, -0.27383909, -0.40840492],
       [ 1.09952269,  0.26628047,  0.96107778, -2.16156679]])

In [74]:
ser = pd.Series([1, 2, 3, 4])

np.exp(ser)

0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64

In [75]:
ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"])

ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"])

ser1

a    1
b    2
c    3
dtype: int64

In [76]:
np.remainder(ser1, ser2)

a    1
b    0
c    3
dtype: int64

In [77]:
# When a binary ufunc is applied to a Series and an Index, the Series implementation
# takes precedence and a Series is returned.

ser = pd.Series([1, 2, 3])

idx = pd.Index([4, 5, 6])

np.maximum(ser, idx)

0    4
1    5
2    6
dtype: int64

- You can change how much to print on a single row by setting the `display.width` option, e.g. `pd.set_option("display.width", 40)` (the default is 80).

### DataFrame column attribute access and IPython completion.

In [78]:
df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)})

df

Unnamed: 0,foo1,foo2
0,1.909656,-1.028145
1,-1.028719,-0.015159
2,-1.355703,0.460909
3,-1.301039,0.35491
4,0.251816,-0.771943


In [79]:
df.foo1

0    1.909656
1   -1.028719
2   -1.355703
3   -1.301039
4    0.251816
Name: foo1, dtype: float64