# Coding Practice Session 3
## Pandas DataFrames

In [33]:
import pandas as pd
import numpy as np

### Creating DataFrames

In [34]:
# Build each column as its own Series. The frame's row index becomes the
# union of both Series indexes, so a label missing from one column is NaN.
first_column = pd.Series([1, 2, 3], index=["A", "B", "C"])
second_column = pd.Series([1, 2, 3, 4], index=["A", "B", "C", "D"], dtype="f4")

df = pd.DataFrame({"First Column": first_column, "Second Column": second_column})

In [35]:
df

Unnamed: 0,First Column,Second Column
A,1.0,1.0
B,2.0,2.0
C,3.0,3.0
D,,4.0


In [36]:
# The first Series gets the default integer index (0, 1, 2) while the second
# is labelled "A".."D". No label is shared, so the union of the two indexes
# has seven rows and each row is populated in only one column.
default_indexed = pd.Series([1, 2, 3])
labelled = pd.Series([1, 2, 3, 4], index=["A", "B", "C", "D"], dtype="f4")

df = pd.DataFrame({"First Column": default_indexed, "Second Column": labelled})
df

Unnamed: 0,First Column,Second Column
0,1.0,
1,2.0,
2,3.0,
A,,1.0
B,,2.0
C,,3.0
D,,4.0


In [37]:
# Columns may be given as equal-length sequences (here a list and a uint8
# NumPy array); the explicit index replaces the default 0..n-1 row labels.
data = dict(
    One=[1, 2, 3, 4, 5],
    Two=np.array([5, 4, 3, 2, 1], dtype="u1"),
)

pd.DataFrame(data, index=range(80, 85))

Unnamed: 0,One,Two
80,1,5
81,2,4
82,3,3
83,4,2
84,5,1


In [38]:
# A list of records: each dict becomes one row and its keys the column names.
data = [
    dict(Name="Arad", Major="Data Science", Age=23),
    dict(Name="Sarah", Major="Bio Technology", Age=21),
]

pd.DataFrame(data, index=[9823084, 9914016])

Unnamed: 0,Name,Major,Age
9823084,Arad,Data Science,23
9914016,Sarah,Bio Technology,21


In [39]:
# A named Series converts to a one-column frame: the Series name becomes the
# column label and its index supplies the row labels.
grades = pd.Series(
    data=[16.75, 18, 19.2],
    index=(10125, 45821, 74156),
    name="Grades",
)

grades.to_frame()

Unnamed: 0,Grades
10125,16.75
45821,18.0
74156,19.2


In [40]:
# Sample frame reused by the cells that follow.
names = ["Paul", "Johnny", "Maxim", "Ringo"]
cities = ["New York City", "London", "Sydney", "Warsaw"]
ages = (51, 49, 28, 63)

df = pd.DataFrame({"Name": names, "City": cities, "Age": ages})

df

Unnamed: 0,Name,City,Age
0,Paul,New York City,51
1,Johnny,London,49
2,Maxim,Sydney,28
3,Ringo,Warsaw,63


### DataFrame Attributes and Methods

In [41]:
df

Unnamed: 0,Name,City,Age
0,Paul,New York City,51
1,Johnny,London,49
2,Maxim,Sydney,28
3,Ringo,Warsaw,63


In [42]:
df.shape

(4, 3)

In [45]:
df.columns

Index(['Name', 'City', 'Age'], dtype='object')

In [46]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [47]:
df.dtypes

Name    object
City    object
Age      int64
dtype: object

In [48]:
type(df.dtypes)

pandas.core.series.Series

In [49]:
df.values

array([['Paul', 'New York City', 51],
       ['Johnny', 'London', 49],
       ['Maxim', 'Sydney', 28],
       ['Ringo', 'Warsaw', 63]], dtype=object)

In [50]:
type(df.values)

numpy.ndarray

In [51]:
df

Unnamed: 0,Name,City,Age
0,Paul,New York City,51
1,Johnny,London,49
2,Maxim,Sydney,28
3,Ringo,Warsaw,63


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   City    4 non-null      object
 2   Age     4 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [None]:
df.describe()

Unnamed: 0,Age
count,4.0
mean,47.75
std,14.545904
min,28.0
25%,43.75
50%,50.0
75%,54.0
max,63.0


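By default, on a DataFrame with mixed dtypes, `df.describe()` summarizes only the numeric columns (int, float, etc.). Pass `include="all"` (or, for example, `include="object"`) to also summarize non-numeric columns such as `Name` and `City`.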

In [57]:
df.head(2)

Unnamed: 0,Name,City,Age
0,Paul,New York City,51
1,Johnny,London,49


In [58]:
df.tail(2)

Unnamed: 0,Name,City,Age
2,Maxim,Sydney,28
3,Ringo,Warsaw,63


In [61]:
df.sort_values(by="Age", ascending=False)

Unnamed: 0,Name,City,Age
3,Ringo,Warsaw,63
0,Paul,New York City,51
1,Johnny,London,49
2,Maxim,Sydney,28


In [62]:
df.sort_values(by="Age")

Unnamed: 0,Name,City,Age
2,Maxim,Sydney,28
1,Johnny,London,49
0,Paul,New York City,51
3,Ringo,Warsaw,63


In [None]:
df.sort_values(by=["Age", "Name"])

Unnamed: 0,Name,City,Age
2,Maxim,Sydney,28
1,Johnny,London,49
0,Paul,New York City,51
3,Ringo,Warsaw,63


When two or more rows have the same value in the `Age` column, the second sort key (here `Name`) is used to order them. In this DataFrame all ages are distinct, so the result is the same as sorting by `Age` alone.

In [64]:
df.sort_index()

Unnamed: 0,Name,City,Age
0,Paul,New York City,51
1,Johnny,London,49
2,Maxim,Sydney,28
3,Ringo,Warsaw,63


In [65]:
df.sort_index(ascending=False)

Unnamed: 0,Name,City,Age
3,Ringo,Warsaw,63
2,Maxim,Sydney,28
1,Johnny,London,49
0,Paul,New York City,51


In [66]:
df.transpose()

Unnamed: 0,0,1,2,3
Name,Paul,Johnny,Maxim,Ringo
City,New York City,London,Sydney,Warsaw
Age,51,49,28,63


In [67]:
df.T

Unnamed: 0,0,1,2,3
Name,Paul,Johnny,Maxim,Ringo
City,New York City,London,Sydney,Warsaw
Age,51,49,28,63


### Data Alignment and Arithmetic

In [78]:
# Two 5x3 integer frames that share only the columns "B" and "C"
# (df1 draws from 1..9, df2 from 100..109; the upper bound is exclusive).
# No seed is set in the visible notebook, so the values differ on every run
# and the tables shown below are just one sample.
df1 = pd.DataFrame(np.random.randint(1, 10, size=(5, 3)), columns=["A", "B", "C"])
df2 = pd.DataFrame(np.random.randint(100, 110, size=(5, 3)), columns=["B", "C", "D"])

In [79]:
df1

Unnamed: 0,A,B,C
0,2,8,8
1,5,2,8
2,5,5,7
3,6,5,7
4,4,4,6


In [80]:
df2

Unnamed: 0,B,C,D
0,108,108,104
1,108,107,109
2,101,108,100
3,105,107,101
4,108,109,106


In [82]:
# Addition aligns on both row and column labels. Columns that exist in only
# one operand ("A" and "D") come out entirely NaN.
result = df1.add(df2)
result

Unnamed: 0,A,B,C,D
0,,116,116,
1,,110,115,
2,,106,115,
3,,110,114,
4,,112,115,


In [83]:
result.shape

(5, 4)

In [85]:
result.size

20

In [88]:
# Row labels are zero-padded strings, not integers.
dataframe_index = ["001", "002", "003", "004", "005"]

# df is labelled by those strings; s keeps the default integer index 0..4,
# so s shares no label with df's rows or with its columns.
# The random values are not seeded here and differ on every run.
df = pd.DataFrame(np.random.randint(1, 10, size=(5, 3)), columns=["A", "B", "C"], index=dataframe_index)
s = pd.Series(np.random.rand(5))

In [89]:
df

Unnamed: 0,A,B,C
1,8,9,4
2,6,7,4
3,4,8,3
4,6,7,8
5,2,3,4


In [90]:
s

0    0.166165
1    0.771654
2    0.295425
3    0.050642
4    0.193502
dtype: float64

In [None]:
df + s 

Unnamed: 0,A,B,C,0,1,2,3,4
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,,,


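**Note:** By default, in an arithmetic operation between a pandas `DataFrame` and a pandas `Series`, the Series index is aligned with the DataFrame's columns, and the operation is broadcast *row-wise* (applied to every row). Labels that appear on only one side produce `NaN`, which is why the result above is entirely `NaN`: the Series index (0–4) shares no label with the columns (`A`, `B`, `C`).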

This broadcasting rule ensures that the Series is applied to every row of the DataFrame.

In [97]:
# A Series whose labels are "A", "B" and "C"; built from a dict, whose keys
# become the index in insertion order.
s = pd.Series({"A": 100, "B": 200, "C": 300})
s

A    100
B    200
C    300
dtype: int64

In [98]:
df

Unnamed: 0,A,B,C
1,8,9,4
2,6,7,4
3,4,8,3
4,6,7,8
5,2,3,4


In [99]:
df + s

Unnamed: 0,A,B,C
1,108,209,304
2,106,207,304
3,104,208,303
4,106,207,308
5,102,203,304


In [100]:
df

Unnamed: 0,A,B,C
1,8,9,4
2,6,7,4
3,4,8,3
4,6,7,8
5,2,3,4


In [101]:
s = pd.Series(range(100, 501, 100), index=dataframe_index)
s

001    100
002    200
003    300
004    400
005    500
dtype: int64

In [105]:
# Default axis = 1 (columns): the labels of s ("001".."005") are matched
# against df's column names ("A", "B", "C"). Nothing matches, so the result
# has the union of both label sets as columns and every cell is NaN.
df.add(s)

Unnamed: 0,001,002,003,004,005,A,B,C
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,,,


In [106]:
df.add(s, axis=0)

Unnamed: 0,A,B,C
1,108,109,104
2,206,207,204
3,304,308,303
4,406,407,408
5,502,503,504


In [107]:
df

Unnamed: 0,A,B,C
1,8,9,4
2,6,7,4
3,4,8,3
4,6,7,8
5,2,3,4


In [108]:
df * 2 + 10

Unnamed: 0,A,B,C
1,26,28,18
2,22,24,18
3,18,26,16
4,22,24,26
5,14,16,18


In [109]:
s

001    100
002    200
003    300
004    400
005    500
dtype: int64

In [111]:
df.sub(s, axis=0)

Unnamed: 0,A,B,C
1,-92,-91,-96
2,-194,-193,-196
3,-296,-292,-297
4,-394,-393,-392
5,-498,-497,-496


In [112]:
df.mul(s, axis=0)

Unnamed: 0,A,B,C
1,800,900,400
2,1200,1400,800
3,1200,2400,900
4,2400,2800,3200
5,1000,1500,2000


In [113]:
df.div(s, axis=0)

Unnamed: 0,A,B,C
1,0.08,0.09,0.04
2,0.03,0.035,0.02
3,0.013333,0.026667,0.01
4,0.015,0.0175,0.02
5,0.004,0.006,0.008


In [126]:
s = pd.Series([100, 1000, 10_000, 100_000], index=["A", "B", "C", "D"])
s

A       100
B      1000
C     10000
D    100000
dtype: int64

In [128]:
df.add(s)

Unnamed: 0,A,B,C,D
1,108,1009,10004,
2,106,1007,10004,
3,104,1008,10003,
4,106,1007,10008,
5,102,1003,10004,


In [130]:
# fill_value stands in for missing (NaN) entries before the operation is
# applied. df has no missing values here, so the output equals df + 50.
df.add(50, fill_value=0)

Unnamed: 0,A,B,C
1,58,59,54
2,56,57,54
3,54,58,53
4,56,57,58
5,52,53,54


In [131]:
# The integers 1..15 laid out row by row as a 5x3 frame.
grid_5x3 = np.arange(1, 16).reshape(5, 3)
df1 = pd.DataFrame(grid_5x3, columns=list("ABC"))
df1

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12
4,13,14,15


In [132]:
# The integers 100..119 laid out row by row as a 5x4 frame; compared with
# df1 it has one extra column, "D".
grid_5x4 = np.arange(100, 120).reshape(5, 4)
df2 = pd.DataFrame(grid_5x4, columns=list("ABCD"))
df2

Unnamed: 0,A,B,C,D
0,100,101,102,103
1,104,105,106,107
2,108,109,110,111
3,112,113,114,115
4,116,117,118,119


In [135]:
# df1 has no column "D". With fill_value, the entries missing on one side
# are replaced by that value before adding, so column "D" becomes
# df2["D"] + inf = inf instead of NaN. Columns A-C are ordinary sums.
df2.add(df1, fill_value=np.inf)

Unnamed: 0,A,B,C,D
0,101,103,105,inf
1,108,110,112,inf
2,115,117,119,inf
3,122,124,126,inf
4,129,131,133,inf


### Console Display

In [136]:
# 1000x1000 frame of standard-normal draws (not seeded, so values vary per
# run). It is large enough that the displayed repr truncates both rows and
# columns, which is what the display options below control.
df = pd.DataFrame(np.random.randn(1000, 1000))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.991879,0.874851,0.950536,1.287747,0.326310,0.191069,0.288222,0.857928,0.767517,0.462953,...,0.293199,0.092435,0.612169,-0.812460,-0.399426,0.792419,-0.467311,-1.176932,0.167859,0.493647
1,1.819350,1.009346,-0.656039,-1.341767,0.552026,-0.070262,0.199665,0.790976,-0.842196,-0.100474,...,-0.747004,0.794093,0.284129,0.451173,0.485443,-0.068828,0.169856,0.880682,-2.400604,0.869795
2,0.732412,-1.581780,-0.923615,-0.673044,1.233117,-1.576989,1.001618,-0.800838,-1.333279,-0.597843,...,-0.473832,1.481313,-1.183039,0.111611,1.285637,0.473336,2.885651,-1.061079,-0.284020,0.612616
3,-0.236321,-0.211016,0.944126,-1.925505,-0.543368,0.383373,-0.432767,1.648049,0.180435,-0.030337,...,-0.693748,-0.875773,-0.009462,0.771498,-0.161243,0.076176,-0.933030,-0.992993,1.031073,0.770983
4,0.395213,1.507455,0.207294,-3.305689,0.282515,2.249488,0.570761,0.717294,0.938501,2.621905,...,-0.533477,-0.535340,-1.005245,-2.221018,0.548965,0.873393,1.403321,1.956948,-1.370226,0.111587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.362687,-0.585817,1.158266,-0.177258,0.181629,-2.081454,0.059457,-1.765267,0.037604,-0.150256,...,0.059498,0.571480,-0.129340,0.474235,-0.446969,-0.741261,0.776279,-1.997188,-0.403538,-1.019410
996,0.631025,0.471517,-2.312338,0.660772,0.202602,-0.964890,-0.679588,-2.166669,-0.190128,-0.229582,...,1.557795,0.038424,0.295131,-0.567979,-1.310262,0.053198,0.691434,1.615699,0.809014,0.752793
997,0.174902,1.634874,0.287707,1.608340,0.095584,-0.987692,0.280817,0.813182,0.130869,-0.332491,...,0.723573,0.448889,-0.013825,-0.206990,-1.735961,-0.380931,0.688983,0.452865,-2.177394,-0.169244
998,-0.499539,0.866577,-0.673320,0.565141,0.806074,-0.228926,1.145043,0.897423,-0.452511,0.682637,...,-1.195021,-0.248735,-1.008151,1.904634,1.367236,1.252180,0.781682,-1.716089,1.223919,-0.704619


In [137]:
pd.set_option("display.min_rows", 5)

In [138]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,-0.991879,0.874851,0.950536,1.287747,0.326310,0.191069,0.288222,0.857928,0.767517,0.462953,...,0.293199,0.092435,0.612169,-0.812460,-0.399426,0.792419,-0.467311,-1.176932,0.167859,0.493647
1,1.819350,1.009346,-0.656039,-1.341767,0.552026,-0.070262,0.199665,0.790976,-0.842196,-0.100474,...,-0.747004,0.794093,0.284129,0.451173,0.485443,-0.068828,0.169856,0.880682,-2.400604,0.869795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998,-0.499539,0.866577,-0.673320,0.565141,0.806074,-0.228926,1.145043,0.897423,-0.452511,0.682637,...,-1.195021,-0.248735,-1.008151,1.904634,1.367236,1.252180,0.781682,-1.716089,1.223919,-0.704619
999,1.167373,-0.868417,-1.588745,-0.250087,-0.016893,0.743450,0.246447,0.364786,-0.662273,-0.412145,...,0.828822,0.452372,-1.019836,-0.249966,1.635898,-0.390835,0.260215,-0.655332,-0.068717,0.294950


In [141]:
pd.set_option("display.max_columns", 10)

In [142]:
df

Unnamed: 0,0,1,2,3,4,...,995,996,997,998,999
0,-0.991879,0.874851,0.950536,1.287747,0.326310,...,0.792419,-0.467311,-1.176932,0.167859,0.493647
1,1.819350,1.009346,-0.656039,-1.341767,0.552026,...,-0.068828,0.169856,0.880682,-2.400604,0.869795
...,...,...,...,...,...,...,...,...,...,...,...
998,-0.499539,0.866577,-0.673320,0.565141,0.806074,...,1.252180,0.781682,-1.716089,1.223919,-0.704619
999,1.167373,-0.868417,-1.588745,-0.250087,-0.016893,...,-0.390835,0.260215,-0.655332,-0.068717,0.294950


In [143]:
pd.set_option("display.precision", 2)

In [144]:
df

Unnamed: 0,0,1,2,3,4,...,995,996,997,998,999
0,-0.99,0.87,0.95,1.29,0.33,...,0.79,-0.47,-1.18,0.17,0.49
1,1.82,1.01,-0.66,-1.34,0.55,...,-0.07,0.17,0.88,-2.40,0.87
...,...,...,...,...,...,...,...,...,...,...,...
998,-0.50,0.87,-0.67,0.57,0.81,...,1.25,0.78,-1.72,1.22,-0.70
999,1.17,-0.87,-1.59,-0.25,-0.02,...,-0.39,0.26,-0.66,-0.07,0.29


In [145]:
# Restore the display options to their pandas defaults so that later cells
# render with the standard settings.
for option_name in (
    "display.precision",
    "display.max_columns",
    "display.min_rows",
    "display.max_rows",
):
    pd.reset_option(option_name)

### Hiding or Showing Index and Column Labels

In [147]:
# Small 5x3 frame holding 1..15, used by the to_string examples below.
df = pd.DataFrame(
    data=np.arange(1, 16).reshape(5, 3),
    columns=["A", "B", "C"],
)
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9
3,10,11,12
4,13,14,15


In [150]:
print(df.to_string(index=False)) # hiding the index label

 A  B  C
 1  2  3
 4  5  6
 7  8  9
10 11 12
13 14 15


In [151]:
print(df.to_string(header=False)) # hiding column names

0   1   2   3
1   4   5   6
2   7   8   9
3  10  11  12
4  13  14  15


In [152]:
print(df.to_string(header=False, index=False))

 1  2  3
 4  5  6
 7  8  9
10 11 12
13 14 15
