#### **Python Review**

We will begin this tutorial by analyzing the most fundamental **data structures** in the **field**. By **field** we mean artificial intelligence, machine learning, data science, data mining, and related areas.

Let us start with the **Pandas DataFrame**.


In [1]:
# Pandas is a library, so it must be imported before it can be used.
# For how imports work and why they are needed, visit: https://docs.python.org/3/reference/import.html
import pandas as pd
# Also import Numpy
import numpy as np

In [2]:
# DataFrame creation
# (a DataFrame is built from three pieces: the values, the row labels and the column labels)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Set all necessary info

data = [        # This is your tabular data, a 3 x 3 matrix, encoded as a list of lists
    [1, 2, 3],  # 1st list (row 0)
    [4, 5, 6],  # 2nd list (row 1)
    [7, 8, 9],  # 3rd list (row 2)
]

columns = ['a', 'b', 'c'] # Column labels: a list of letters or chars.

index = [1, 2, 3] # Row labels: a list of integer numbers.

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Print

print(f'This is data: {data}\n')

# Print 'data' row by row
# (enumerate yields the position and the row together, so no manual indexing is needed)
for i, row in enumerate(data):
    print(f'{row} ---> row {i} of data')
print('')

# Print 'index'
print(f'This is index: {index}\n')

# Print 'columns'
print(f'This is columns: {columns}\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dataframe creation

# Each inner list of 'data' becomes one row; 'index' labels the rows
# and 'columns' labels the columns.
df = pd.DataFrame(
    data = data,
    index = index,
    columns = columns,
)

print('This is the dataframe:')
display(df)  # 'display' is available as a builtin inside Jupyter/IPython

This is data: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

[1, 2, 3] ---> row 0 of data
[4, 5, 6] ---> row 1 of data
[7, 8, 9] ---> row 2 of data

This is index: [1, 2, 3]

This is columns: ['a', 'b', 'c']

This is the dataframe:


Unnamed: 0,a,b,c
1,1,2,3
2,4,5,6
3,7,8,9


In [3]:
# Dynamic DataFrame creation
# (the approaches below are meant to be efficient ones for each task)

# Parameters: change these to get a table of any shape
n_rows = 3
n_cols = 3

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dynamically create 'data'

# Consecutive integers 1..n_rows*n_cols, laid out row by row,
# then converted back to a plain list of lists.
data = np.arange(1, (n_rows * n_cols) + 1).reshape(n_rows, n_cols).tolist()
print(f'This is data: {data}\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dynamically create 'index'

# Alternative (integer labels 1..n_rows):
# index = np.arange(1,n_rows+1)
# One label per row: 'row a', 'row b', ...
index = ['row '+chr(ord('a')+i) for i in range(n_rows)]
print(f'This is index: {index}\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dynamically create 'columns'

# One label per COLUMN: 'a', 'b', ...
# (must be sized with n_cols, otherwise a non-square table cannot be built)
columns = [chr(ord('a')+i) for i in range(n_cols)]
print(f'This is columns: {columns}\n')

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
# Dataframe creation

df = pd.DataFrame(
    data = data,
    index = index,
    columns = columns,
)

print('This is the dataframe:')
display(df)


This is data: [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

This is index: ['row a', 'row b', 'row c']

This is columns: ['a', 'b', 'c']

This is the dataframe:


Unnamed: 0,a,b,c
row a,1,2,3
row b,4,5,6
row c,7,8,9


In [4]:
# One-shot code
# (compact version of the dynamic dataset creation: a 10 x 10 table)

n_rows = n_cols = 10

# Labels first: one per row and one per column
index = [f'row {r}' for r in range(n_rows)]
columns = [f'col {c}' for c in range(n_cols)]

# Values 1..n_rows*n_cols, filled row by row
flat_values = np.arange(1, n_rows * n_cols + 1)
data = flat_values.reshape(n_rows, n_cols).tolist()

df = pd.DataFrame(data=data, index=index, columns=columns)
display(df)

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


In [5]:
# Column-wise creation
# (builds the same table as the one-shot cell, but one column at a time)

n_rows = n_cols = 10
index = [f'row {i}' for i in range(n_rows)]    # one label per row
columns = [f'col {i}' for i in range(n_cols)]  # one label per column
# Alternative column labels:
# columns = ['col_'+chr(97+i) for i in range(n_cols)]

extra = 1                  # value stored in the top-left cell
n_cells = n_rows * n_cols  # total number of values in the table

df = pd.DataFrame()
for i, column in enumerate(columns):
    # Column i starts at extra+i and advances by n_cols per row,
    # which yields exactly n_rows values.
    df[column] = np.arange(extra + i, n_cells + extra, n_cols)

# The columns were assigned without labels for the rows; set them now.
df.index = index
display(df)

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


In [6]:
# Time measurement
# (we will not use this now, but you can use it to compare the efficiency of algorithms)

import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the wall clock, which can be adjusted.
start = time.perf_counter()
# Do something here...
end = time.perf_counter()
elapsed = end - start  # elapsed time in seconds
print(elapsed)

5.745887756347656e-05


In [None]:
# We will move on with this df
# (the 10 x 10 DataFrame with 'row i' / 'col i' labels built above):
df

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


Once you have a DataFrame, you are ready to use it as the **main object** of your experiments. **Everything** you do in the *field* revolves around the DataFrame object.

**Selection**

Selecting portions of a DataFrame is crucial!

You can do it this way:

In [8]:
# Select the entire df
# (.iloc selects by integer position: rows first, then columns)
df.iloc[
    :, # select all rows
    :, # select all cols
]

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


In [9]:
# ALTERNATIVE to select the entire df
# (a single slice applies to the rows only; every column is kept)
df.iloc[
    :
]

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


In [10]:
# Select an exact element
# (two integers return a single scalar value, not a Series or DataFrame)
df.iloc[
    0, # row to select (position 0 is the first row)
    0, # col to select (position 0 is the first column)
]

np.int64(1)

In [11]:
# Select a column (Series object)
# (a single integer for the column makes the result one-dimensional: a Series)
df.iloc[
    :, # all rows
    0, # one column position
]

row 0     1
row 1    11
row 2    21
row 3    31
row 4    41
row 5    51
row 6    61
row 7    71
row 8    81
row 9    91
Name: col 0, dtype: int64

In [12]:
# Select a column (DataFrame object)
# (a slice keeps the result two-dimensional, even when it covers one column)
df.iloc[
    :, # all rows
    0:1, # slice covering only position 0 (the end, 1, is excluded)
]

Unnamed: 0,col 0
row 0,1
row 1,11
row 2,21
row 3,31
row 4,41
row 5,51
row 6,61
row 7,71
row 8,81
row 9,91


In [None]:
# Select a row (Series object)
# (a single integer for the row makes the result a Series, labelled by the column names)
df.iloc[
    0, # one row position
    :, # all cols
]

col 0     1
col 1     2
col 2     3
col 3     4
col 4     5
col 5     6
col 6     7
col 7     8
col 8     9
col 9    10
Name: row 0, dtype: int64

In [14]:
# Select a row (DataFrame object)
# (a slice keeps the result two-dimensional, even when it covers one row)
df.iloc[
    0:1, # slice covering only position 0 (the end, 1, is excluded)
    :, # all cols
]

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10


In [None]:
# Select a portion of the df (DataFrame object)
# (slice ends are excluded, so this returns rows 4-6 and cols 2-5)
df.iloc[
    4:7, # rows to select
    2:6, # cols to select
]

Unnamed: 0,col 2,col 3,col 4,col 5
row 4,43,44,45,46
row 5,53,54,55,56
row 6,63,64,65,66


In [None]:
# Select a portion of the df (Series object)
# (one integer plus one slice gives a Series: part of a single row)
df.iloc[
    0, # row to select (a single position)
    2:6, # cols to select (positions 2-5; the end, 6, is excluded)
]

col 2    3
col 3    4
col 4    5
col 5    6
Name: row 0, dtype: int64

In [None]:
# Select each element, one by one

display(df)

# df.to_numpy() exposes the values as an array; iterating over it yields one row at a time.
for row in df.to_numpy():
    # Visit every value of the current row, left to right
    for element in row:
        print(element)
    # After its elements, print the whole row for comparison
    print(row)

Unnamed: 0,col 0,col 1,col 2,col 3,col 4,col 5,col 6,col 7,col 8,col 9
row 0,1,2,3,4,5,6,7,8,9,10
row 1,11,12,13,14,15,16,17,18,19,20
row 2,21,22,23,24,25,26,27,28,29,30
row 3,31,32,33,34,35,36,37,38,39,40
row 4,41,42,43,44,45,46,47,48,49,50
row 5,51,52,53,54,55,56,57,58,59,60
row 6,61,62,63,64,65,66,67,68,69,70
row 7,71,72,73,74,75,76,77,78,79,80
row 8,81,82,83,84,85,86,87,88,89,90
row 9,91,92,93,94,95,96,97,98,99,100


1
2
3
4
5
6
7
8
9
10
[ 1  2  3  4  5  6  7  8  9 10]
11
12
13
14
15
16
17
18
19
20
[11 12 13 14 15 16 17 18 19 20]
21
22
23
24
25
26
27
28
29
30
[21 22 23 24 25 26 27 28 29 30]
31
32
33
34
35
36
37
38
39
40
[31 32 33 34 35 36 37 38 39 40]
41
42
43
44
45
46
47
48
49
50
[41 42 43 44 45 46 47 48 49 50]
51
52
53
54
55
56
57
58
59
60
[51 52 53 54 55 56 57 58 59 60]
61
62
63
64
65
66
67
68
69
70
[61 62 63 64 65 66 67 68 69 70]
71
72
73
74
75
76
77
78
79
80
[71 72 73 74 75 76 77 78 79 80]
81
82
83
84
85
86
87
88
89
90
[81 82 83 84 85 86 87 88 89 90]
91
92
93
94
95
96
97
98
99
100
[ 91  92  93  94  95  96  97  98  99 100]
