In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import HTML

### Select Operation (σ) - selection of rows (tuples)

In [2]:
# Relation r: attributes A and B hold Greek labels, C and D hold integers.
data1 = dict(zip(
    ['𝐴', '𝐵', '𝐶', '𝐷'],
    [
        ['α', 'α', 'β', 'β'],
        ['α', 'β', 'β', 'β'],
        [1, 5, 12, 23],
        [7, 7, 3, 10],
    ],
))

# One DataFrame column per attribute
df1 = pd.DataFrame.from_dict(data1)

# Render as an HTML table; the pandas index is not part of the relation
HTML(df1.to_html(index=False))

𝐴,𝐵,𝐶,𝐷
α,α,1,7
α,β,5,7
β,β,12,3
β,β,23,10


- $\sigma_{A=B\wedge D>5}(r)$

In [3]:
# σ_{A=B ∧ D>5}(r): keep the rows where A equals B and D exceeds 5
selected_df_1 = df1.loc[lambda t: t['𝐴'].eq(t['𝐵']) & t['𝐷'].gt(5)]

# Render the selected rows without the index
HTML(selected_df_1.to_html(index=False))

𝐴,𝐵,𝐶,𝐷
α,α,1,7
β,β,23,10


- $ \sigma_{A=B\vee D>5}(r) $

In [4]:
# σ_{A=B ∨ D>5}(r): keep the rows where A equals B, or D exceeds 5, or both
selected_df_2 = df1.loc[lambda t: t['𝐴'].eq(t['𝐵']) | t['𝐷'].gt(5)]

# Render the selected rows without the index
HTML(selected_df_2.to_html(index=False))

𝐴,𝐵,𝐶,𝐷
α,α,1,7
α,β,5,7
β,β,12,3
β,β,23,10


### Project Operation ($\pi$) - selection of columns (attributes)

In [5]:
# Relation used for the projection examples: (A, C) repeats for the first two rows
relation2 = dict(zip(
    ['𝐴', '𝐵', '𝐶'],
    [
        ['α', 'α', 'β', 'β'],
        [10, 20, 30, 40],
        [1, 1, 1, 2],
    ],
))

# One DataFrame column per attribute
df2 = pd.DataFrame.from_dict(relation2)

# Render as an HTML table without the index
HTML(df2.to_html(index=False))

𝐴,𝐵,𝐶
α,10,1
α,20,1
β,30,1
β,40,2


- $\pi_{A,C}(r)$

In [6]:
# π_{A,C}(r): keep only columns A and C (pandas does not drop duplicate rows here)
projected_df_1 = df2.loc[:, ['𝐴', '𝐶']]

# Show the raw projection, duplicate rows included
HTML(projected_df_1.to_html(index=False))

𝐴,𝐶
α,1
α,1
β,1
β,2


In [7]:
# A relation is a set of tuples: keep only the first occurrence of each row
unique_projected_df_1 = projected_df_1.loc[~projected_df_1.duplicated(keep='first')]

# Show the de-duplicated projection without the index
HTML(unique_projected_df_1.to_html(index=False))


𝐴,𝐶
α,1
β,1
β,2


### Union of two relations

In [8]:
# Relations r and s share the schema (A, B), so they are union-compatible.
r = {'𝐴': ['α', 'α', 'β'], '𝐵': [1, 2, 1]}
s = {'𝐴': ['α', 'β'], '𝐵': [2, 3]}

# Build one DataFrame per relation
df_r, df_s = (pd.DataFrame.from_dict(rel) for rel in (r, s))

In [9]:
# Render relation r as an HTML table, omitting the pandas index
df_r.pipe(lambda rel: HTML(rel.to_html(index=False)))

𝐴,𝐵
α,1
α,2
β,1


In [10]:
# Render relation s as an HTML table, omitting the pandas index
df_s.pipe(lambda rel: HTML(rel.to_html(index=False)))

𝐴,𝐵
α,2
β,3


$r\cup s$

In [11]:
# r ∪ s: stack both relations, keep the first copy of each tuple, renumber the rows
union_df = (
    pd.concat((df_r, df_s))
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

In [12]:
# Render r ∪ s as an HTML table, omitting the pandas index
union_df.pipe(lambda rel: HTML(rel.to_html(index=False)))

𝐴,𝐵
α,1
α,2
β,1
β,3


### Set Difference of two relations

In [13]:
# Same two relations over schema (A, B) as in the union example.
r = {'𝐴': ['α', 'α', 'β'], '𝐵': [1, 2, 1]}
s = {'𝐴': ['α', 'β'], '𝐵': [2, 3]}

# Build one DataFrame per relation
df_r, df_s = (pd.DataFrame.from_dict(rel) for rel in (r, s))

$r-s$

In [14]:
# Set difference r - s: the tuples of r that do not occur in s.
# A left merge with indicator=True tags each r-tuple as "both" (also in s) or
# "left_only" (absent from s); keeping only "left_only" yields r - s.
# drop_duplicates()/reset_index() keep the result a proper set with a clean
# 0..n-1 index, the same way the union r ∪ s is built earlier in this notebook.
difference_df = (
    df_r.merge(df_s, on=['𝐴', '𝐵'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Display the set difference DataFrame without the index
print("Set Difference r - s:")
display(HTML(difference_df.to_html(index=False)))

Set Difference r - s:


𝐴,𝐵
α,1
β,1


In [15]:
import pandas as pd
from IPython.display import HTML

# Relations r and s over the shared schema (A, B)
r = {
    '𝐴': ['α', 'α', 'β'],
    '𝐵': [1, 2, 1]
}
s = {
    '𝐴': ['α', 'β'],
    '𝐵': [2, 3]
}

# Create one DataFrame per relation
df_r = pd.DataFrame(r)
df_s = pd.DataFrame(s)

# Display the original DataFrames without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

# Set difference r - s: the tuples of r that do not occur in s.
# A left merge with indicator=True tags each r-tuple as "both" (also in s) or
# "left_only" (absent from s); keeping only "left_only" yields r - s.
# drop_duplicates()/reset_index() keep the result a proper set with a clean
# 0..n-1 index, the same way the union r ∪ s is built earlier in this notebook.
difference_df = (
    df_r.merge(df_s, on=['𝐴', '𝐵'], how='left', indicator=True)
    .query('_merge == "left_only"')
    .drop(columns=['_merge'])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Display the set difference DataFrame without the index
print("Set Difference r - s:")
display(HTML(difference_df.to_html(index=False)))

DataFrame r:


𝐴,𝐵
α,1
α,2
β,1


DataFrame s:


𝐴,𝐵
α,2
β,3


Set Difference r - s:


𝐴,𝐵
α,1
β,1


### Set intersection of two relations

In [16]:
import pandas as pd
from IPython.display import HTML

# Relations r and s over the shared schema (A, B); only (α, 2) appears in both
r = {'𝐴': ['α', 'α', 'β'], '𝐵': [1, 2, 1]}
s = {'𝐴': ['α', 'β'], '𝐵': [2, 3]}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


𝐴,𝐵
α,1
α,2
β,1


DataFrame s:


𝐴,𝐵
α,2
β,3


### $r \cap s$

In [17]:
# r ∩ s: an inner merge on every attribute keeps the tuples present in both
# relations. The merge emits one row per matching pair, so a tuple repeated in
# either input would show up several times; drop_duplicates()/reset_index()
# restore set semantics and a clean 0..n-1 index, as done for the union r ∪ s.
intersection_df = (
    pd.merge(df_r, df_s, on=['𝐴', '𝐵'], how='inner')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Display the intersection DataFrame without the index
print("Intersection of r and s:")
display(HTML(intersection_df.to_html(index=False)))

Intersection of r and s:


𝐴,𝐵
α,2


**Note:** $r \cap s = r - (r - s)$

### Joining two relations - Cartesian Product

In [18]:
import pandas as pd
from IPython.display import HTML

# r has schema (A, B); s has schema (C, D, E) -- no attribute name is shared
r = {'𝐴': ['α', 'β'], '𝐵': [1, 2]}
s = {
    '𝐶': ['α', 'β', 'β', 'γ'],
    '𝐷': [10, 10, 20, 10],
    '𝐸': ['a', 'a', 'b', 'b'],
}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


𝐴,𝐵
α,1
β,2


DataFrame s:


𝐶,𝐷,𝐸
α,10,a
β,10,a
β,20,b
γ,10,b


### $r \times s$

In [19]:
# Cartesian product r × s: how='cross' pairs every row of r with every row of s.
# This replaces the constant-`key` merge trick, which overwrote and then dropped
# any real attribute that happened to be named `key`.
cartesian_product_df = pd.merge(df_r, df_s, how='cross')

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


𝐴,𝐵,𝐶,𝐷,𝐸
α,1,α,10,a
α,1,β,10,a
α,1,β,20,b
α,1,γ,10,b
β,2,α,10,a
β,2,β,10,a
β,2,β,20,b
β,2,γ,10,b


### Cartesian Product - naming issue

In [20]:
import pandas as pd
from IPython.display import HTML

# r has schema (A, B); s has schema (B, D, E) -- attribute B appears in both
r = {'𝐴': ['α', 'β'], '𝐵': [1, 2]}
s = {
    '𝐵': ['α', 'β', 'β', 'γ'],
    '𝐷': [10, 10, 20, 10],
    '𝐸': ['a', 'a', 'b', 'b'],
}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


𝐴,𝐵
α,1
β,2


DataFrame s:


𝐵,𝐷,𝐸
α,10,a
β,10,a
β,20,b
γ,10,b


### $r \times s$

In [21]:
# Cartesian product r × s: how='cross' pairs every row of r with every row of s.
# This replaces the constant-`key` merge trick, which overwrote and then dropped
# any real attribute that happened to be named `key`.
# Both relations have a column B, so pandas disambiguates them with its default
# suffixes: B_x comes from r and B_y comes from s.
cartesian_product_df = pd.merge(df_r, df_s, how='cross')

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


𝐴,𝐵_x,𝐵_y,𝐷,𝐸
α,1,α,10,a
α,1,β,10,a
α,1,β,20,b
α,1,γ,10,b
β,2,α,10,a
β,2,β,10,a
β,2,β,20,b
β,2,γ,10,b


In [22]:
# Cartesian product r × s: how='cross' pairs every row of r with every row of s
# (replaces the constant-`key` merge trick, which clobbered any real `key` column).
# The shared attribute B comes back as B_x (from r) and B_y (from s); rename those
# two by label so each is qualified by its source relation. Renaming by label,
# rather than assigning a positional list of names, cannot silently mislabel
# columns if the schemas change.
cartesian_product_df = (
    pd.merge(df_r, df_s, how='cross')
    .rename(columns={'𝐵_x': 'r.𝐵', '𝐵_y': 's.𝐵'})
)

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


𝐴,r.𝐵,s.𝐵,𝐷,𝐸
α,1,α,10,a
α,1,β,10,a
α,1,β,20,b
α,1,γ,10,b
β,2,α,10,a
β,2,β,10,a
β,2,β,20,b
β,2,γ,10,b


### Renaming a table
Allows us to refer to a relation (say $E$) by more than one name.
$$\rho_{X}(E) $$
returns the expression $E$ under the name $X$

In [23]:
import pandas as pd
from IPython.display import HTML

# Relation r over schema (A, B)
r = {'𝐴': ['α', 'β'], '𝐵': [1, 2]}

# Build the DataFrame for r
df_r = pd.DataFrame.from_dict(r)

# Show r as an HTML table without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

DataFrame r:


𝐴,𝐵
α,1
β,2


$r \times \rho_{s}(r)$

In [24]:
# ρ_s(r): s is a second, independent DataFrame built from the same data as r
df_s = pd.DataFrame.from_dict(r)

# Show s as an HTML table without the index
print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame s:


𝐴,𝐵
α,1
β,2


In [25]:
# r × ρ_s(r): qualify every attribute with its relation name first (r.A, r.B and
# s.A, s.B), then pair every row of r with every row of s via how='cross'.
# Prefixing by relation replaces both the constant-`key` merge trick (which
# clobbered any real `key` column) and the positional list of column names
# (which could silently mislabel columns if the schemas change).
cartesian_product_df = pd.merge(
    df_r.add_prefix('r.'),
    df_s.add_prefix('s.'),
    how='cross',
)

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


r.𝐴,r.𝐵,s.𝐴,s.𝐵
α,1,α,1
α,1,β,2
β,2,α,1
β,2,β,2


### Composition of Operations

In [26]:
import pandas as pd
from IPython.display import HTML

# Relation r over schema (A, B)
r = {
    '𝐴': ['α', 'β'],
    '𝐵': [1, 2]
}

# Create DataFrame for r
df_r = pd.DataFrame(r)

# Relation s over schema (C, D, E)
s = {
    '𝐶': ['α', 'β', 'β', 'γ'],
    '𝐷': [10, 10, 20, 10],
    '𝐸': ['a', 'a', 'b', 'b']
}

# Create DataFrame for s
df_s = pd.DataFrame(s)

# Cartesian product r × s: how='cross' pairs every row of r with every row of s.
# This replaces the constant-`key` merge trick, which overwrote and then dropped
# any real attribute that happened to be named `key`.
cartesian_product_df = pd.merge(df_r, df_s, how='cross')

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


𝐴,𝐵,𝐶,𝐷,𝐸
α,1,α,10,a
α,1,β,10,a
α,1,β,20,b
α,1,γ,10,b
β,2,α,10,a
β,2,β,10,a
β,2,β,20,b
β,2,γ,10,b


$\sigma_{A=C}(r \times s)$

In [27]:
# Short alias for the product computed in the previous cell
cp_df = cartesian_product_df

# σ_{A=C}(r × s): keep the pairs whose A (from r) equals C (from s)
ans = cp_df.loc[cp_df['𝐴'].eq(cp_df['𝐶'])]

# Render the selected rows without the index
HTML(ans.to_html(index=False))

𝐴,𝐵,𝐶,𝐷,𝐸
α,1,α,10,a
β,2,β,10,a
β,2,β,20,b


### Joining two relations - Natural Join

- Let $r$ and $s$ be relations on schemas $R$ and $S$ respectively. Then, the "**natural join**" of relations $r$ and $s$ is a relation on schema $R \cup S$ obtained as follows:
  - Consider each pair of tuples $t_{r}$ from $r$ and $t_{s}$ from $s$
  - If $t_{r}$ and $t_{s}$ have the same value on each of the attributes in $R \cap S$, add a tuple $t$ to the result, where
    - $t$ has the same value as $t_{r}$ on the attributes of $R$
    - $t$ has the same value as $t_{s}$ on the attributes of $S$

In [28]:
import pandas as pd
from IPython.display import HTML

# r has schema (A, B, C, D); s has schema (B, D, E) -- B and D are shared
r = {
    'A': ['α', 'β', 'γ', 'α', 'δ'],
    'B': [1, 2, 4, 1, 2],
    'C': ['α', 'γ', 'β', 'γ', 'β'],
    'D': ['a', 'a', 'b', 'a', 'b'],
}
s = {
    'B': [1, 3, 1, 2, 3],
    'D': ['a', 'a', 'a', 'b', 'b'],
    'E': ['α', 'β', 'γ', 'δ', 'ε'],
}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


A,B,C,D
α,1,α,a
β,2,γ,a
γ,4,β,b
α,1,γ,a
δ,2,β,b


DataFrame s:


B,D,E
1,a,α
3,a,β
1,a,γ
2,b,δ
3,b,ε


### $r \bowtie s$

### $\pi_{A,r.B,C,r.D,E}(\sigma_{r.B=s.B \wedge r.D=s.D}(r \times s))$

In [29]:
# r ⋈ s: inner join on the attributes common to both schemas (B and D);
# each shared attribute appears once in the result
natural_join_df = df_r.merge(df_s, how='inner', on=['B', 'D'])

# Render the joined relation without the index
HTML(natural_join_df.to_html(index=False))

A,B,C,D,E
α,1,α,a,α
α,1,γ,a,α
α,1,γ,a,γ
α,1,α,a,γ
δ,2,β,b,δ


### Aggregation Operators

In [30]:
import pandas as pd
from IPython.display import HTML

# Relation r over schema (A, B); B is the numeric attribute to aggregate
r = {'A': ['a', 'b', 'c'], 'B': [5, 2, 3]}

# Build the DataFrame for r
df_r = pd.DataFrame.from_dict(r)

# Show r as an HTML table without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

DataFrame r:


A,B
a,5
b,2
c,3


### $\text{SUM}_{B}(\sigma_{B>2}(r))$

In [31]:
# σ_{B>2}(r): keep the rows whose B exceeds 2
selected_df = df_r.loc[df_r['B'].gt(2)]

# SUM_B over the selected rows
sum_B = selected_df.loc[:, 'B'].sum()

# Print the aggregate value
print(sum_B)

8


# AQ2.2

## Question 2

In [32]:
import pandas as pd
from IPython.display import HTML

# Relation r for Question 2, schema (A, B, C, D), all integer attributes
r = {
    'A': [1, 3, 1, 3],
    'B': [1, 4, 1, 2],
    'C': [3, 5, 2, 5],
    'D': [4, 1, 3, 1],
}

# Build the DataFrame for r
df_r = pd.DataFrame.from_dict(r)

# Show r as an HTML table without the index
print("Relation r:")
display(HTML(df_r.to_html(index=False)))

Relation r:


A,B,C,D
1,1,3,4
3,4,5,1
1,1,2,3
3,2,5,1


What will be the output of the following relational operation on r?
### $ \sigma_{\neg(A=B\vee C > 2)}(r) $

## Solution

To find the output of the relational operation $\sigma_{\neg(A=B \vee C > 2)}(r)$ on the given relation $r$, we need to filter the rows of the DataFrame where the condition $\neg(A=B \vee C > 2)$ holds true.

### Breaking Down the Condition

- $ A = B $: This means the values in column `A` should be equal to the values in column `B`.
- $ C > 2 $: This means the values in column `C` should be greater than 2.
- $ A = B \vee C > 2 $: This means either $ A = B $ or $ C > 2 $ (or both) should be true.
- $ \neg(A = B \vee C > 2) $: This negates the previous condition, meaning neither $ A = B $ nor $ C > 2 $ should be true.


### Explanation

- **Step 1**: Create the DataFrame `df_r`.
- **Step 2**: Apply the condition `~((df_r['A'] == df_r['B']) | (df_r['C'] > 2))`:
  - `df_r['A'] == df_r['B']` creates a boolean series where each element is `True` if the corresponding elements in columns `A` and `B` are equal, and `False` otherwise.
  - `df_r['C'] > 2` creates a boolean series where each element is `True` if the corresponding element in column `C` is greater than 2, and `False` otherwise.
  - The `|` operator performs an element-wise OR operation between these two boolean series.
  - The `~` operator negates the resulting boolean series.
- **Step 3**: Filter the DataFrame using this boolean series to keep only the rows where the condition holds true.
- **Step 4**: Display the resulting DataFrame.

### Expected Output

The output will be the rows of the DataFrame where neither `A = B` nor `C > 2` is true. Let's break it down for the given data:

- For the first row (index 0): $ A = 1, B = 1, C = 3 $. Here, $ A = B $ is true and $ C > 2 $ is true.
- For the second row (index 1): $ A = 3, B = 4, C = 5 $. Here, $ A = B $ is false but $ C > 2 $ is true.
- For the third row (index 2): $ A = 1, B = 1, C = 2 $. Here, $ A = B $ is true but $ C > 2 $ is false.
- For the fourth row (index 3): $ A = 3, B = 2, C = 5 $. Here, $ A = B $ is false but $ C > 2 $ is true.

None of these rows satisfy the condition $ \neg(A = B \vee C > 2) $, so the resulting DataFrame will be empty.

In [33]:
# σ_{¬(A=B ∨ C>2)}(r): keep the rows where the disjunction "A equals B or C
# exceeds 2" is false
filtered_df = df_r.loc[lambda t: ~(t['A'].eq(t['B']) | t['C'].gt(2))]

# Render the (possibly empty) result without the index
display(HTML(filtered_df.to_html(index=False)))

A,B,C,D


## Question 3

In [34]:
import pandas as pd
from IPython.display import HTML

# Relation r for Question 3, schema (A, B, C, D), all integer attributes
r = {
    'A': [1, 3, 1, 3],
    'B': [1, 4, 1, 2],
    'C': [3, 5, 2, 5],
    'D': [4, 1, 3, 1],
}

# Build the DataFrame for r
df_r = pd.DataFrame.from_dict(r)

# Show r as an HTML table without the index
print("Relation r:")
display(HTML(df_r.to_html(index=False)))

Relation r:


A,B,C,D
1,1,3,4
3,4,5,1
1,1,2,3
3,2,5,1


How many rows will be there in the output of the following relational operation on r?
### $ \pi_{A,D}(r) $

In [35]:
# π_{A,D}(r): keep columns A and D, then drop duplicate rows (set semantics)
ans = df_r.loc[:, ['A', 'D']].drop_duplicates()

# Render the de-duplicated projection without the index
HTML(ans.to_html(index=False))

A,D
1,4
3,1
1,3


## Solution: 3

## Question 6

In [36]:
import pandas as pd
from IPython.display import HTML

# Relations r and s for Question 6: same attribute names (A, B, C), three rows each
r = {
    'A': [0, 2, 1],
    'B': [9, 2, 1],
    'C': [0, 2, 1],
}
s = {
    'A': [0, 2, 1],
    'B': ['Cat', 'Dog', 'Lion'],
    'C': [0, 2, 1],
}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


A,B,C
0,9,0
2,2,2
1,1,1


DataFrame s:


A,B,C
0,Cat,0
2,Dog,2
1,Lion,1


How many columns and rows will be there in the Cartesian product of the above two relations?

## Solution

In [37]:
# r × s: qualify every attribute with its relation name first (r.A, r.B, r.C and
# s.A, s.B, s.C), then pair every row of r with every row of s via how='cross'.
# Prefixing by relation replaces both the constant-`key` merge trick (which
# clobbered any real `key` column) and the positional list of column names
# (which could silently mislabel columns if the schemas change).
cartesian_product_df = pd.merge(
    df_r.add_prefix('r.'),
    df_s.add_prefix('s.'),
    how='cross',
)

# Display the Cartesian product DataFrame without the index
print("Cartesian Product of r and s:")
display(HTML(cartesian_product_df.to_html(index=False)))

Cartesian Product of r and s:


r.A,r.B,r.C,s.A,s.B,s.C
0,9,0,0,Cat,0
0,9,0,2,Dog,2
0,9,0,1,Lion,1
2,2,2,0,Cat,0
2,2,2,2,Dog,2
2,2,2,1,Lion,1
1,1,1,0,Cat,0
1,1,1,2,Dog,2
1,1,1,1,Lion,1


In [38]:
# (number of rows, number of columns) of the Cartesian product
(len(cartesian_product_df.index), len(cartesian_product_df.columns))

(9, 6)

9 rows and 6 columns.

## Question 8

In [39]:
import pandas as pd
from IPython.display import HTML

# Relations for Question 8: r has schema (A, B, C), s has schema (X, Y, A, C);
# the shared attributes are A and C
r = {
    'A': ['a1', 'a2', 'a3', 'a1', 'a2', 'a4'],
    'B': ['b1', 'b2', 'b1', 'b3', 'b4', 'b1'],
    'C': ['c1', 'c1', 'c3', 'c2', 'c2', 'a3'],
}
s = {
    'X': ['x1', 'x2', 'x3', 'x3', 'x1'],
    'Y': ['y1', 'y2', 'y3', 'y3', 'y4'],
    'A': ['a1', 'a1', 'a2', 'a3', 'a3'],
    'C': ['c1', 'c2', 'c2', 'c2', 'c3'],
}

# Build one DataFrame per relation
df_r = pd.DataFrame.from_dict(r)
df_s = pd.DataFrame.from_dict(s)

# Show both inputs as HTML tables without the index
print("DataFrame r:")
display(HTML(df_r.to_html(index=False)))

print("DataFrame s:")
display(HTML(df_s.to_html(index=False)))

DataFrame r:


A,B,C
a1,b1,c1
a2,b2,c1
a3,b1,c3
a1,b3,c2
a2,b4,c2
a4,b1,a3


DataFrame s:


X,Y,A,C
x1,y1,a1,c1
x2,y2,a1,c2
x3,y3,a2,c2
x3,y3,a3,c2
x1,y4,a3,c3


How many rows will be in the result of the relational operation $r \bowtie s$?

## Solution

In [40]:
# r ⋈ s: inner join on the attributes common to both schemas (A and C)
natural_join_df = df_r.merge(df_s, how='inner', on=['A', 'C'])

# Show the joined relation without the index
print("Natural Join of r and s:")
display(HTML(natural_join_df.to_html(index=False)))

# The answer to the question is the number of tuples in the join
num_rows = len(natural_join_df.index)
print("Number of rows selected by r ⨝ s:", num_rows)

Natural Join of r and s:


A,B,C,X,Y
a1,b1,c1,x1,y1
a3,b1,c3,x1,y4
a1,b3,c2,x2,y2
a2,b4,c2,x3,y3


Number of rows selected by r ⨝ s: 4
