In [2]:
from random import randint, choice

# Map Reduce

In this part of the assignment, you are asked to use the MapReduce paradigm to solve the following exercises.

**NOTE THAT**: **A solution that does not use map reduce is not valid!**

# Exercise 1

You have a list of dictionaries, each representing a student with the following properties: a name and an array of test scores. Your task is to use map, filter, and reduce to calculate the average test score for each student, and then return a list of dictionaries containing only the students whose average score is above 90.

In [None]:
# Example input: one dict per student, holding a "name" and a list of test "scores".
students = [
    {"name": "Alice", "scores": [95, 92, 88, 100]},
    {"name": "Bob", "scores": [78, 81, 85, 80]},
    {"name": "Charlie", "scores": [99, 91, 94, 96]},
    {"name": "Diana", "scores": [85, 87, 89, 83]}
]

Use `map`, `reduce`, and `filter` to produce an output like:

In [None]:
# Expected result for the example `students` list: only the students whose
# average score is above 90 are kept (Bob averages 81.0 and Diana 86.0).
[
    {"name": "Alice", "average_score": 93.75},
    {"name": "Charlie", "average_score": 95.0}
]

[{'name': 'Alice', 'average_score': 93.75},
 {'name': 'Charlie', 'average_score': 95.0}]

### Test
Test your solution using the dataset generated by the following function.

In [3]:
def generate_random_student_dataset(num_students=50):
    """Build a random list of student records for testing.

    Each record is a dict with a "name" ("Student 1" ... "Student N") and a
    "scores" list holding between 3 and 6 random integers in [50, 100].

    Args:
        num_students: number of records to generate.

    Returns:
        A list of ``num_students`` dicts.
    """
    dataset = []
    for index in range(1, num_students + 1):
        score_count = randint(3, 6)  # how many tests this student took
        scores = [randint(50, 100) for _ in range(score_count)]  # Random scores between 50 and 100
        dataset.append({"name": f"Student {index}", "scores": scores})
    return dataset

# Generate the 50-student test dataset and preview its first three records.
random_student_dataset = generate_random_student_dataset(50)
random_student_dataset[:3]

[{'name': 'Student 1', 'scores': [69, 50, 98, 54]},
 {'name': 'Student 2', 'scores': [69, 74, 71]},
 {'name': 'Student 3', 'scores': [89, 60, 88, 56, 63]}]

In [None]:
from functools import reduce

# Example input: one dict per student with a "name" and a list of "scores".
students = [
    {"name": "Alice", "scores": [95, 92, 88, 100]},
    {"name": "Bob", "scores": [78, 81, 85, 80]},
    {"name": "Charlie", "scores": [99, 91, 94, 96]},
    {"name": "Diana", "scores": [85, 87, 89, 83]}
]


def compute_average_scores(student_records):
    """Return one ``{"name", "average_score"}`` dict per student (map + reduce).

    Args:
        student_records: iterable of dicts with a "name" and a "scores" list.

    Returns:
        A list of dicts in input order. Students with an empty "scores" list
        are skipped, because no average can be computed for them.
    """
    # Drop students without scores first: reduce() without an initial value
    # and the division below would both fail on an empty list.
    graded = filter(lambda student: student["scores"], student_records)
    return list(map(
        lambda student: {
            "name": student["name"],
            "average_score": reduce(lambda a, b: a + b, student["scores"]) / len(student["scores"])
        },
        graded
    ))


def select_top_students(averaged_records, threshold=90):
    """Keep the records whose "average_score" is strictly above ``threshold`` (filter)."""
    return list(filter(
        lambda record: record["average_score"] > threshold,
        averaged_records
    ))


# Step 1: Calculate the average score for each student
students_with_averages = compute_average_scores(students)

# Step 2: Filter out students with an average score <= 90
top_students = select_top_students(students_with_averages, threshold=90)

# Print the result
# (the same two functions can be applied to any list of student records,
# e.g. the randomly generated test dataset)
print(top_students)

# Exercise 2

You have a list of dictionaries, each representing a product with the following properties: name, price, and category. Using the functions `map`, `filter`, and `reduce`, calculate the average price of the products in each category and return a list of dictionaries containing only the categories where the average price exceeds 50.

Example input:

In [None]:
# Example input: one dict per product, holding its "name", "price" and "category".
products = [
    {"name": "Product A", "price": 60, "category": "Electronics"},
    {"name": "Product B", "price": 40, "category": "Electronics"},
    {"name": "Product C", "price": 70, "category": "Home"},
    {"name": "Product D", "price": 30, "category": "Home"},
    {"name": "Product E", "price": 90, "category": "Sports"}
]

Use `map`, `reduce`, and `filter` to produce an output like:

In [None]:
# Expected result for the example `products` list. Electronics (60, 40) and
# Home (70, 30) both average exactly 50.0, which does not exceed 50, so only
# Sports qualifies.
[
    {"category": "Sports", "average_price": 90.0}
]

[{'category': 'Sports', 'average_price': 90.0}]

### Test
Test your solution using the dataset generated by the following function.

In [4]:
def generate_random_product_dataset(num_products=100):
    """Build a random list of product records for testing.

    Each record is a dict with a "name" ("Product 1" ... "Product N"), an
    integer "price" in [10, 200] and a "category" picked at random from a
    fixed list of six categories.

    Args:
        num_products: number of records to generate.

    Returns:
        A list of ``num_products`` dicts.
    """
    categories = ["Electronics", "Home", "Sports", "Books", "Clothing", "Toys"]

    def make_product(index):
        # Draw the price before the category so the random stream is consumed
        # in the same order as the dict fields are listed.
        price = randint(10, 200)  # Random price between 10 and 200
        category = choice(categories)  # Randomly choose a category
        return {"name": f"Product {index}", "price": price, "category": category}

    return list(map(make_product, range(1, num_products + 1)))

# Example of using the function
random_dataset = generate_random_product_dataset(100)
random_dataset[:5]  # Display the first 5 entries to check the dataset structure


[{'name': 'Product 1', 'price': 112, 'category': 'Home'},
 {'name': 'Product 2', 'price': 139, 'category': 'Clothing'},
 {'name': 'Product 3', 'price': 168, 'category': 'Electronics'},
 {'name': 'Product 4', 'price': 137, 'category': 'Books'},
 {'name': 'Product 5', 'price': 97, 'category': 'Clothing'}]

In [None]:
from functools import reduce
from collections import defaultdict

# Input dataset
products = [
    {"name": "Product A", "price": 60, "category": "Electronics"},
    {"name": "Product B", "price": 40, "category": "Electronics"},
    {"name": "Product C", "price": 70, "category": "Home"},
    {"name": "Product D", "price": 30, "category": "Home"},
    {"name": "Product E", "price": 90, "category": "Sports"}
]


def group_prices_by_category(product_records):
    """Group product prices by category using map and reduce.

    Args:
        product_records: iterable of dicts with a "price" and a "category".

    Returns:
        A mapping ``{category: [price, ...]}``; categories appear in order of
        first occurrence and prices keep their input order.
    """
    # Map: project every product onto a (category, price) pair.
    pairs = map(lambda product: (product["category"], product["price"]), product_records)

    def add_price(groups, pair):
        category, price = pair
        groups[category].append(price)
        return groups

    # Reduce: fold the pairs into the grouping. The accumulator is a fresh
    # defaultdict created for this call, so updating it in place is safe.
    return reduce(add_price, pairs, defaultdict(list))


def average_price_per_category(price_groups):
    """Return one ``{"category", "average_price"}`` dict per group (map + reduce)."""
    return list(map(
        lambda item: {
            "category": item[0],
            "average_price": reduce(lambda a, b: a + b, item[1]) / len(item[1])
        },
        price_groups.items()
    ))


def keep_categories_above(category_records, min_average=50):
    """Keep the records whose "average_price" is strictly above ``min_average`` (filter)."""
    return list(filter(
        lambda record: record["average_price"] > min_average,
        category_records
    ))


# Step 1: Group products by category (map to key/value pairs, then reduce)
category_groups = group_prices_by_category(products)

# Step 2: Calculate the average price for each category using map and reduce
category_averages = average_price_per_category(category_groups)

# Step 3: Filter categories where the average price is greater than 50
filtered_categories = keep_categories_above(category_averages, min_average=50)

# Print the result
# (the same three functions can be applied to any list of product records,
# e.g. the randomly generated test dataset)
print(filtered_categories)

# Exercise 3

You have a list of dictionaries, each representing an employee with the following properties: name, salary, and department. Your task is to use `map`, `filter`, and `reduce` to calculate the average salary for each department and return a list of dictionaries containing only the departments where the average salary is above 65,000.

**Example Input**

In [5]:
# Example input: one dict per employee, holding a "name", a "salary" and a "department".
employees = [
    {"name": "John", "salary": 70000, "department": "Engineering"},
    {"name": "Jane", "salary": 75000, "department": "Engineering"},
    {"name": "Alice", "salary": 60000, "department": "HR"},
    {"name": "Bob", "salary": 68000, "department": "HR"},
    {"name": "Charlie", "salary": 90000, "department": "Marketing"},
    {"name": "Diana", "salary": 50000, "department": "Marketing"}
]

Use `map`, `reduce`, and `filter` to produce an output like:

In [None]:
# Expected result for the example `employees` list: HR averages 64000.0,
# which is not above 65,000, so it is left out.
[
    {"department": "Engineering", "average_salary": 72500.0},
    {"department": "Marketing", "average_salary": 70000.0}
]

[{'department': 'Engineering', 'average_salary': 72500.0},
 {'department': 'Marketing', 'average_salary': 70000.0}]

### Test

Test your solution using the dataset generated by the following function.

In [6]:
def generate_random_employee_dataset(num_employees=50):
    """Build a random list of employee records for testing.

    Each record is a dict with a "name" ("Employee 1" ... "Employee N"), an
    integer "salary" in [40000, 120000] and a "department" picked at random
    from a fixed list of six departments.

    Args:
        num_employees: number of records to generate.

    Returns:
        A list of ``num_employees`` dicts.
    """
    departments = ["Engineering", "HR", "Marketing", "Sales", "Finance", "IT"]
    dataset = []
    for index in range(1, num_employees + 1):
        salary = randint(40000, 120000)
        department = choice(departments)  # Randomly choose a department
        dataset.append({"name": f"Employee {index}", "salary": salary, "department": department})
    return dataset

# Generate the 50-employee test dataset.
random_employee_dataset = generate_random_employee_dataset(50)

random_employee_dataset[:3]  # Preview the first 3 employee records


[{'name': 'Employee 1', 'salary': 40737, 'department': 'Engineering'},
 {'name': 'Employee 2', 'salary': 77275, 'department': 'Engineering'},
 {'name': 'Employee 3', 'salary': 107774, 'department': 'IT'}]

In [None]:
from functools import reduce
from collections import defaultdict

# Input dataset
employees = [
    {"name": "John", "salary": 70000, "department": "Engineering"},
    {"name": "Jane", "salary": 75000, "department": "Engineering"},
    {"name": "Alice", "salary": 60000, "department": "HR"},
    {"name": "Bob", "salary": 68000, "department": "HR"},
    {"name": "Charlie", "salary": 90000, "department": "Marketing"},
    {"name": "Diana", "salary": 50000, "department": "Marketing"}
]


def group_salaries_by_department(employee_records):
    """Group employee salaries by department using map and reduce.

    Args:
        employee_records: iterable of dicts with a "salary" and a "department".

    Returns:
        A mapping ``{department: [salary, ...]}``; departments appear in order
        of first occurrence and salaries keep their input order.
    """
    # Map: project every employee onto a (department, salary) pair.
    pairs = map(lambda employee: (employee["department"], employee["salary"]), employee_records)

    def add_salary(groups, pair):
        department, salary = pair
        groups[department].append(salary)
        return groups

    # Reduce: fold the pairs into the grouping. The accumulator is a fresh
    # defaultdict created for this call, so updating it in place is safe.
    return reduce(add_salary, pairs, defaultdict(list))


def average_salary_per_department(salary_groups):
    """Return one ``{"department", "average_salary"}`` dict per group (map + reduce)."""
    return list(map(
        lambda item: {
            "department": item[0],
            "average_salary": reduce(lambda a, b: a + b, item[1]) / len(item[1])
        },
        salary_groups.items()
    ))


def keep_departments_above(department_records, min_average=65000):
    """Keep the records whose "average_salary" is strictly above ``min_average`` (filter)."""
    return list(filter(
        lambda record: record["average_salary"] > min_average,
        department_records
    ))


# Step 1: Group employees by department (map to key/value pairs, then reduce)
department_groups = group_salaries_by_department(employees)

# Step 2: Calculate the average salary for each department using map and reduce
department_averages = average_salary_per_department(department_groups)

# Step 3: Filter departments where the average salary is greater than 65,000
filtered_departments = keep_departments_above(department_averages, min_average=65000)

# Print the result
# (the same three functions can be applied to any list of employee records,
# e.g. the randomly generated test dataset)
print(filtered_departments)

# Biopython

Write the following five functions to analyze global alignments between two sequences using Biopython's `pairwise2` module:

1. **countMatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment (pairwise2.globalxx) of the same length. It returns the number of positions where the elements of both sequences match.

2. **countMismatches(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of positions where the elements of the two sequences are different (i.e., they are not gaps, and the characters do not match).

3. **countGapOpens(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap openings in the alignment (a gap is opened at the first '-' of each run of consecutive '-' characters in a sequence).

4. **countGapExtensions(s1, s2)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment of the same length. It returns the number of gap extensions (where '-' continues in the alignment after an initial gap is opened).

5. **getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty)**  
   This function takes two sequences (`s1`, `s2`) aligned using global alignment and returns the alignment score based on the provided scoring scheme: `matchScore` for matches, `mismatchPenalty` for mismatches, `gapOpenPenalty` for opening a gap, and `gapExtensionPenalty` for extending a gap.

In [None]:
from Bio import pairwise2

# Example sequences
s1 = "PLEASANTLY"
s2 = "MEANLY"

# Align the sequences globally using Biopython's pairwise2
alignments = pairwise2.align.globalxx(s1, s2)
# Get the best alignment (first one)
best_alignment = alignments[0]
# Each alignment unpacks into the two gapped sequences, the alignment score
# and the begin/end positions.
aligned_s1, aligned_s2, score, begin, end = best_alignment

# Now use the functions on the aligned sequences
# NOTE(review): countMatches, countMismatches, countGapOpens,
# countGapExtensions and getScore are defined in a later cell of this
# notebook; run that cell first, otherwise the calls below raise NameError
# on a fresh kernel.
print("Aligned Sequences:", aligned_s1, aligned_s2)
print("Count Matches:", countMatches(aligned_s1, aligned_s2))
print("Count Mismatches:", countMismatches(aligned_s1, aligned_s2))
print("Count Gap Opens:", countGapOpens(aligned_s1, aligned_s2))
print("Count Gap Extensions:", countGapExtensions(aligned_s1, aligned_s2))

# Define scoring parameters
# (penalties are expressed as negative numbers that getScore adds to the total)
matchScore = 1
mismatchPenalty = -1
gapOpenPenalty = -2
gapExtensionPenalty = -1

print("Alignment Score:", getScore(aligned_s1, aligned_s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty))


### Test
Align the sequences of the [Interleukin-12](https://en.wikipedia.org/wiki/Interleukin_12) chain A (denoted as `s1`) from the file [`IL12A.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12A.fasta) and the Interleukin-12 chain B (denoted as `s2`) from the file [`IL12B.fasta`](https://qcbsciprolab2020.readthedocs.io/en/latest/file_samples/IL12B.fasta) and check the score as computed from pairwise2 and from your functions.

In [None]:
from Bio import pairwise2

def countMatches(s1, s2):
    """Count the aligned columns in which both sequences carry the same residue.

    Args:
        s1, s2: gapped sequences of equal length, with '-' marking a gap.

    Returns:
        The number of positions where the two characters are identical and
        are not gaps.
    """
    # A column holding '-' in both rows is not a residue match, so gap
    # characters are excluded (mirrors the gap handling in countMismatches).
    return sum(1 for a, b in zip(s1, s2) if a == b and a != '-')

def countMismatches(s1, s2):
    """Count the aligned columns holding two different residues.

    Args:
        s1, s2: gapped sequences of equal length, with '-' marking a gap.

    Returns:
        The number of positions where neither character is a gap and the two
        characters differ.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left == '-' or right == '-':
            continue  # gap columns are neither matches nor mismatches
        if left != right:
            mismatches += 1
    return mismatches

def countGapOpens(s1, s2):
    """Count the gap openings in a pairwise alignment.

    A gap opens at every '-' that starts a run of consecutive '-' characters
    within one sequence, i.e. a '-' in the first column or a '-' preceded by
    a residue in the same sequence. Gaps are tracked per sequence, so a gap
    in ``s2`` that directly follows a gap in ``s1`` counts as a new opening.

    Args:
        s1, s2: gapped sequences of equal length, with '-' marking a gap.

    Returns:
        The total number of gap openings over both sequences.
    """
    gap_opens = 0
    for sequence in (s1, s2):
        previous = None  # no previous column yet, so a leading '-' opens a gap
        for symbol in sequence:
            if symbol == '-' and previous != '-':
                gap_opens += 1
            previous = symbol
    return gap_opens

def countGapExtensions(s1, s2):
    """Count the gap extensions in a pairwise alignment.

    A gap extension is a '-' that directly follows another '-' in the same
    sequence; the first '-' of each run is a gap opening and is not counted
    here. A run of n gap characters therefore contributes n - 1 extensions.

    Args:
        s1, s2: gapped sequences of equal length, with '-' marking a gap.

    Returns:
        The total number of gap extensions over both sequences.
    """
    gap_extensions = 0
    for sequence in (s1, s2):
        previous = None
        for symbol in sequence:
            if symbol == '-' and previous == '-':
                gap_extensions += 1
            previous = symbol
    return gap_extensions

def getScore(s1, s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty):
    """Score a pairwise alignment under an affine gap scheme.

    Every column adds one term to the score: ``matchScore`` for identical
    residues, ``mismatchPenalty`` for different residues, ``gapOpenPenalty``
    for the first '-' of a gap and ``gapExtensionPenalty`` for each further
    '-' of the same gap. Penalties are added as given, so pass them as
    negative numbers. A gap of length n thus costs
    ``gapOpenPenalty + (n - 1) * gapExtensionPenalty``.

    Args:
        s1, s2: gapped sequences of equal length, with '-' marking a gap.
        matchScore: value added for each matching column.
        mismatchPenalty: value added for each mismatching column.
        gapOpenPenalty: value added when a gap opens.
        gapExtensionPenalty: value added for each column extending a gap.

    Returns:
        The total alignment score.
    """
    score = 0
    # Gap state is tracked per sequence and refreshed on every column, so each
    # new gap pays the open penalty again, including a gap in one sequence
    # that directly follows a gap in the other.
    previous_a = previous_b = None
    for a, b in zip(s1, s2):
        if a == '-' or b == '-':
            if a == '-':
                score += gapExtensionPenalty if previous_a == '-' else gapOpenPenalty
            if b == '-':
                score += gapExtensionPenalty if previous_b == '-' else gapOpenPenalty
        elif a == b:
            score += matchScore
        else:
            score += mismatchPenalty
        previous_a, previous_b = a, b
    return score

def _read_fasta_sequence(path):
    """Return the sequence of the first record of a FASTA file as one string.

    FASTA sequences are normally wrapped over several lines, so every
    sequence line of the first record is concatenated; header lines (starting
    with '>') and blank lines are skipped.
    """
    chunks = []
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                if chunks:
                    break  # a second record starts here; keep only the first
                continue
            chunks.append(line)
    return ''.join(chunks)


s1 = _read_fasta_sequence('IL12A.fasta')
s2 = _read_fasta_sequence('IL12B.fasta')

# Scoring scheme shared by pairwise2 and the custom getScore function
matchScore = 1
mismatchPenalty = -1
gapOpenPenalty = -2
gapExtensionPenalty = -1

# globalxx scores 1 per match with no mismatch or gap penalties, so its score
# cannot be compared with getScore under the scheme above. globalms aligns
# with exactly these parameters, which makes the two scores comparable.
alignments = pairwise2.align.globalms(
    s1, s2,
    matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty,
    one_alignment_only=True,  # only the best alignment is used below
)
best_alignment = alignments[0]
aligned_s1, aligned_s2, score, begin, end = best_alignment

print(f"Aligned Sequences:\n{aligned_s1}\n{aligned_s2}")
print("Matches:", countMatches(aligned_s1, aligned_s2))
print("Mismatches:", countMismatches(aligned_s1, aligned_s2))
print("Gap Opens:", countGapOpens(aligned_s1, aligned_s2))
print("Gap Extensions:", countGapExtensions(aligned_s1, aligned_s2))

print("Score from Biopython:", score)
print("Score from Custom Function:", getScore(aligned_s1, aligned_s2, matchScore, mismatchPenalty, gapOpenPenalty, gapExtensionPenalty))
