# University of York: APPLIED ARTIFICIAL INTELLIGENCE
## Summative assignment
### **Source file 2**: AI search algorithm

This source file contains the Python script for the genetic search algorithm.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random, copy
import functools
from pandas import Series, DataFrame
import re

df = pd.read_csv('Final_combined_dataset.csv')

# Two-country slice kept for quick manual experiments.
test = df[df['Countries, territories and areas'].isin(['Afghanistan','Peru'])]

# All unique years in the dataset, in ascending order. Individuals
# (chromosomes) draw their start/end year from this list, so it defines the
# search space. It is taken from the full frame, not from the two-country
# ``test`` slice, so that it matches the data the fitness function scores.
# Rows with a missing year are ignored.
years = sorted(df['Year'].dropna().unique())

# Unique countries present in the full dataset.
countries = df['Countries, territories and areas'].unique()
# Genetic algorithm parameters
#   population_size : number of individuals in each generation; each
#                     individual represents a range of years to evaluate.
#   generations     : total number of iterations the GA runs, evolving the
#                     population each time.
#   mutation_rate   : probability of randomly changing one year value of an
#                     individual during the mutation step.
#   tournament_size : number of random individuals drawn from the population
#                     in each tournament; the fittest one becomes a parent.
params = dict(
    population_size=100,
    generations=100,
    mutation_rate=0.1,
    tournament_size=50,
)

# Expose each setting as a module-level name for the functions below.
population_size, generations, mutation_rate, tournament_size = (
    params[key]
    for key in (
        "population_size",
        "generations",
        "mutation_rate",
        "tournament_size",
    )
)


# Columns whose non-null values are counted by fitness(): everything from the
# third column onward.
# NOTE(review): assumes the first two columns are the country and year keys;
# confirm against the CSV layout.
numeric_columns = df.columns[2:]

def initialize_population():
    """Build the starting population of random year ranges.

    Each individual is a two-element list ``[start_year, end_year]`` made
    from two distinct entries of the module-level ``years`` list, ordered so
    that the smaller year comes first.

    Returns:
        A list of ``population_size`` individuals.
    """
    def random_year_range():
        # Sorting the two sampled years puts the earlier one first.
        return sorted(random.sample(years, 2))

    return [random_year_range() for _ in range(population_size)]

# Memoised fitness values, keyed by "<start_year>-<end_year>".
fitness_cache = {}

def fitness(df, individual):
    """Score a year range by how much data it covers.

    The score is the number of non-null cells in ``numeric_columns`` over all
    rows of ``df`` whose ``Year`` lies in ``[start_year, end_year]`` and whose
    country is one of the module-level ``countries``. Rows with a missing
    country are not counted.

    Args:
        df: DataFrame with 'Year' and 'Countries, territories and areas'
            columns plus the columns named in ``numeric_columns``.
        individual: Two-element sequence ``[start_year, end_year]``.

    Returns:
        The non-null cell count, or 0 when ``start_year > end_year``.

    NOTE: results are cached in ``fitness_cache`` by year range only, so a
    call with a different ``df`` but a previously seen range returns the
    value computed for the earlier frame.
    """
    # Build the cache key once instead of re-joining it for every lookup.
    cache_key = "-".join(str(x) for x in individual)
    if cache_key in fitness_cache:
        return fitness_cache[cache_key]
    start_year, end_year = individual
    if start_year > end_year:
        return 0
    country_column = df['Countries, territories and areas']
    in_year_range = df['Year'].isin(range(start_year, end_year + 1))
    # isin() matches NaN against NaN, so exclude missing countries explicitly;
    # an equality comparison per country would never match them.
    in_known_country = country_column.notna() & country_column.isin(countries)
    # One vectorised count replaces a Python loop that re-filtered the frame
    # once per country. Equivalent as long as ``countries`` holds unique values.
    score = df.loc[in_year_range & in_known_country, numeric_columns].count().sum()
    fitness_cache[cache_key] = score
    return score

# Selection
def tournament_selection(population, fitness_scores):
    """Pick parents by repeated tournaments.

    Each round draws a random group of individuals without replacement and
    keeps the one with the highest score.

    Args:
        population: List of individuals.
        fitness_scores: Scores aligned index-by-index with ``population``.

    Returns:
        A list of ``tournament_size`` winners. The same module-level setting
        is used for the number of rounds and the size of each group.
        Winners are the original list objects, not copies.

    Raises:
        ValueError: If rounds are requested but the population is empty.
    """
    # Pair individuals with their scores once rather than on every round.
    contestants = list(zip(population, fitness_scores))
    if tournament_size > 0 and not contestants:
        raise ValueError("tournament_selection requires a non-empty population")
    # random.sample raises ValueError when asked for more items than exist,
    # so a group can never be larger than the population itself.
    group_size = min(tournament_size, len(contestants))
    selected = []
    for _ in range(tournament_size):
        participants = random.sample(contestants, group_size)
        winner = max(participants, key=lambda pair: pair[1])[0]
        selected.append(winner)
    return selected

# Crossover
def crossover(parent1, parent2):
    """Recombine two parents by swapping their end years.

    The first child takes its start year from ``parent1`` and its end year
    from ``parent2``; the second child takes the opposite combination. The
    children are new lists and are not re-ordered here, so a child's start
    year may be greater than its end year.

    Returns:
        ``[child1, child2]``.
    """
    offspring = []
    for start_donor, end_donor in ((parent1, parent2), (parent2, parent1)):
        offspring.append([start_donor[0], end_donor[1]])
    return offspring

# Mutation
def mutate(individual):
    """Randomly perturb an individual in place and return it.

    Each of the two year positions is independently replaced, with
    probability ``mutation_rate``, by a random entry of the module-level
    ``years`` list. The individual is then sorted so the smaller year comes
    first.

    Returns:
        The same list object that was passed in.
    """
    for position in (0, 1):
        if random.random() < mutation_rate:
            individual[position] = random.choice(years)
    individual.sort()
    return individual

def _normalise_scores(scores):
    """Return the z-scores of ``scores``; all zeros when they have no spread."""
    if len(scores) == 0:
        return []
    values = np.asarray(scores, dtype=float)
    spread = values.std()
    if spread == 0:
        # Every individual scored the same (e.g. a converged population):
        # dividing by zero would turn every score into NaN.
        return [0.0] * len(values)
    return ((values - values.mean()) / spread).tolist()


def genetic_search(df):
    """Run the genetic algorithm and return the final population.

    Every generation scores the population with ``fitness``, normalises the
    scores, selects parents by tournament, recombines consecutive parents
    with ``crossover`` and applies ``mutate`` to each child. Progress is
    printed for each step.

    Args:
        df: DataFrame passed through to ``fitness``.

    Returns:
        The population (list of ``[start_year, end_year]`` lists) produced
        by the last generation.
    """
    # Create population from data
    pop = initialize_population()
    # For each generation, produce new population
    print("********GENETIC SEARCH********")
    print("Parameters: ")
    for k, v in params.items():
        print(f"{k}: {v}")
    for gen in range(generations):
        print(f"Generation {gen}: Starting...")
        # Calculate fitness scores and normalise; mean and std are computed
        # once per generation rather than once per individual.
        scores = [fitness(df, p) for p in pop]
        normal_scores = _normalise_scores(scores)
        print(f"Generation {gen}: Fitness calculated.")
        # Perform Tournament selection
        selected = tournament_selection(pop, normal_scores)
        print(f"Generation {gen}: Tournament selection complete.")
        # Crossover and mutate selection
        next_population = []
        for i in range(0, population_size, 2):
            # Wrap around so fewer parents than offspring slots still works.
            parent1 = selected[i % len(selected)]
            parent2 = selected[(i+1) % len(selected)]
            offspring = crossover(parent1, parent2)
            next_population.extend(offspring)
        # Crossover yields children in pairs; drop the surplus child when
        # population_size is odd so the population does not grow.
        del next_population[population_size:]
        print(f"Generation {gen}: Crossover complete.")
        pop = [mutate(ind) for ind in next_population]
        print(f"Generation {gen}: Mutation complete.")
    return pop

# Call genetic search on the full dataset; `pop` holds the population
# returned after the last generation.
pop = genetic_search(df)


********GENETIC SEARCH********
Parameters: 
population_size: 100
generations: 1000
mutation_rate: 0.1
tournament_size: 50
Generation 0: Starting...
Generation 0: Fitness calculated.
Generation 0: Tournament selection complete.
Generation 0: Crossover complete.
Generation 0: Mutation complete.
Generation 1: Starting...
Generation 1: Fitness calculated.
Generation 1: Tournament selection complete.
Generation 1: Crossover complete.
Generation 1: Mutation complete.
Generation 2: Starting...
Generation 2: Fitness calculated.
Generation 2: Tournament selection complete.
Generation 2: Crossover complete.
Generation 2: Mutation complete.
Generation 3: Starting...
Generation 3: Fitness calculated.
Generation 3: Tournament selection complete.
Generation 3: Crossover complete.
Generation 3: Mutation complete.
Generation 4: Starting...
Generation 4: Fitness calculated.
Generation 4: Tournament selection complete.
Generation 4: Crossover complete.
Generation 4: Mutation complete.
Generation 5: Star

In [2]:
# Evaluate results: score the final population and keep the first individual
# that reaches the highest fitness, together with that fitness value.
fitness_scores = [fitness(df, ind) for ind in pop]
best_individual, best_fitness = max(
    zip(pop, fitness_scores),
    key=lambda scored: scored[1],
)


In [6]:
# Show the best year range found and its fitness score.
print(best_individual, best_fitness)

[1952, 2021] 83015
