# Red Thread Matchmaking Simulation
Our goal is to simulate dating dynamics as accurately as possible. We hypothesize that we can create an effective matchmaking algorithm by focusing on the following data: BMI, age, location, and gender. We will measure our algorithm's "effectiveness" by the odds of getting a second date.

## Python Setup
Run the following command to set up the Python environment.

In [None]:
!pip3 install numpy matplotlib geopandas contextily rtree geodatasets seaborn

## Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
from shapely.geometry import Point
import contextily as ctx
from geodatasets import get_path
from geopy.distance import great_circle

## Parameters

In [None]:
# Simulation parameters read by the data-generation cells.
num_of_users = 1000  # number of synthetic users to generate
mean_age = 30  # mean of the normal distribution ages are drawn from
std_dev_age = 5  # standard deviation of that age distribution
mean_bmi_men = 29.0  # mean of the normal BMI distribution used for 'Male' users
std_dev_bmi_men = 4.73  # standard deviation of the 'Male' BMI distribution
mean_bmi_women = 27.7  # mean of the normal BMI distribution used for all other users
std_dev_bmi_women = 6.15  # standard deviation of that BMI distribution
gender_distribution = [0.7, 0.3]  # sampling probabilities for ['Male', 'Female'], in that order

## Key Functions

### Calculate geographical closeness factor

In [None]:
def calculate_geographical_closeness(location1, location2, max_distance=20):
    """Return a closeness score in [0, 1] for two point locations.

    The great-circle distance between the points is measured in miles,
    capped at ``max_distance`` and mapped linearly: 0 miles gives 1.0 and
    ``max_distance`` miles or more gives 0.0.

    Args:
        location1: Point-like object; ``.y`` is passed as latitude and ``.x``
            as longitude.
        location2: Second point, same convention as ``location1``.
        max_distance: Distance in miles at which the score reaches 0.
            Defaults to 20, the value previously hard-coded as a reasonable
            dating distance in NYC.

    Returns:
        The closeness factor as a float between 0 and 1.

    Raises:
        ValueError: If ``max_distance`` is not positive.
    """
    if max_distance <= 0:
        raise ValueError("max_distance must be positive")
    distance = great_circle((location1.y, location1.x), (location2.y, location2.x)).miles
    return 1 - min(distance, max_distance) / max_distance

### Generate random points within a borough

In [None]:
def generate_random_points_within_borough(polygon, num_points):
    """Rejection-sample ``num_points`` points that lie inside ``polygon``.

    Candidates are drawn uniformly over the polygon's bounding box and kept
    only when ``polygon.contains`` accepts them.

    Args:
        polygon: Geometry exposing ``bounds`` and ``contains``.
        num_points: Number of points to return.

    Returns:
        A list of accepted ``Point`` objects, in the order they were drawn.
    """
    x_lo, y_lo, x_hi, y_hi = polygon.bounds
    batch_size = num_points * 2
    accepted = []
    while len(accepted) < num_points:
        # Draw the whole batch before filtering (x first, then y, per candidate).
        batch = [
            Point(np.random.uniform(x_lo, x_hi), np.random.uniform(y_lo, y_hi))
            for _ in range(batch_size)
        ]
        accepted.extend(candidate for candidate in batch if polygon.contains(candidate))
    # The last batch may overshoot; keep only the first num_points hits.
    return accepted[:num_points]

### Generate synthetic survey responses

In [None]:
def generate_survey_responses(odds):
    """Simulate one yes/no second-date outcome for every pair of users.

    For each unordered pair (i, j) with i < j, a uniform random number is
    drawn and the pair is marked 1 when it falls below ``odds[i, j]``,
    otherwise 0. The outcome is mirrored so the result is symmetric.
    Diagonal entries are left at 0.

    Args:
        odds: Square NumPy array of pairwise probabilities; only entries
            above the diagonal are read.

    Returns:
        A float array with the same shape as ``odds`` holding 0/1 outcomes.
    """
    num_of_users = odds.shape[0]
    responses = np.zeros((num_of_users, num_of_users))
    # Indices of every pair above the diagonal, in row-major (i, then j) order.
    rows, cols = np.triu_indices(num_of_users, k=1)
    # One vectorised draw per pair replaces the Python-level double loop.
    outcomes = np.random.rand(rows.size) < odds[rows, cols]
    responses[rows, cols] = outcomes
    responses[cols, rows] = outcomes
    return responses

### BMI Classification

In [None]:
def classify_bmi(bmi):
    """Map a BMI value to its weight category.

    Args:
        bmi: Body-mass index as a number.

    Returns:
        'underweight' below 18.5, 'normal' below 25.0, 'overweight' below
        30.0, and 'obese' for anything else.
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = (
        (18.5, 'underweight'),
        (25.0, 'normal'),
        (30.0, 'overweight'),
    )
    for limit, label in upper_bounds:
        if bmi < limit:
            return label
    return 'obese'

### Calculate BMI compatibility factor

In [None]:
def bmi_compatibility(bmi1, bmi2):
    """Score how compatible two BMI values are, by weight category.

    Both values are classified with ``classify_bmi`` and compared by their
    position on the ordered scale underweight < normal < overweight < obese.

    Previously only the underweight/normal and overweight/obese pairs counted
    as adjacent, so normal vs overweight scored 0.0 despite being neighbours.
    Adjacency is now decided by position on the scale.

    Args:
        bmi1: BMI of the first person.
        bmi2: BMI of the second person.

    Returns:
        1.0 for the same category, 0.5 for adjacent categories, and 0.0 for
        categories two or more steps apart.
    """
    # Categories in increasing-BMI order; the list position is the rank.
    ordered_classes = ['underweight', 'normal', 'overweight', 'obese']
    rank1 = ordered_classes.index(classify_bmi(bmi1))
    rank2 = ordered_classes.index(classify_bmi(bmi2))
    gap = abs(rank1 - rank2)
    if gap == 0:
        return 1.0  # Full compatibility if in the same category
    if gap == 1:
        return 0.5  # Partial compatibility if in adjacent categories
    return 0.0  # No compatibility if in non-adjacent categories

### Simulate second date odds based on age, BMI, and location

In [None]:
def simulate_second_date_odds(ages, bmis, locations):
    """Build the symmetric matrix of pairwise second-date odds.

    Each pair's odds are the product of three factors: an age factor that
    falls linearly from 1 to 0 over a 10-year gap, the BMI compatibility
    factor, and the geographical closeness factor.

    Args:
        ages: Per-user ages, indexable by position.
        bmis: Per-user BMI values, indexable by position.
        locations: Per-user points, accessed through ``.iloc``.

    Returns:
        A ``len(ages)`` x ``len(ages)`` float array; the diagonal stays 0.
    """
    user_count = len(ages)
    odds = np.zeros((user_count, user_count))

    for a in range(user_count):
        for b in range(a + 1, user_count):
            # Age: gaps of 10 years or more contribute nothing.
            age_gap = abs(ages[a] - ages[b])
            age_score = 1 - age_gap / 10 if age_gap < 10 else 0

            # BMI: category-based compatibility.
            bmi_score = bmi_compatibility(bmis[a], bmis[b])

            # Location: closer pairs score higher.
            location_score = calculate_geographical_closeness(locations.iloc[a], locations.iloc[b])

            pair_odds = age_score * bmi_score * location_score
            odds[a, b] = pair_odds
            odds[b, a] = pair_odds

    return odds


### Generate random ages >= 18

In [None]:
def generate_valid_ages(mean, std, num_users, min_age=18):
    """Draw normally distributed ages, redrawing any below ``min_age``.

    Args:
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        num_users: Number of ages to generate.
        min_age: Smallest age allowed in the result. Defaults to 18, the
            threshold that was previously hard-coded.

    Returns:
        A NumPy array of ``num_users`` ages, all ``>= min_age``.

    Raises:
        ValueError: If ``std`` is 0 and ``mean`` is below ``min_age``, since
            redrawing could then never produce a valid age.
    """
    if std == 0 and mean < min_age:
        raise ValueError("mean is below min_age and std is 0; no valid age can be drawn")
    ages = np.random.normal(mean, std, num_users)
    # Compute the mask once per pass and use the vectorised ndarray.any().
    too_young = ages < min_age
    while too_young.any():
        ages[too_young] = np.random.normal(mean, std, too_young.sum())
        too_young = ages < min_age
    return ages

## Generate example data

In [None]:
ages = generate_valid_ages(mean_age, std_dev_age, num_of_users)
genders = np.random.choice(['Male', 'Female'], size=num_of_users, p=gender_distribution)

# Generate BMIs based on gender: one normal draw per user from that gender's distribution
bmis = np.array([
    np.random.normal(mean_bmi_men, std_dev_bmi_men) if gender == 'Male'
    else np.random.normal(mean_bmi_women, std_dev_bmi_women)
    for gender in genders
])

# Generate geographical data (NYC boroughs)
nyc_land = gpd.read_file(get_path('nybb')).to_crs(epsg=4326)
# Per-borough density figures, used only as relative sampling weights.
population_densities = {
    'Manhattan': 72000,
    'Bronx': 33000,
    'Brooklyn': 38000,
    'Queens': 21000,
    'Staten Island': 8300
}
total_density = sum(population_densities.values())
borough_probs = {k: v / total_density for k, v in population_densities.items()}

# Split num_of_users across boroughs with the largest-remainder method so the
# counts add up to exactly num_of_users. Truncating each share with int()
# on its own loses the fractional parts and leaves fewer points than users.
exact_counts = {borough: prob * num_of_users for borough, prob in borough_probs.items()}
borough_counts = {borough: int(count) for borough, count in exact_counts.items()}
shortfall = max(0, num_of_users - sum(borough_counts.values()))
by_remainder = sorted(
    exact_counts,
    key=lambda name: exact_counts[name] - borough_counts[name],
    reverse=True,
)
for borough in by_remainder[:shortfall]:
    borough_counts[borough] += 1

random_points = []
for borough, num_points in borough_counts.items():
    # NOTE(review): newer GeoPandas releases deprecate `unary_union` in favour
    # of `union_all()` — confirm against the installed version.
    borough_polygon = nyc_land[nyc_land['BoroName'] == borough].geometry.unary_union
    # The generator returns exactly num_points points, so no top-up loop is needed.
    random_points.extend(generate_random_points_within_borough(borough_polygon, num_points))

## Data Preprocessing

### Ensure length consistency

In [None]:
# Cap the point list at the requested user count, then wrap it in an
# EPSG:4326 GeoDataFrame whose geometry column holds the user locations.
random_points = random_points[:num_of_users]
geo_df = gpd.GeoDataFrame({'geometry': random_points}, crs='EPSG:4326')
locations = geo_df.geometry

### Ensure all arrays are of the same length

In [None]:
# Trim every per-user sequence to the shortest one so index i always refers
# to the same user in all four.
min_length = min(len(ages), len(bmis), len(genders), len(locations))
ages, bmis, genders, locations = (
    per_user[:min_length] for per_user in (ages, bmis, genders, locations)
)
print(f"Generated {min_length} users")

## Calculations

### Calculate second date odds

In [None]:
# Symmetric users-by-users matrix of pairwise second-date odds (diagonal is 0).
second_date_odds_matrix = simulate_second_date_odds(ages, bmis, locations)

### Calculate survey responses

In [None]:
# Draw one random 0/1 outcome per user pair, using the odds matrix as the probability.
survey_responses = generate_survey_responses(second_date_odds_matrix)

### Flatten the upper triangle of the matrices for plotting

In [None]:
# Keep each unordered pair once: the entries strictly above the diagonal.
num_of_users = len(ages)
pair_rows, pair_cols = np.triu_indices(num_of_users, k=1)
second_date_odds = second_date_odds_matrix[pair_rows, pair_cols]
survey_responses_flat = survey_responses[pair_rows, pair_cols]

### Calculate distances for location pairs

In [None]:
# Great-circle distance in miles for every user pair (i < j).
# Each point's (latitude, longitude) tuple is read once up front.
lat_lon = [(point.y, point.x) for point in locations]
distances = [
    great_circle(lat_lon[i], lat_lon[j]).miles
    for i in range(num_of_users)
    for j in range(i + 1, num_of_users)
]

### Prepare data for scatter plots

In [None]:
# Build (first user, second user) value tuples for every pair i < j, using
# the upper-triangle index arrays instead of an explicit double loop.
first_idx, second_idx = np.triu_indices(num_of_users, k=1)
ages_pairs = list(zip(ages[first_idx], ages[second_idx]))
bmis_pairs = list(zip(bmis[first_idx], bmis[second_idx]))

### Extract ages and BMIs for pairs

In [None]:
def _pairwise_gaps(pairs):
    """Return the absolute difference between the two values of each pair."""
    return [abs(first - second) for first, second in pairs]


ages_diff = _pairwise_gaps(ages_pairs)
bmis_diff = _pairwise_gaps(bmis_pairs)

### Create buckets for visualization

In [None]:
# Shared bin edges [0, 5, 10, 15, 20]. np.digitize labels values in [0, 5)
# as 1, [5, 10) as 2, [10, 15) as 3, [15, 20) as 4, and anything >= 20 as 5.
bucket_edges = np.arange(0, 21, 5)
age_buckets = np.digitize(ages_diff, bins=bucket_edges)  # age difference buckets
bmi_buckets = np.digitize(bmis_diff, bins=bucket_edges)  # BMI difference buckets
distance_buckets = np.digitize(distances, bins=bucket_edges)  # distance buckets

### Calculate average second date odds for each bucket

In [None]:
def _mean_response_per_bucket(bucket_ids):
    """Return the mean of survey_responses_flat over the pairs in each of buckets 1-4."""
    return [np.mean(survey_responses_flat[bucket_ids == bucket]) for bucket in range(1, 5)]


age_bucket_means = _mean_response_per_bucket(age_buckets)
bmi_bucket_means = _mean_response_per_bucket(bmi_buckets)
distance_bucket_means = _mean_response_per_bucket(distance_buckets)

## Analysis

### Set the size of the plot

In [None]:
# Open a new figure with figsize (15, 5).
# NOTE(review): the bar charts are drawn in separate cells; with an inline
# notebook backend this figure may be rendered empty — confirm.
plt.figure(figsize=(15, 5))

### Age vs. Second Date Odds

In [None]:
# Draw on a dedicated figure so this chart never shares axes with another
# chart, whichever Matplotlib backend is active.
plt.figure()
plt.bar(np.arange(1, 5), age_bucket_means, tick_label=['0-5', '5-10', '10-15', '15-20'])
plt.title('Age Difference vs. Second Date Odds')
plt.xlabel('Age Difference (years)')
plt.ylabel('Average Second Date Odds')
plt.show()

### BMI vs. Second Date Odds

In [None]:
# Draw on a dedicated figure so this chart never shares axes with another
# chart, whichever Matplotlib backend is active.
plt.figure()
plt.bar(np.arange(1, 5), bmi_bucket_means, tick_label=['0-5', '5-10', '10-15', '15-20'])
plt.title('BMI Difference vs. Second Date Odds')
plt.xlabel('BMI Difference')
plt.ylabel('Average Second Date Odds')
plt.show()

### Distance vs. Second Date Odds

In [None]:
# Draw on a dedicated figure so this chart never shares axes with another
# chart, whichever Matplotlib backend is active.
plt.figure()
plt.bar(np.arange(1, 5), distance_bucket_means, tick_label=['0-5', '5-10', '10-15', '15-20'])
plt.title('Distance vs. Second Date Odds')
plt.xlabel('Distance (miles)')
plt.ylabel('Average Second Date Odds')
plt.show()

### Geographical Location

In [None]:
# Attach each user's gender and the matching marker colour to geo_df.
colors = {'Male': 'blue', 'Female': 'red'}
geo_df['gender'] = genders
geo_df['color'] = [colors[gender] for gender in geo_df['gender']]

# One 10x10 figure: borough outlines first, user markers on top.
plt.figure(figsize=(10, 10))
ax = plt.gca()
nyc_land.boundary.plot(ax=ax, linewidth=1, color='black')
geo_df.plot(ax=ax, color=geo_df['color'], markersize=50, alpha=0.6)

# Add basemap
ctx.add_basemap(ax, crs='EPSG:4326', source=ctx.providers.CartoDB.Positron)

plt.title('Geographical Locations of Users')
plt.xlabel('Longitude')
plt.ylabel('Latitude')

# One legend entry per gender, built from the same colour mapping as the markers.
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=face, markersize=10, label=name)
    for name, face in colors.items()
]
plt.legend(handles=legend_handles)

### Age distribution

In [None]:
# Histogram of user ages with a KDE overlay, styled through the returned Axes.
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(ages, bins=30, kde=True)
age_ax.set_title('User Age Distribution')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.grid(True)

### Gender Distribution

In [None]:
# value_counts() orders genders by frequency, so wedge colours are looked up
# by label. A fixed ['blue', 'red'] list would swap the colours whenever
# 'Female' is the larger group.
gender_counts = geo_df['gender'].value_counts()
gender_colors = {'Male': 'blue', 'Female': 'red'}
wedge_colors = [gender_colors[gender] for gender in gender_counts.index]
plt.figure(figsize=(8, 8))
plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=140, colors=wedge_colors, textprops={'fontsize': 14})
plt.title('User Gender Distribution', fontsize=16, pad=20)
plt.axis('equal')
plt.show()

### BMI distributions for men and women

In [None]:
# Overlay one BMI histogram (with KDE) per gender on a shared figure.
plt.figure(figsize=(10, 6))
for gender_label, hist_color in (('Male', 'blue'), ('Female', 'red')):
    sns.histplot(bmis[genders == gender_label], bins=30, kde=True, color=hist_color, label=gender_label)
plt.title('BMI Distribution by Gender')
plt.xlabel('BMI')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)