In [None]:
# GMatches Code - Spacecrafts - Do Not Touch

import pandas as pd
from fuzzywuzzy import process

# Constants
# Minimum fuzzy-match score passed to process.extractOne as score_cutoff.
SCORE_CUTOFF = 85

# Load the CSV files into pandas dataframes
scraper_data = pd.read_csv("G_SC.csv", encoding='UTF-8', on_bad_lines='skip')

# ignore_index=True gives the combined frame one unique 0..n-1 index. Without it
# each file keeps its own 0..n-1 labels, so labels repeat and a label taken from
# the index no longer equals the row's position (name_comparison feeds an index
# label to .iloc, which would then return a row from the wrong file).
spacecraft_data = pd.concat(
    [
        pd.read_csv("SC_w_LC.csv", encoding='UTF-8', on_bad_lines='skip'),
        pd.read_csv("SC_wo_LC.csv", encoding='UTF-8', on_bad_lines='skip'),
    ],
    ignore_index=True,
)

def get_spacecraft_names():
    """
    Return the 'Spacecraft Name' column of the module-level 'spacecraft_data' dataframe.

    The value returned is the pandas Series for that column (not a plain Python list).
    """
    names = spacecraft_data["Spacecraft Name"]
    return names

def name_comparison(scraper_name, spacecraft_names):
    """
    Find the best fuzzy match for 'scraper_name' among 'spacecraft_names'.

    Uses fuzzywuzzy's process.extractOne with SCORE_CUTOFF as the minimum score.

    Returns a dict with 'Record ID#', 'Spacecraft Name' and 'Similarity %' taken from
    the first row of 'spacecraft_data' whose 'Spacecraft Name' equals the matched name,
    or None when no candidate reaches the cutoff.

    Raises IndexError if the matched name is not present in 'spacecraft_data'.
    """
    result = process.extractOne(scraper_name, spacecraft_names, score_cutoff=SCORE_CUTOFF)
    if not result:
        return None

    matched_name, similarity = result[0], result[1]

    # Select the row through a boolean mask and take the first hit by position.
    # The previous code took an index *label* and passed it to .iloc (which is
    # positional); 'spacecraft_data' is a concatenation of two files, so labels can
    # repeat and differ from positions, which returned the wrong record.
    matched_row = spacecraft_data[spacecraft_data['Spacecraft Name'] == matched_name].iloc[0]

    # split('.')[0] keeps the text before the first '.', e.g. '123.0' -> '123'.
    # NOTE(review): this also truncates spacecraft names that contain a '.';
    # kept as-is to preserve existing output - confirm it is intended.
    return {
        'Record ID#': str(matched_row["Record ID#"]).split('.')[0],
        'Spacecraft Name': str(matched_row["Spacecraft Name"]).split('.')[0],
        'Similarity %': similarity
    }



# (output column, scraper_data column) pairs copied into every result row, in output order.
_SCRAPER_FIELDS = (
    ('GS Spacecraft', "Spacecraft Name"),
    ('Date', "Date"),
    ('Vehicle Name', "Vehicle Name"),
    ('Launch Site', "Launch Site"),
    ('Remark', "Remark"),
    ('Country', "Country"),
    ('Market Segment', "Market Segment"),
    ('Operator', "Operator"),
    ('Prime Manufacturer', "Prime Manufacturer"),
    ('Equipment', "Equipment"),
    ('Configuration', "Configuration"),
    ('Propulsion', "Propulsion"),
    ('Power', "Power"),
    ('Design Life', "Design Life"),
    ('Mass', "Mass"),
    ('Cospar', "COSPAR"),
)


def _scraper_fields(row):
    """Copy the scraper columns of one row into a dict keyed by the output column names."""
    return {out_key: row[source_col] for out_key, source_col in _SCRAPER_FIELDS}


def get_results():
    """
    Match every row of the 'scraper_data' dataframe against the spacecraft names.

    Rows with a match carry the match fields ('Record ID#', 'Spacecraft Name',
    'Similarity %') followed by the scraper fields; rows without a match carry only
    the scraper fields.

    Returns a dataframe with the matched rows first and the unmatched rows after them.
    As a side effect the same dataframe is written to 'matches.csv'.
    """
    spacecraft_names = get_spacecraft_names()
    matched_results = []
    unmatched_rows = []
    for _, row in scraper_data.iterrows():
        result = name_comparison(row["Spacecraft Name"], spacecraft_names)
        fields = _scraper_fields(row)
        if result:
            # Match fields stay first; the scraper fields are appended after them.
            result.update(fields)
            matched_results.append(result)
        else:
            unmatched_rows.append(fields)

    # Convert the matched_results and unmatched_rows lists to pandas dataframes
    matched_df = pd.DataFrame(matched_results)
    unmatched_df = pd.DataFrame(unmatched_rows)

    # Keep only the text before the first '.' in Record ID# and Spacecraft Name.
    # Guarded: with zero matches 'matched_df' has no columns at all and indexing
    # them raised KeyError.
    if matched_results:
        matched_df["Record ID#"] = matched_df["Record ID#"].str.split('.').str[0]
        matched_df["Spacecraft Name"] = matched_df["Spacecraft Name"].str.split('.').str[0]

    # Merge the matched and unmatched dataframes and save the results to a CSV file named 'matches.csv'
    result_df = pd.concat([matched_df, unmatched_df])
    result_df.to_csv("matches.csv", index=False)

    return result_df

def export_to_csv(dataframe, file_name):
    """
    Write 'dataframe' to the CSV file 'file_name', leaving out the index column.
    """
    dataframe.to_csv(path_or_buf=file_name, index=False)

# Build the combined matched/unmatched spacecraft dataframe.
# get_results() also writes this dataframe to 'matches.csv' as a side effect.
results_df = get_results()

# Write the same dataframe to 'SC_Matches.csv'
export_to_csv(dataframe=results_df, file_name="SC_Matches.csv")



In [None]:
# GMatches Code - Launch Events - Do Not Touch

import pandas as pd
from fuzzywuzzy import process, fuzz
from datetime import datetime

# Constants
# NOTE(review): SCORE_CUTOFF is not referenced by the launch-event code in this cell.
# It is the same global name that name_comparison() in the spacecraft cell reads
# (set to 85 there), so running this cell changes that cutoff in a shared session -
# confirm whether this assignment is needed.
SCORE_CUTOFF = 80
# Minimum score a launch-event match must reach; compared against
# fuzz.token_sort_ratio results in compare_row() and against both scores in the
# result loop.
SIMILARITY_THRESHOLD = 55

# Load the CSV files into pandas dataframes
# g_le_df: rows to be matched; the code below reads its "Date", "Launch Vehicle"
# and "Site" columns.
g_le_df = pd.read_csv("G_LE.csv", encoding='UTF-8', on_bad_lines='skip')
# b_le_df: candidate rows; the code below reads its "Launch Date", "Vehicle Name"
# and "Record ID#" columns.
b_le_df = pd.read_csv("B_LE.csv", encoding='UTF-8', on_bad_lines='skip')

# Define a function to compare a single row from 'g_le_df' with all rows in 'b_le_df'
def compare_row(row):
    """
    Find the row of 'b_le_df' that best matches one row of 'g_le_df'.

    A candidate must have the same calendar date ("Launch Date", format %m-%d-%Y,
    versus row["Date"], format %m/%d/%Y) and a fuzz.token_sort_ratio between the
    vehicle names of at least SIMILARITY_THRESHOLD. Among such candidates the one
    with the highest vehicle ratio wins; on ties the earliest row is kept.

    Returns a tuple (date_match, vehicle_match, date_match_score, vehicle_match_score)
    or None when there is no candidate. Both match entries are the same 'b_le_df' row,
    so the Record ID#, Launch Date and Vehicle Name reported by the caller always come
    from one record. date_match_score is 100 because dates must be equal.

    Raises ValueError if a non-missing date does not follow the expected format.
    """
    # A missing date cannot match anything (and strptime would raise TypeError on NaN).
    if pd.isna(row["Date"]):
        return None
    g_date = datetime.strptime(row["Date"], "%m/%d/%Y").date()
    g_vehicle = str(row["Launch Vehicle"])

    best_match = None
    best_score = 0
    for _, b_row in b_le_df.iterrows():
        launch_date = b_row["Launch Date"]
        if pd.isna(launch_date):
            continue
        # Parsed before the comparison so a malformed date is still reported loudly.
        b_date = datetime.strptime(launch_date, "%m-%d-%Y").date()
        if b_date != g_date:
            continue

        # Score the two vehicle names directly. The previous process.extractOne call
        # received a plain string as its choices, so it iterated over single
        # characters and produced a meaningless score.
        vehicle_ratio = fuzz.token_sort_ratio(g_vehicle, str(b_row["Vehicle Name"]))
        if vehicle_ratio >= SIMILARITY_THRESHOLD and vehicle_ratio > best_score:
            best_match = b_row
            best_score = vehicle_ratio

    if best_match is None:
        return None
    return (best_match, best_match, 100, best_score)


# Loop through the rows of the 'g_le_df' dataframe and perform comparison for each row.
# Every input row produces exactly one output row, matched or not.
rows = []
for _, row in g_le_df.iterrows():
    match_result = compare_row(row)
    is_match = (
        match_result is not None
        and match_result[2] >= SIMILARITY_THRESHOLD
        and match_result[3] >= SIMILARITY_THRESHOLD
    )

    new_row = {
        "Date": row["Date"],
        "Launch Vehicle": row["Launch Vehicle"],
        "Site": row["Site"],
    }
    if is_match:
        date_match, vehicle_match, date_match_score, vehicle_match_score = match_result
        new_row.update({
            "Record ID#": date_match["Record ID#"],
            "Launch Date": date_match["Launch Date"],
            "Vehicle Name": vehicle_match["Vehicle Name"],
            "Date Similarity %": date_match_score,
            "Vehicle Similarity %": vehicle_match_score,
        })
    else:
        new_row.update({
            "Record ID#": "",
            "Launch Date": "",
            "Vehicle Name": "",
            "Date Similarity %": "",
            "Vehicle Similarity %": "",
        })
    # The status is recorded from the match decision itself. Deriving it afterwards
    # from the truthiness of 'Record ID#' labelled a matched record whose ID is 0
    # as 'No Match'.
    new_row["Match Status"] = "Match" if is_match else "No Match"
    rows.append(new_row)

# Convert the results list to a pandas dataframe.
# Explicit columns keep the header and column order even when there are no rows.
result_df = pd.DataFrame(
    rows,
    columns=[
        "Date",
        "Launch Vehicle",
        "Site",
        "Record ID#",
        "Launch Date",
        "Vehicle Name",
        "Date Similarity %",
        "Vehicle Similarity %",
        "Match Status",
    ],
)

# print(result_df)

# Write the results to a new CSV file named 'LE_Matches.csv'
result_df.to_csv("LE_Matches.csv", index=False)
