In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import datetime

# Read both CSV exports and keep only the columns the matching step uses.
# The communication text column is renamed to 'title' so that both frames
# expose their text under the same column name.
raw_bill_data = pd.read_csv('./raw_bills.csv')[['title', 'congress']]
raw_communication_data = (
    pd.read_csv('./raw_comm.csv')[['content', 'pub_date']]
    .rename(columns={'content': 'title'})
)

# Define function to determine congress by date
# (congress number, first day of the term, first day of the following term).
# Built once at definition time so the boundaries are not re-created and
# re-parsed for every row. Covers congresses 108 (2003-01-03) to 118 (ending
# 2025-01-03), each term starting on January 3 of an odd year.
_CONGRESS_TERMS = tuple(
    (
        108 + offset,
        datetime.datetime(2003 + 2 * offset, 1, 3),
        datetime.datetime(2005 + 2 * offset, 1, 3),
    )
    for offset in range(11)
)


def determine_congress_by_date(date):
    """Return the congress number whose term contains ``date``.

    Parameters
    ----------
    date : str, datetime.datetime, datetime.date, float or None
        A ``'%Y-%m-%d'`` string, or an already-parsed date/datetime object.
        ``None``, NaN and blank strings are treated as "date missing".

    Returns
    -------
    int or None
        The congress number (108-118) for which ``start <= date < end``, or
        ``None`` when the date is missing or falls outside the covered terms.

    Raises
    ------
    ValueError
        If ``date`` is a non-blank string that is not in ``'%Y-%m-%d'`` form.
    TypeError
        If ``date`` is of any other unsupported type.
    """
    # pandas represents a missing CSV cell as float NaN; NaN is the only float
    # that is not equal to itself.
    if date is None or (isinstance(date, float) and date != date):
        return None

    if isinstance(date, str):
        if not date.strip():
            return None
        moment = datetime.datetime.strptime(date, '%Y-%m-%d')
    elif isinstance(date, datetime.datetime):
        moment = date
    elif isinstance(date, datetime.date):
        # Promote a plain date to midnight so it compares with the boundaries.
        moment = datetime.datetime(date.year, date.month, date.day)
    else:
        raise TypeError(
            f"date must be a 'YYYY-MM-DD' string or a date/datetime, got {type(date).__name__}"
        )

    # Half-open interval: a boundary day belongs to the congress that starts on it.
    for congress, start_date, end_date in _CONGRESS_TERMS:
        if start_date <= moment < end_date:
            return congress

    return None

# Load the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Tag every communication with the congress derived from its publication date
raw_communication_data['congress'] = (
    raw_communication_data['pub_date'].apply(determine_congress_by_date)
)

# Define function to get embeddings
def get_embeddings(sentences, model):
    """Encode one sentence or a list of sentences with ``model``.

    Parameters
    ----------
    sentences : str or list of str
        A single sentence (wrapped into a one-element list before encoding)
        or a list whose items are all strings.
    model : object
        Anything exposing ``encode(list_of_str)``.

    Returns
    -------
    Whatever ``model.encode`` returns for the list of sentences.

    Raises
    ------
    ValueError
        If ``sentences`` is neither a string nor a list made only of strings.
    """
    if isinstance(sentences, str):
        return model.encode([sentences])

    is_list_of_str = isinstance(sentences, list) and all(
        isinstance(item, str) for item in sentences
    )
    if not is_list_of_str:
        raise ValueError("Input should be a list of sentences or a single sentence.")

    return model.encode(sentences)

# Encode every communication text and every bill title with the same model
extracted_embeddings = get_embeddings(
    sentences=raw_communication_data['title'].tolist(),
    model=model,
)
existing_embeddings = get_embeddings(
    sentences=raw_bill_data['title'].tolist(),
    model=model,
)

# Function to find similarities
def find_similarities(extracted_embeddings, existing_embeddings, extracted_content, existing_bills, communication_congress, bill_congress, threshold=0.60):
    """Pair each communication with same-congress bills whose text is similar.

    Row ``i`` of ``extracted_embeddings`` is taken to describe
    ``extracted_content[i]`` / ``communication_congress[i]``, and row ``j`` of
    ``existing_embeddings`` to describe ``existing_bills[j]`` /
    ``bill_congress[j]``.

    Returns
    -------
    dict
        Maps a communication text to a list of ``(bill, similarity)`` tuples,
        one per bill with an equal congress value and a cosine similarity
        strictly greater than ``threshold``. Communications without any
        qualifying bill get no key; identical communication texts share a key.
    """
    score_matrix = cosine_similarity(extracted_embeddings, existing_embeddings)
    matches = {}

    for comm_idx, comm_text in enumerate(extracted_content):
        comm_congress = communication_congress[comm_idx]
        for bill_idx, bill_title in enumerate(existing_bills):
            # Only bills from the same congress are candidates.
            if not (comm_congress == bill_congress[bill_idx]):
                continue
            score = score_matrix[comm_idx][bill_idx]
            if score > threshold:
                matches.setdefault(comm_text, []).append((bill_title, score))

    return matches

# Flatten the text and congress columns of both frames into plain lists
extracted_content = raw_communication_data['title'].tolist()
existing_bills = raw_bill_data['title'].tolist()
communication_congress = raw_communication_data['congress'].tolist()
bill_congress = raw_bill_data['congress'].tolist()

# Match communications with bills (default similarity threshold)
matches = find_similarities(
    extracted_embeddings,
    existing_embeddings,
    extracted_content,
    existing_bills,
    communication_congress,
    bill_congress,
)

# Prepare data for CSV: one row per distinct matched communication text,
# listing its bills best match first.
output_data = []
for extracted, similar_bills in matches.items():
    if not similar_bills:
        output_data.append([extracted, "no match"])
        continue
    # Drop exact duplicate (title, score) pairs while keeping first-seen order,
    # then rank by descending similarity (the sort is stable, so ties keep that
    # order). De-duplicating through a set made the order of bills in the
    # output column change from run to run.
    unique_bills = sorted(
        dict.fromkeys(similar_bills),
        key=lambda pair: pair[1],
        reverse=True,
    )
    bills_with_scores = "; ".join(
        f"{bill} (similarity: {score:.2f})" for bill, score in unique_bills
    )
    output_data.append([extracted, bills_with_scores])

# Communications that produced no match at all are appended afterwards.
# NOTE(review): a text occurring several times in extracted_content yields one
# "no match" row per occurrence, whereas matched texts yield a single row.
for extracted in extracted_content:
    if extracted not in matches:
        output_data.append([extracted, "no match"])

# Convert to DataFrame
output_df = pd.DataFrame(output_data, columns=["Communication Content", "Bill Titles with Similarity Scores"])

# Save to CSV
output_df.to_csv('matched_communications_with_scores.csv', index=False)


In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import datetime

# Read both CSV exports, keeping the identifier columns needed to report
# matches. The communication text column is renamed to 'title' so that both
# frames expose their text under the same column name.
raw_bill_data = pd.read_csv('./raw_bills.csv')[
    ['bill_number', 'bill_type', 'title', 'congress']
]
raw_communication_data = (
    pd.read_csv('./raw_comm.csv')[['comm_content_id', 'content', 'pub_date']]
    .rename(columns={'content': 'title'})
)

# Define function to determine congress by date
# (congress number, first day of the term, first day of the following term).
# Built once at definition time so the boundaries are not re-created and
# re-parsed for every row. Covers congresses 108 (2003-01-03) to 118 (ending
# 2025-01-03), each term starting on January 3 of an odd year.
_CONGRESS_TERMS = tuple(
    (
        108 + offset,
        datetime.datetime(2003 + 2 * offset, 1, 3),
        datetime.datetime(2005 + 2 * offset, 1, 3),
    )
    for offset in range(11)
)


def determine_congress_by_date(date):
    """Return the congress number whose term contains ``date``.

    Parameters
    ----------
    date : str, datetime.datetime, datetime.date, float or None
        A ``'%Y-%m-%d'`` string, or an already-parsed date/datetime object.
        ``None``, NaN and blank strings are treated as "date missing".

    Returns
    -------
    int or None
        The congress number (108-118) for which ``start <= date < end``, or
        ``None`` when the date is missing or falls outside the covered terms.

    Raises
    ------
    ValueError
        If ``date`` is a non-blank string that is not in ``'%Y-%m-%d'`` form.
    TypeError
        If ``date`` is of any other unsupported type.
    """
    # pandas represents a missing CSV cell as float NaN; NaN is the only float
    # that is not equal to itself.
    if date is None or (isinstance(date, float) and date != date):
        return None

    if isinstance(date, str):
        if not date.strip():
            return None
        moment = datetime.datetime.strptime(date, '%Y-%m-%d')
    elif isinstance(date, datetime.datetime):
        moment = date
    elif isinstance(date, datetime.date):
        # Promote a plain date to midnight so it compares with the boundaries.
        moment = datetime.datetime(date.year, date.month, date.day)
    else:
        raise TypeError(
            f"date must be a 'YYYY-MM-DD' string or a date/datetime, got {type(date).__name__}"
        )

    # Half-open interval: a boundary day belongs to the congress that starts on it.
    for congress, start_date, end_date in _CONGRESS_TERMS:
        if start_date <= moment < end_date:
            return congress

    return None

# Load the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Tag every communication with the congress derived from its publication date
raw_communication_data['congress'] = (
    raw_communication_data['pub_date'].apply(determine_congress_by_date)
)

# Define function to get embeddings
def get_embeddings(sentences, model):
    """Encode one sentence or a list of sentences with ``model``.

    Parameters
    ----------
    sentences : str or list of str
        A single sentence (wrapped into a one-element list before encoding)
        or a list whose items are all strings.
    model : object
        Anything exposing ``encode(list_of_str)``.

    Returns
    -------
    Whatever ``model.encode`` returns for the list of sentences.

    Raises
    ------
    ValueError
        If ``sentences`` is neither a string nor a list made only of strings.
    """
    if isinstance(sentences, str):
        return model.encode([sentences])

    is_list_of_str = isinstance(sentences, list) and all(
        isinstance(item, str) for item in sentences
    )
    if not is_list_of_str:
        raise ValueError("Input should be a list of sentences or a single sentence.")

    return model.encode(sentences)

# Encode every communication text and every bill title with the same model
extracted_embeddings = get_embeddings(
    sentences=raw_communication_data['title'].tolist(),
    model=model,
)
existing_embeddings = get_embeddings(
    sentences=raw_bill_data['title'].tolist(),
    model=model,
)

# Function to find similarities
def find_similarities(extracted_embeddings, existing_embeddings, extracted_content, communication_congress, bill_data, threshold=0.60, comm_content_ids=None):
    """List every (communication, bill) pair from the same congress whose
    cosine similarity is strictly greater than ``threshold``.

    Parameters
    ----------
    extracted_embeddings, existing_embeddings : array-like
        Embeddings of the communications and of the bills. Row ``i`` of the
        first belongs to ``extracted_content[i]``; row ``j`` of the second
        belongs to the ``j``-th row (by position) of ``bill_data``.
    extracted_content : list
        Communication texts; only its length/order is used here.
    communication_congress : list
        Congress value of each communication, aligned with ``extracted_content``.
    bill_data : pandas.DataFrame
        Must provide ``'congress'``, ``'bill_number'`` and ``'bill_type'`` columns.
    threshold : float, default 0.60
        Minimum (exclusive) cosine similarity for a pair to be reported.
    comm_content_ids : sequence, optional
        Identifier of each communication, aligned with ``extracted_content``.
        When omitted, the ids are read from the notebook-level
        ``raw_communication_data['comm_content_id']`` as the original code did.

    Returns
    -------
    list of dict
        One dict per matching pair with keys ``comm_content_id``,
        ``bill_number``, ``bill_type``, ``congress`` and ``similarity``, ordered
        by communication and then by bill position.
    """
    if comm_content_ids is None:
        # Backward-compatible fallback for callers that do not pass the ids.
        comm_content_ids = raw_communication_data['comm_content_id'].tolist()
    else:
        comm_content_ids = list(comm_content_ids)

    similarities = cosine_similarity(extracted_embeddings, existing_embeddings)

    # Extract the bill columns once, by position. iterrows() yields index
    # labels, which only coincide with similarity-matrix columns for a default
    # RangeIndex, and it builds a Series per bill on every inner iteration.
    bill_rows = list(zip(
        bill_data['congress'].tolist(),
        bill_data['bill_number'].tolist(),
        bill_data['bill_type'].tolist(),
    ))

    matches = []
    for i, _ in enumerate(extracted_content):
        extracted_congress = communication_congress[i]
        row_scores = similarities[i]
        for j, (congress, bill_number, bill_type) in enumerate(bill_rows):
            if extracted_congress == congress and row_scores[j] > threshold:
                matches.append({
                    'comm_content_id': comm_content_ids[i],
                    'bill_number': bill_number,
                    'bill_type': bill_type,
                    'congress': extracted_congress,
                    'similarity': row_scores[j]
                })

    return matches

# Flatten the communication text and congress columns into plain lists
extracted_content = raw_communication_data['title'].tolist()
communication_congress = raw_communication_data['congress'].tolist()

# Match communications with bills (default similarity threshold)
matches = find_similarities(
    extracted_embeddings,
    existing_embeddings,
    extracted_content,
    communication_congress,
    raw_bill_data,
)

# Turn each match record into a row, rendering the score with two decimals
output_data = [
    [
        record['comm_content_id'],
        record['bill_number'],
        record['bill_type'],
        record['congress'],
        f"{record['similarity']:.2f}",
    ]
    for record in matches
]

# Build the frame and write it out without the index column
output_df = pd.DataFrame(
    output_data,
    columns=["comm_content_id", "bill_number", "bill_type", "congress", "similarity"],
)
output_df.to_csv('matched_communications_with_scores_formated.csv', index=False)


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import datetime

# Read both CSV exports, keeping the identifier columns needed to report
# matches. The communication text column is renamed to 'title' so that both
# frames expose their text under the same column name.
raw_bill_data = pd.read_csv('./raw_bills.csv')[
    ['bill_number', 'bill_type', 'title', 'congress']
]
raw_communication_data = (
    pd.read_csv('./raw_comm.csv')[['comm_content_id', 'content', 'pub_date']]
    .rename(columns={'content': 'title'})
)

# Define function to determine congress by date
# (congress number, first day of the term, first day of the following term).
# Built once at definition time so the boundaries are not re-created and
# re-parsed for every row. Covers congresses 108 (2003-01-03) to 118 (ending
# 2025-01-03), each term starting on January 3 of an odd year.
_CONGRESS_TERMS = tuple(
    (
        108 + offset,
        datetime.datetime(2003 + 2 * offset, 1, 3),
        datetime.datetime(2005 + 2 * offset, 1, 3),
    )
    for offset in range(11)
)


def determine_congress_by_date(date):
    """Return the congress number whose term contains ``date``.

    Parameters
    ----------
    date : str, datetime.datetime, datetime.date, float or None
        A ``'%Y-%m-%d'`` string, or an already-parsed date/datetime object.
        ``None``, NaN and blank strings are treated as "date missing".

    Returns
    -------
    int or None
        The congress number (108-118) for which ``start <= date < end``, or
        ``None`` when the date is missing or falls outside the covered terms.

    Raises
    ------
    ValueError
        If ``date`` is a non-blank string that is not in ``'%Y-%m-%d'`` form.
    TypeError
        If ``date`` is of any other unsupported type.
    """
    # pandas represents a missing CSV cell as float NaN; NaN is the only float
    # that is not equal to itself.
    if date is None or (isinstance(date, float) and date != date):
        return None

    if isinstance(date, str):
        if not date.strip():
            return None
        moment = datetime.datetime.strptime(date, '%Y-%m-%d')
    elif isinstance(date, datetime.datetime):
        moment = date
    elif isinstance(date, datetime.date):
        # Promote a plain date to midnight so it compares with the boundaries.
        moment = datetime.datetime(date.year, date.month, date.day)
    else:
        raise TypeError(
            f"date must be a 'YYYY-MM-DD' string or a date/datetime, got {type(date).__name__}"
        )

    # Half-open interval: a boundary day belongs to the congress that starts on it.
    for congress, start_date, end_date in _CONGRESS_TERMS:
        if start_date <= moment < end_date:
            return congress

    return None

# Load the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Tag every communication with the congress derived from its publication date
raw_communication_data['congress'] = (
    raw_communication_data['pub_date'].apply(determine_congress_by_date)
)

# Define function to get embeddings
def get_embeddings(sentences, model):
    """Encode one sentence or a list of sentences with ``model``.

    Parameters
    ----------
    sentences : str or list of str
        A single sentence (wrapped into a one-element list before encoding)
        or a list whose items are all strings.
    model : object
        Anything exposing ``encode(list_of_str)``.

    Returns
    -------
    Whatever ``model.encode`` returns for the list of sentences.

    Raises
    ------
    ValueError
        If ``sentences`` is neither a string nor a list made only of strings.
    """
    if isinstance(sentences, str):
        return model.encode([sentences])

    is_list_of_str = isinstance(sentences, list) and all(
        isinstance(item, str) for item in sentences
    )
    if not is_list_of_str:
        raise ValueError("Input should be a list of sentences or a single sentence.")

    return model.encode(sentences)

# Encode every communication text and every bill title with the same model
extracted_embeddings = get_embeddings(
    sentences=raw_communication_data['title'].tolist(),
    model=model,
)
existing_embeddings = get_embeddings(
    sentences=raw_bill_data['title'].tolist(),
    model=model,
)

# Function to find similarities
def find_similarities(extracted_embeddings, existing_embeddings, extracted_content, communication_congress, bill_data, threshold=0.60, comm_content_ids=None):
    """Report, for every communication, the same-congress bills whose cosine
    similarity is strictly greater than ``threshold``, or a single
    ``'no match'`` record when there are none.

    Parameters
    ----------
    extracted_embeddings, existing_embeddings : array-like
        Embeddings of the communications and of the bills. Row ``i`` of the
        first belongs to ``extracted_content[i]``; row ``j`` of the second
        belongs to the ``j``-th row (by position) of ``bill_data``.
    extracted_content : list
        Communication texts; only its length/order is used here.
    communication_congress : list
        Congress value of each communication, aligned with ``extracted_content``.
    bill_data : pandas.DataFrame
        Must provide ``'congress'``, ``'bill_number'`` and ``'bill_type'`` columns.
    threshold : float, default 0.60
        Minimum (exclusive) cosine similarity for a pair to be reported.
    comm_content_ids : sequence, optional
        Identifier of each communication, aligned with ``extracted_content``.
        When omitted, the ids are read from the notebook-level
        ``raw_communication_data['comm_content_id']`` as the original code did.

    Returns
    -------
    list of dict
        Dicts with keys ``comm_content_id``, ``bill_number``, ``bill_type``,
        ``congress`` and ``similarity``. A communication with matches
        contributes one dict per bill (in bill order); one without matches
        contributes a single dict whose ``bill_number``, ``bill_type`` and
        ``similarity`` are the string ``'no match'``.
    """
    if comm_content_ids is None:
        # Backward-compatible fallback for callers that do not pass the ids.
        comm_content_ids = raw_communication_data['comm_content_id'].tolist()
    else:
        comm_content_ids = list(comm_content_ids)

    similarities = cosine_similarity(extracted_embeddings, existing_embeddings)

    # Extract the bill columns once, by position. iterrows() yields index
    # labels, which only coincide with similarity-matrix columns for a default
    # RangeIndex, and it builds a Series per bill on every inner iteration.
    bill_rows = list(zip(
        bill_data['congress'].tolist(),
        bill_data['bill_number'].tolist(),
        bill_data['bill_type'].tolist(),
    ))

    matches = []
    for i, _ in enumerate(extracted_content):
        extracted_congress = communication_congress[i]
        comm_content_id = comm_content_ids[i]
        row_scores = similarities[i]

        matched_bills = [
            {
                'comm_content_id': comm_content_id,
                'bill_number': bill_number,
                'bill_type': bill_type,
                'congress': extracted_congress,
                'similarity': row_scores[j]
            }
            for j, (congress, bill_number, bill_type) in enumerate(bill_rows)
            if extracted_congress == congress and row_scores[j] > threshold
        ]

        if matched_bills:
            matches.extend(matched_bills)
        else:
            # Keep unmatched communications visible in the output.
            matches.append({
                'comm_content_id': comm_content_id,
                'bill_number': 'no match',
                'bill_type': 'no match',
                'congress': extracted_congress,
                'similarity': 'no match'
            })

    return matches

# Flatten the communication text and congress columns into plain lists
extracted_content = raw_communication_data['title'].tolist()
communication_congress = raw_communication_data['congress'].tolist()

# Match communications with bills (default similarity threshold)
matches = find_similarities(
    extracted_embeddings,
    existing_embeddings,
    extracted_content,
    communication_congress,
    raw_bill_data,
)

# Turn each match record into a row. Real scores are rendered with two
# decimals; the 'no match' placeholder is passed through unchanged.
output_data = []
for record in matches:
    if record['similarity'] != 'no match':
        score_text = f"{record['similarity']:.2f}"
    else:
        score_text = 'no match'
    output_data.append([
        record['comm_content_id'],
        record['bill_number'],
        record['bill_type'],
        record['congress'],
        score_text,
    ])

# Build the frame and write it out without the index column
output_df = pd.DataFrame(
    output_data,
    columns=["comm_content_id", "bill_number", "bill_type", "congress", "similarity"],
)
output_df.to_csv('matched_communications_with_scores_final.csv', index=False)
