In [None]:
import pandas as pd
# Load the raw export once.
data = pd.read_csv("api_data.csv")
# read_csv already returns a DataFrame, so re-wrapping it in pd.DataFrame() added nothing.
# Take an explicit copy instead: columns added to `df` can then never show up in `data`.
df = data.copy()

def _split_listing(raw):
    """Split a comma-separated cell into a set of trimmed, non-empty tokens.

    Non-string input (e.g. the float NaN pandas uses for a missing cell)
    yields an empty set instead of raising on ``.split``.
    """
    if not isinstance(raw, str):
        return set()
    return {token.strip() for token in raw.split(",") if token.strip()}


def heuristic_score(row):
    """Score how likely the mobile and browser sides of ``row`` belong together.

    Points awarded:
      * 50 if ``browser_device_name`` is listed in ``mobile_bluetooth_devices``
      * 30 if ``mobile_ip`` equals ``browser_ip``
      * 20 if ``mobile_location`` equals ``mobile_region``
      * 5 for every topic present in both ``topics_mobile`` and ``topics_browser``

    Args:
        row: Any mapping-like object indexable by the column names above
            (a pandas row or a plain dict).

    Returns:
        int: The accumulated score. It is not capped, because the topic bonus
        grows with the number of shared topics.
    """
    score = 0

    # Empty or missing cells produce no tokens, so "" can no longer "match" "".
    paired_devices = _split_listing(row["mobile_bluetooth_devices"])
    browser_device = row["browser_device_name"]
    if isinstance(browser_device, str) and browser_device.strip() in paired_devices:
        score += 50

    if row["mobile_ip"] == row["browser_ip"]:
        score += 30

    # NOTE(review): this compares two mobile-side columns; the second cell of this
    # notebook compares mobile_location with browser_location instead. Confirm
    # against the CSV schema which comparison is intended.
    if row["mobile_location"] == row["mobile_region"]:
        score += 20

    shared_topics = _split_listing(row["topics_mobile"]) & _split_listing(row["topics_browser"])
    score += len(shared_topics) * 5

    return score

# Score every row with the heuristic (row-wise apply).
df["heuristic_score"] = df.apply(heuristic_score, axis=1)

# A score of 80 or more is labelled as the same user, anything else as different users.
df["predicted_user"] = df["heuristic_score"].map(
    lambda points: "Same User" if points >= 80 else "Different Users"
)

report_columns = [
    "mobile_location",
    "browser_device_name",
    "heuristic_score",
    "predicted_user",
    "expected_user",
]
print(df[report_columns])

# Fraction of rows whose predicted label equals the expected label.
accuracy = df["predicted_user"].eq(df["expected_user"]).mean()
print(f"\nHeuristic Approach Accuracy: {accuracy:.2f}")

In [None]:
import pandas as pd

def calculate_topic_similarity(mobile_topics, browser_topics):
    """Return the share of topics common to both lists, as a fraction in [0, 1].

    Each argument is a comma-separated string. Topics are trimmed of
    surrounding whitespace and empty entries are dropped, so ``"a, b"`` and
    ``"b,a"`` are treated as the same topic set.

    Args:
        mobile_topics: Comma-separated topics seen on the mobile side.
        browser_topics: Comma-separated topics seen on the browser side.

    Returns:
        float: ``len(common) / max(len(mobile), len(browser))``, or ``0.0`` when
        neither side has any topic. Non-string input (e.g. NaN for a missing
        cell) counts as having no topics.
    """
    def _topic_set(raw):
        # Missing cells arrive as float NaN, which has no .split().
        if not isinstance(raw, str):
            return set()
        return {topic.strip() for topic in raw.split(",") if topic.strip()}

    mobile_set = _topic_set(mobile_topics)
    browser_set = _topic_set(browser_topics)

    larger = max(len(mobile_set), len(browser_set))
    if larger == 0:
        # No topics on either side: nothing in common, and avoid dividing by zero.
        return 0.0
    return len(mobile_set & browser_set) / larger


def _lists_device(device_list, device_name):
    """Return True when ``device_name`` is one of the comma-separated names in ``device_list``."""
    if not isinstance(device_list, str) or not isinstance(device_name, str):
        return False  # a missing cell (NaN) on either side can never match
    wanted = device_name.strip()
    return bool(wanted) and wanted in {name.strip() for name in device_list.split(",")}


# Binary match flags (1 = match, 0 = no match).
# mobile_bluetooth_devices is split on commas by the heuristic in the first cell, i.e.
# it holds a list of names; whole-string equality only matched single-device lists,
# so test membership instead.
data['bluetooth_match'] = [
    int(_lists_device(devices, name))
    for devices, name in zip(data['mobile_bluetooth_devices'], data['browser_device_name'])
]
data['ip_match'] = (data['mobile_ip'] == data['browser_ip']).astype(int)
data['location_match'] = (data['mobile_location'] == data['browser_location']).astype(int)

# Topic overlap per row, as returned by calculate_topic_similarity.
data['topic_similarity'] = [
    calculate_topic_similarity(mobile, browser)
    for mobile, browser in zip(data['mobile_topics'], data['browser_topics'])
]


def calculate_score(row):
    """Combine the four match signals of ``row`` into one weighted total.

    Weights: ``bluetooth_match`` x 50, ``ip_match`` x 30,
    ``location_match`` x 15 and ``topic_similarity`` x 5.
    """
    bluetooth_points = row['bluetooth_match'] * 50
    ip_points = row['ip_match'] * 30
    location_points = row['location_match'] * 15
    topic_points = row['topic_similarity'] * 5
    return bluetooth_points + ip_points + location_points + topic_points

# Weighted match score for every row (weights are defined in calculate_score).
data['match_score'] = data.apply(calculate_score, axis=1)

clusters = []
threshold = 70

# NOTE(review): the score belongs to a single row, so no row is ever compared with
# the members of an existing cluster (the original loop fetched a cluster
# "representative" but never used it). The grouping below is exactly what that loop
# produced: the first row seeds cluster 1, every later row scoring >= threshold joins
# cluster 1, and every other row becomes its own cluster. If real pairwise
# clustering is intended, a row-vs-representative comparison still has to be added.
for _, device in data.iterrows():
    # Reuse the stored score instead of recomputing it once per existing cluster.
    if clusters and device['match_score'] >= threshold:
        clusters[0].append(device['device_id'])
    else:
        clusters.append([device['device_id']])

# Map each device_id to a "user_<n>" label, numbering clusters from 1.
# A device_id that appears in several clusters keeps the label of the last one.
user_assignments = {
    device_id: f"user_{user_id}"
    for user_id, cluster in enumerate(clusters, start=1)
    for device_id in cluster
}

data['user_id'] = data['device_id'].map(user_assignments)


print("\nClustered Data with User IDs:")
print(data[['device_id', 'user_id', 'match_score']])

data.to_csv("clustered_device_data.csv", index=False)
print("\nClustered data saved to 'clustered_device_data.csv'")

import matplotlib.pyplot as plt
import seaborn as sns

# Build the plotting frame in one step. Assigning a column into a column-subset of
# `data` afterwards triggers pandas' SettingWithCopyWarning; .assign returns a new
# frame, so nothing is written into a slice.
clustered_data = data[['device_id', 'user_id', 'match_score']].assign(
    user_id=data['user_id'].astype('category')  # categorical hue: one colour per user
)

# Explicit figure/axes so every call below targets this plot, not whatever is "current".
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='device_id', y='match_score', hue='user_id',
    data=clustered_data, palette='Set1', ax=ax,
)

ax.set_title('Device Match Scores by User Cluster')
ax.set_xlabel('Device ID')
ax.set_ylabel('Match Score')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='User ID', loc='upper right')

fig.tight_layout()
plt.show()