In [68]:
import pandas as pd
import numpy as np


In [93]:
amazon_df = pd.read_csv('Amazon.csv', encoding='ISO-8859-1',quotechar='"', escapechar='\\')
google_df = pd.read_csv('GoogleProducts.csv', encoding='ISO-8859-1',quotechar='"', escapechar='\\')
match_df = pd.read_csv('Amzon_GoogleProducts_perfectMapping.csv', encoding='ISO-8859-1',quotechar='"', escapechar='\\')
len(google_df['id'].unique())


3226

In [94]:
# Rename 'name' column in Google data to 'title' before merging
google_df_renamed = google_df.rename(columns={'name': 'title'})

# Rebuild merged table
google_df_renamed['source'] = 'google'
amazon_df['source'] = 'amazon'

# Reapply unified_id mapping to ensure it's consistent
google_df_renamed['unified_id'] = google_df_renamed['id'].map(id_to_rep).fillna(google_df_renamed['id'])
amazon_df['unified_id'] = amazon_df['id'].map(id_to_rep).fillna(amazon_df['id'])

# Merge with renamed columns
combined_df = pd.concat([amazon_df, google_df_renamed], ignore_index=True)

# Deduplicate with preference for Amazon if available
def select_representative(group):
    amazon_rows = group[group['source'] == 'amazon']
    if not amazon_rows.empty:
        return amazon_rows.iloc[0]
    else:
        return group.iloc[0]

deduplicated_df = combined_df.groupby('unified_id', as_index=False).apply(select_representative).reset_index(drop=True)

deduplicated_df.head()


Unnamed: 0,id,title,description,manufacturer,price,source,unified_id
0,1931102953,instant immersion american sign language win/mac,instant immersion asl teaches beginners americ...,topics entertainment,9.99,amazon,1931102953
1,b00001qgvr,art explosion 600 000 images,art explosion 600 000 is an enormous collectio...,nova development,79.99,amazon,b00001qgvr
2,b00001xdw7,jumpstart kindergarten,jumpstart kindergarten is designed to stimulat...,knowledge adventure,19.99,amazon,b00001xdw7
3,b00002cf7w,south park chef's luv shack,chef takes on the role of game show host in a ...,acclaim,39.95,amazon,b00002cf7w
4,b00002jvfk,microsoft entertainment pack ce,- marketing information: the microsoft enterta...,microsoft,49.5,amazon,b00002jvfk


In [95]:
deduplicated_df.to_csv("/Users/yolandazhou/Documents/untitled_folder/CSE_584/lotus/processed_result_3290.csv")

In [70]:
len(amazon_df)

1363

In [80]:
len(google_df)


KeyError: 'idGoogleBase'

In [72]:
len(match_df)

1300

In [73]:
len(match_df['idAmazon'].unique())

1113

In [74]:
google_df.columns

Index(['id', 'name', 'description', 'manufacturer', 'price'], dtype='object')

In [75]:
amazon_df.columns

Index(['id', 'title', 'description', 'manufacturer', 'price'], dtype='object')

# Filter Amazon dataset to include only rows in matching CSV
amazon_df = amazon_df[amazon_df['id'].isin(match_df['idAmazon'].unique())]
amazon_df.to_csv("/Users/yolandazhou/Documents/untitled_folder/CSE_584/lotus/amazon_df.csv")

# Filter Google dataset to include only rows in matching CSV
google_df = google_df[google_df['id'].isin(match_df['idGoogleBase'].unique())]
google_df.to_csv("/Users/yolandazhou/Documents/untitled_folder/CSE_584/lotus/google_df.csv")


In [77]:
len(amazon_df)
len(google_df)

1291

In [78]:
import networkx as nx

# Suppose we have a mapping df_map with columns [idAmazon, idGoogleBase]

# 1) Build a bipartite graph
B = nx.Graph()
amazon_nodes = match_df["idAmazon"].unique()
google_nodes = match_df["idGoogleBase"].unique()

B.add_nodes_from(amazon_nodes, bipartite=0)   # Amazon IDs
B.add_nodes_from(google_nodes, bipartite=1)  # Google IDs

# Add edges
for row in match_df.itertuples():
    B.add_edge(row.idAmazon, row.idGoogleBase)

# 2) Find connected components
connected_components = list(nx.connected_components(B))

# We'll create a dictionary that maps each node to a final representative Amazon ID
representative_map = {}

for component in connected_components:
    # Among the nodes in the connected component, find the Amazon IDs
    amazon_subnodes = [n for n in component if n in amazon_nodes]
    if not amazon_subnodes:
        # This situation is unlikely (would mean a component only of Google IDs),
        # but you could handle it differently if it arises
        continue

    # Pick one Amazon ID from the component to be the representative
    repr_amazon_id = amazon_subnodes[0]
    
    # Assign that representative ID to every node (Amazon or Google) in the component
    for n in component:
        representative_map[n] = repr_amazon_id

# 3) Now “lift” all Amazon and Google IDs to their representative Amazon ID
amazon_df["id"] = amazon_df["id"].map(representative_map)
google_df["id"] = google_df["id"].map(representative_map)

# 4) Rename 'google_df' column to "idAmazon" so both frames share a common ID column
#google_df.rename(columns={"idGoogleBase": "idAmazon"}, inplace=True)

# 5) Finally, union/concatenate
#df_final = pd.concat([amazon_df, google_df], ignore_index=True)
df_final = pd.merge(amazon_df, google_df, on="id", how="outer", suffixes=("_amazon", "_google"))
df_final

Unnamed: 0,id,title,description_amazon,manufacturer_amazon,price_amazon,name,description_google,manufacturer_google,price_google
0,b000jz4hqo,clickart 950 000 - premier image pack (dvd-rom),,broderbund,0.00,clickart 950000 - premier image pack (dvd-rom),massive collection of images & fonts for all y...,,48.95
1,b00004tkvy,noah's ark activity center (jewel case ages 3-8),,victory multimedia,0.00,the beginners bible: noah's ark activity cente...,,,9.95
2,b000g80lqo,peachtree by sage premium accounting for nonpr...,peachtree premium accounting for nonprofits 20...,sage software,599.99,sage (ptree) - vernfp2007rt - premium accounti...,if you're like most nonprofit organizations yo...,,590.35
3,b0006se5bq,singing coach unlimited,singing coach unlimited - electronic learning ...,carry-a-tune technologies,99.99,singing coach unlimited - electronic learning ...,learn to sing with the help of a patented real...,,82.5
4,b00021xhzw,adobe after effects professional 6.5 upgrade f...,upgrade only; installation of after effects st...,adobe,499.99,adobe software 22070152 after effects 6.5 pbupgrd,adobe after effects pb 6.5 win upgrade.standar...,,507
...,...,...,...,...,...,...,...,...,...
1295,b00005bigp,shapes,,school zone,9.99,school zone interactive shapes on track software,shapes challenges children to identify and cre...,,9.45
1296,b000h1df7w,dragon naturally speaking standard v9,dragon naturallyspeaking 9 (standard edition) ...,nuance communications inc.,99.99,nuance communications inc. dragon ns standard v9,dragon naturallyspeaking 9 allows people to pe...,,92.51
1297,b000p9cr66,mediarecover,mediarecover gives you the ability to recover ...,aladdin systems,29.99,allume systems inc mediarecover,mediarecover retrieves your lost photos audio ...,,26.14
1298,b000p9cr66,mediarecover,mediarecover gives you the ability to recover ...,aladdin systems,29.99,allume mediarecover,system requirements microsoft windows xp/vista...,,32.99


In [79]:
df_final.to_csv("/Users/yolandazhou/Documents/untitled_folder/CSE_584/lotus/processed_result_1300.csv")