In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load the dataset
df = pd.read_csv('updated_jabodetabeksur_olx_housing_dataset_.csv')

# 2. Preprocess descriptions (lowercase & fillna)
df['description'] = df['description'].fillna('').str.lower()
df['facilities'] = df['facilities'].fillna('').str.lower()
df['combined_text'] = df['description'] + ' ' + df['facilities']

# 3. TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# 4. Cosine similarity calculation
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Function to get top-N similar properties
def export_similar_properties(property_index, top_n=5, file_format='json'):
    sim_scores = list(enumerate(cosine_sim_matrix[property_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in sim_scores[1:top_n+1]]  # skip itself
    
    results = df.iloc[top_indices][['title', 'facilities', 'description']].copy()
    results['similarity_score'] = [sim_scores[i+1][1] for i in range(top_n)]  # add scores
    
    # Export
    if file_format == 'csv':
        results.to_csv(f'similar_listings_{property_index}.csv', index=False)
        print(f'Exported to similar_listings_{property_index}.csv')
    elif file_format == 'json':
        results.to_json(f'similar_listings_{property_index}.json', orient='records', indent=2)
        print(f'Exported to similar_listings_{property_index}.json')
    else:
        print("Unsupported format. Use 'csv' or 'json'.")
    
    return results

# Example usage: get top 5 similar properties to listing at index 0
similar_props = export_similar_properties(4509, top_n=10)
print("Listing:")
print(df.iloc[5000][['title', 'facilities', 'description']])
print("\nTop 5 similar listings:")
print(similar_props[['title', 'facilities', 'description']])


Exported to similar_listings_4509.json
Listing:
title                        Take over/Over Kredit Rumah Kavling
facilities                                                      
description    ['cicilan 2jt flat sisa 54bulan', 'akses jalan...
Name: 5000, dtype: object

Top 5 similar listings:
                                                  title  \
4512        RUMAH SIAP HUNI DI GRAND WISATA KOTA BEKASI   
4564  DI JUAL CEPAT RUMAH BARU RENOVASI DI GRAND WIS...   
4758  Dijual Cepat Rumah Full Renovasi di Grand Wisa...   
4710  Di jual rumah mewah grand wisata kota bekasi a...   
4799  DIJUAL RUMAH FULL RENOV 2 LANTAI 6 KAMAR GRAND...   
4371    DI JUAL RUMAH DALAM CLUSTER GRAND WISATA BEKASI   
4382  DI JUAL RUMAH SIAP HUNI 2 LANTAI DI GRAND WISA...   
4700         Grand Wisata Rumah Siap Huni Di Jual Cepat   
4742  Grand Wisata Rumah Siap Huni Di Jual Cepat 2.5...   
4696         Grand Wisata Rumah Siap Huni Di Jual Cepat   

                                             facilities 

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load the dataset
df = pd.read_csv('housing_with_entities.csv')

# 2. Preprocess descriptions (lowercase & fillna)
df['description'] = df['description'].fillna('').str.lower()
df['facilities'] = df['facilities'].fillna('').str.lower()
df['SCHOOL'] = df['SCHOOL']
df['UNIVERSITY'] = df['UNIVERSITY']
df['HOSPITAL'] = df['HOSPITAL']
df['MALL'] = df['MALL']
df['MARKET'] = df['MARKET']
df['TRANSPORT'] = df['TRANSPORT']
df['WORSHIP'] = df['WORSHIP']
df['combined_text'] = df['description'] + ' ' + df['facilities'] + ' ' + df['SCHOOL'] + ' ' + df['UNIVERSITY'] + ' ' + df ['HOSPITAL'] + ' ' + df['MALL'] + ' ' + df['MARKET'] + ' ' + df['TRANSPORT'] + ' ' + df['WORSHIP']

# 3. TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# 4. Cosine similarity calculation
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Function to get top-N similar properties
def export_similar_properties(property_index, top_n=5, file_format='json'):
    sim_scores = list(enumerate(cosine_sim_matrix[property_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in sim_scores[1:top_n+1]]  # skip itself
    
    results = df.iloc[top_indices][['title', 'facilities', 'description']].copy()
    results['similarity_score'] = [sim_scores[i+1][1] for i in range(top_n)]  # add scores
    
    # Export
    if file_format == 'csv':
        results.to_csv(f'similar_listings_{property_index}.csv', index=False)
        print(f'Exported to similar_listings_{property_index}.csv')
    elif file_format == 'json':
        results.to_json(f'similar_listings_{property_index}.json', orient='records', indent=2)
        print(f'Exported to similar_listings_{property_index}.json')
    else:
        print("Unsupported format. Use 'csv' or 'json'.")
    
    return results

# Example usage: get top 5 similar properties to listing at index 0
similar_props = export_similar_properties(4509, top_n=10)
print("Listing:")
print(df.iloc[5000][['title', 'facilities', 'description']])
print("\nTop 5 similar listings:")
print(similar_props[['title', 'facilities', 'description']])


TypeError: can only concatenate str (not "int") to str

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Load the dataset
df = pd.read_csv('updated_jabodetabeksur_olx_housing_dataset_.csv')

# 2. Preprocess descriptions (lowercase & fillna)
df['description'] = df['description'].fillna('').str.lower()

# 3. TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(df['description'])

# 4. Cosine similarity calculation
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Function to get top-N similar properties
def export_similar_properties(property_index, top_n=5, file_format='json'):
    sim_scores = list(enumerate(cosine_sim_matrix[property_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in sim_scores[1:top_n+1]]  # skip itself
    
    results = df.iloc[top_indices][['title', 'description']].copy()
    results['similarity_score'] = [sim_scores[i+1][1] for i in range(top_n)]  # add scores
    
    # Export
    if file_format == 'csv':
        results.to_csv(f'similar_listings_{property_index}.csv', index=False)
        print(f'Exported to similar_listings_{property_index}.csv')
    elif file_format == 'json':
        results.to_json(f'similar_listings_{property_index}_nofacilities.json', orient='records', indent=2)
        print(f'Exported to similar_listings_{property_index}.json')
    else:
        print("Unsupported format. Use 'csv' or 'json'.")
    
    return results

# Example usage: get top 5 similar properties to listing at index 0
similar_props = export_similar_properties(4509, top_n=10)
print("Listing:")
print(df.iloc[5000][['title', 'description']])
print("\nTop 5 similar listings:")
print(similar_props[['title', 'description']])


Exported to similar_listings_4509.json
Listing:
title                        Take over/Over Kredit Rumah Kavling
description    ['cicilan 2jt flat sisa 54bulan', 'akses jalan...
Name: 5000, dtype: object

Top 5 similar listings:
                                                  title  \
4512        RUMAH SIAP HUNI DI GRAND WISATA KOTA BEKASI   
4758  Dijual Cepat Rumah Full Renovasi di Grand Wisa...   
4564  DI JUAL CEPAT RUMAH BARU RENOVASI DI GRAND WIS...   
4799  DIJUAL RUMAH FULL RENOV 2 LANTAI 6 KAMAR GRAND...   
4710  Di jual rumah mewah grand wisata kota bekasi a...   
4371    DI JUAL RUMAH DALAM CLUSTER GRAND WISATA BEKASI   
4382  DI JUAL RUMAH SIAP HUNI 2 LANTAI DI GRAND WISA...   
4700         Grand Wisata Rumah Siap Huni Di Jual Cepat   
4696         Grand Wisata Rumah Siap Huni Di Jual Cepat   
4695        Grand Wisata Rumah Di Jual Posisi Boulevard   

                                            description  
4512  ['di jual rumah mewah', 'grand wisata cluster ...  
4758 

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import hstack
import numpy as np

# Load both datasets
old_df = pd.read_csv("C:\Users\madea\OneDrive\Documents\Kuliah\Semester 8\Tugas Akhir\Coding\Data Preprocessing\updated_jabodetabeksur_olx_housing_dataset_.csv")
new_df = pd.read_csv("housing_with_entities.csv")

# Preprocess both datasets
def preprocess(df):
    df['description'] = df['description'].fillna('').str.lower()
    df['facilities'] = df['facilities'].fillna('').str.lower()
    df['combined_text'] = df['description'] + ' ' + df['facilities']
    return df

old_df = preprocess(old_df)
new_df = preprocess(new_df)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
old_tfidf = vectorizer.fit_transform(old_df['combined_text'])
new_tfidf = vectorizer.transform(new_df['combined_text'])  # use same vocab

# Normalize entity features for new_df
entity_features = ['SCHOOL', 'UNIVERSITY', 'HOSPITAL', 'MALL', 'MARKET', 'TRANSPORT', 'WORSHIP']
new_structured = new_df[entity_features].fillna(0).astype(int)
scaler = MinMaxScaler()
new_structured_scaled = scaler.fit_transform(new_structured)

# Combine TF-IDF and structured features
from scipy.sparse import csr_matrix
combined_matrix = hstack([new_tfidf, csr_matrix(new_structured_scaled)])

# Compute cosine similarity
old_sim_matrix = cosine_similarity(old_tfidf)
new_sim_matrix = cosine_similarity(combined_matrix)

# Comparison Function
def compare_recommendations(property_index, top_n=5):
    print(f"\n📌 Property Index: {property_index}\n")
    
    # Get similarity scores
    old_scores = list(enumerate(old_sim_matrix[property_index]))
    new_scores = list(enumerate(new_sim_matrix[property_index]))

    # Sort and remove self
    old_top = sorted(old_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
    new_top = sorted(new_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

    # Display comparison
    print("🔷 Top Recommendations from OLD Dataset:")
    for i, score in old_top:
        print(f"- {old_df.iloc[i]['title']} (Score: {score:.4f})")

    print("\n🟢 Top Recommendations from NEW Dataset:")
    for i, score in new_top:
        print(f"- {new_df.iloc[i]['title']} (Score: {score:.4f})")

# Example: Compare top 5 recommendations for property at index 1000
compare_recommendations(property_index=1000, top_n=5)


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1976635297.py, line 9)