In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv('../Data_Scraping/Data/Gurgaon/Societies/society.csv')

In [4]:
df.shape

(247, 7)

In [5]:
df.sample(1)

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
125,Vatika Seven Elements,"2, 3, 4 BHK Apartment in Sector 89 A, Gurgaon","['Pataudi Road', 'Dwarka Expy', 'Newtown Squar...","{'Pataudi Road': '400 Meter', 'Dwarka Expy': '...",https://www.99acres.com/vatika-seven-elements-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Valet Parking', 'Food Court', 'Swimming Pool..."


In [6]:
df.iloc[2].NearbyLocations

"['AIPL Business Club Sector 62', 'Heritage Xperiential Learning School', 'CK Birla Hospital', 'Paras Trinity Mall Sector 63', 'Rapid Metro Station Sector 56']"

In [7]:
df.iloc[2].LocationAdvantages

"{'AIPL Business Club Sector 62': '2.7 Km', 'Heritage Xperiential Learning School': '2 Km', 'CK Birla Hospital': '2.5 Km', 'Paras Trinity Mall Sector 63': '3.5 Km', 'Rapid Metro Station Sector 56': '3.8 Km', 'De Adventure Park': '6.8 Km', 'Golf Course Ext Rd': '99 Meter', 'DoubleTree by Hilton Hotel Gurgaon': '3.6 Km', 'KIIT College of Engineering Sohna Road': '8.4 Km', 'Mehrauli-Gurgaon Road': '11.8 Km', 'Indira Gandhi International Airport': '21.1 Km', 'Nirvana Rd': '160 Meter', 'TERI Golf Course': '8.7 Km'}"

In [8]:
df.iloc[1].PriceDetails

"{'3 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '1,605 - 2,170 sq.ft.', 'price-range': '₹ 2.2 - 3.03 Cr'}, '4 BHK': {'building_type': 'Apartment', 'area_type': 'Super Built-up Area', 'area': '2,248 - 2,670 sq.ft.', 'price-range': '₹ 3.08 - 3.73 Cr'}}"

In [9]:
df[['PropertyName', 'TopFacilities']]['TopFacilities'][0]

"['Swimming Pool', 'Salon', 'Restaurant', 'Spa', 'Cafeteria', 'Sun Deck', '24x7 Security', 'Club House', 'Gated Community']"

In [10]:
def extract_list(s):
    """Parse a stringified Python list such as "['Pool', 'Spa']" into a list of strings.

    The text is first evaluated as a Python literal, which handles items that
    contain apostrophes (these are written with double quotes, e.g.
    "Children's Play Area") and items that contain commas. Text that is not a
    valid list/tuple literal falls back to collecting every single-quoted
    fragment with a regular expression.

    Parameters
    ----------
    s : str
        Text holding a list literal. Non-string values (e.g. NaN for a missing
        cell) are treated as "no items".

    Returns
    -------
    list of str
        The parsed items, in their original order; ``[]`` when nothing is found.
    """
    import ast  # imported locally so the function is self-contained

    if not isinstance(s, str):
        # A missing cell should contribute no items rather than raise TypeError.
        return []

    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]

    # Not a list literal: keep the quote-matching behaviour as a best effort.
    return re.findall(r"'(.*?)'", s)

df['TopFacilities'] = df['TopFacilities'].apply(extract_list)

In [11]:
df['FacilitiesStr'] = df['TopFacilities'].apply(' '.join)

In [12]:
df['FacilitiesStr'][0]

'Swimming Pool Salon Restaurant Spa Cafeteria Sun Deck 24x7 Security Club House Gated Community'

In [13]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

In [14]:
tfidf_matrix = tfidf_vectorizer.fit_transform(df['FacilitiesStr'])

In [15]:
tfidf_matrix.toarray()[0]

array([0.        , 0.        , 0.        , 0.1881095 , 0.1881095 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [16]:
cosine_sim1 = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [17]:
cosine_sim1 = pd.DataFrame(cosine_sim1)

In [18]:
cosine_sim1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,237,238,239,240,241,242,243,244,245,246
0,1.000000,0.011003,0.000000,0.010892,0.036872,0.096109,0.013048,0.117032,0.085912,0.183894,...,0.244485,0.031123,0.027175,0.014947,0.088334,0.035424,0.195297,0.011888,0.086677,0.011124
1,0.011003,1.000000,0.019855,0.129645,0.159041,0.019323,0.217126,0.008440,0.041371,0.080002,...,0.010039,0.029512,0.028553,0.087772,0.018442,0.009756,0.022372,0.119139,0.015627,0.009684
2,0.000000,0.019855,1.000000,0.012230,0.010525,0.096805,0.026423,0.104558,0.076642,0.028967,...,0.012949,0.058460,0.014037,0.012622,0.019822,0.057596,0.010714,0.070281,0.038281,0.019641
3,0.010892,0.129645,0.012230,1.000000,0.010052,0.077405,0.087486,0.063468,0.071565,0.201630,...,0.027011,0.042006,0.166237,0.019403,0.102801,0.036337,0.010689,0.118387,0.086986,0.158206
4,0.036872,0.159041,0.010525,0.010052,1.000000,0.073609,0.095324,0.008948,0.112626,0.108730,...,0.010643,0.010172,0.131065,0.290720,0.009776,0.065283,0.011859,0.030785,0.016567,0.010266
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,0.035424,0.009756,0.057596,0.036337,0.065283,0.000000,0.084938,0.075875,0.010360,0.058313,...,0.075770,0.094456,0.032017,0.051370,0.059543,1.000000,0.060398,0.035031,0.074889,0.086383
243,0.195297,0.022372,0.010714,0.010689,0.011859,0.000000,0.013265,0.153088,0.011879,0.023385,...,0.298856,0.018884,0.011471,0.000000,0.534106,0.060398,1.000000,0.000000,0.000000,0.010917
244,0.011888,0.119139,0.070281,0.118387,0.030785,0.000000,0.178157,0.028049,0.120766,0.094061,...,0.018150,0.165701,0.050704,0.185318,0.006199,0.035031,0.000000,1.000000,0.098420,0.032618
245,0.086677,0.015627,0.038281,0.086986,0.016567,0.045070,0.033571,0.013771,0.080129,0.149823,...,0.074179,0.261743,0.065871,0.038439,0.067542,0.074889,0.000000,0.098420,1.000000,0.062688


In [19]:
# Trim the facilities-similarity frame by removing row and column label 246
# (the last property), leaving labels 0..245 on both axes.
# NOTE(review): a later cell removes row 22 from `df` instead of row 246, so from
# position 22 onward this matrix may describe a different property than
# cosine_sim2 / cosine_sim3 when the three are summed — confirm which row is
# meant to be excluded.
cosine_sim1 = cosine_sim1.drop(index=[246], columns=[246])

In [20]:
cosine_sim1.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
def recommend_properties(property_name, cosine_sim=None, properties=None, top_n=5):
    """Return the properties most similar to ``property_name``.

    Parameters
    ----------
    property_name : str
        Value to look up in the ``PropertyName`` column.
    cosine_sim : array-like or DataFrame, optional
        Square similarity matrix whose i-th row corresponds to the i-th row of
        ``properties``. When omitted, the notebook-level ``cosine_sim1`` is
        looked up at call time (so a recomputed matrix is picked up).
    properties : DataFrame, optional
        Frame with a ``PropertyName`` column. When omitted, the notebook-level
        ``df`` is used.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    DataFrame
        Columns ``PropertyName`` and ``SimilarityScore`` (a plain float per
        row), ordered from most to least similar and indexed by the row labels
        of ``properties``.

    Raises
    ------
    IndexError
        If ``property_name`` is not present, or its row lies outside the
        similarity matrix.
    """
    if cosine_sim is None:
        cosine_sim = cosine_sim1
    if properties is None:
        properties = df

    # Work positionally so ndarray and DataFrame matrices behave the same.
    similarity = np.asarray(cosine_sim)
    names = properties['PropertyName']

    matches = np.flatnonzero(names.to_numpy() == property_name)
    if matches.size == 0:
        raise IndexError(f"No property named {property_name!r}")
    position = int(matches[0])
    if position >= similarity.shape[0]:
        raise IndexError(
            f"Property {property_name!r} (row {position}) is not covered by the similarity matrix"
        )

    scores = similarity[position]

    # Stable descending order; the query itself is removed by position rather
    # than by assuming it sorts first (ties with identical rows would break that).
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != position][:top_n]

    recommendations_df = pd.DataFrame({
        'PropertyName': names.iloc[ranked],
        'SimilarityScore': scores[ranked],
    })

    return recommendations_df

In [None]:
recommend_properties("DLF The Arbour")

In [None]:
df[['PropertyName', 'PriceDetails']]

In [None]:
# Remove the property at index label 22 before the price and location features
# are built. Rebinding with errors='ignore' (instead of inplace=True) keeps the
# cell safe to re-run: a second execution is a no-op rather than a KeyError for
# the already-missing label.
df = df.drop(index=[22], errors='ignore')

In [None]:
# Making recommendations on PriceDetails

import pandas as pd
import json

# Load the dataset
df_appartments = df.copy()

# Function to parse and extract the required features from the PriceDetails column
def _parse_area_sqft(token):
    """Convert one area token ('1,605 ' or ' 2,170 sq.ft.') to a float, or None if not numeric."""
    try:
        return float(token.replace(',', '').replace(' sq.ft.', '').strip())
    except ValueError:
        return None


def _split_price(token):
    """Split one price token ('₹ 80 L', ' 3.03 Cr', '₹ 2.2 ') into ``(number, unit)``.

    ``unit`` is 'Cr', 'L' or None when the token carries no unit suffix;
    ``number`` is None when the remaining text is not numeric.
    """
    text = token.replace('₹', '').replace(',', '').strip()
    unit = None
    for suffix in ('Cr', 'L'):
        if text.endswith(suffix):
            unit = suffix
            text = text[:-len(suffix)].strip()
            break
    try:
        return float(text), unit
    except ValueError:
        return None, unit


def _to_crore(number, unit):
    """Express a price in crore: lakh amounts are divided by 100, anything else is kept as is."""
    if number is None:
        return None
    return number / 100 if unit == 'L' else number


def refined_parse_modified_v2(detail_str):
    """Flatten one stringified ``PriceDetails`` dict into per-configuration features.

    Parameters
    ----------
    detail_str : str
        Text of a dict mapping a configuration name (e.g. '3 BHK') to a dict
        with the keys 'building_type', 'area' and 'price-range'.

    Returns
    -------
    dict
        For every configuration found: ``'building type_<cfg>'``,
        ``'area low <cfg>'`` / ``'area high <cfg>'`` (floats parsed from the
        'area' text, None when not parseable as 'sq.ft.' numbers) and
        ``'price low <cfg>'`` / ``'price high <cfg>'`` (floats in crore, None
        when not parseable). An empty dict is returned when ``detail_str`` is
        not a string or cannot be parsed as a dict.

    Notes
    -----
    * The text is parsed as a Python literal first, so values containing
      apostrophes survive; the quote-swapping JSON route is kept as a fallback.
    * In a range such as '₹ 50 - 90 L' the low end carries no unit of its own
      and inherits the unit of the high end.
    * A single value (no '-') is used for both the low and the high bound, for
      prices as well as areas.
    """
    import ast  # imported locally so the function is self-contained

    if not isinstance(detail_str, str):
        return {}

    try:
        details = ast.literal_eval(detail_str)
    except (ValueError, SyntaxError, TypeError):
        try:
            details = json.loads(detail_str.replace("'", "\""))
        except ValueError:  # json.JSONDecodeError is a ValueError
            return {}
    if not isinstance(details, dict):
        return {}

    extracted = {}
    for bhk, detail in details.items():
        if not isinstance(detail, dict):
            continue

        # Extract building type
        extracted[f'building type_{bhk}'] = detail.get('building_type')

        # Parsing area details: 'X sq.ft.' or 'X - Y sq.ft.'
        area_parts = str(detail.get('area') or '').split('-')
        if len(area_parts) in (1, 2):
            area_low = _parse_area_sqft(area_parts[0])
            area_high = _parse_area_sqft(area_parts[-1])
            if area_low is None or area_high is None:
                # Keep the pair consistent: one bad bound invalidates both.
                area_low = area_high = None
            extracted[f'area low {bhk}'] = area_low
            extracted[f'area high {bhk}'] = area_high

        # Parsing price details: '₹ X Cr', '₹ X - Y Cr', '₹ X L - Y Cr', ...
        price_parts = str(detail.get('price-range') or '').split('-')
        if len(price_parts) in (1, 2):
            low_number, low_unit = _split_price(price_parts[0])
            high_number, high_unit = _split_price(price_parts[-1])
            price_low = _to_crore(low_number, low_unit or high_unit)
            price_high = _to_crore(high_number, high_unit)
            if price_low is None or price_high is None:
                price_low = price_high = None
            extracted[f'price low {bhk}'] = price_low
            extracted[f'price high {bhk}'] = price_high

    return extracted
# Turn every society into one flat record: the parsed PriceDetails features are
# laid out under a fixed set of configuration columns, so each record has the
# same keys whether or not the society offers that configuration (missing ones
# stay None).
data_refined = []

for _, row in df_appartments.iterrows():
    features = refined_parse_modified_v2(row['PriceDetails'])

    # Start the record with the property's name, then add five columns per configuration
    new_row = {'PropertyName': row['PropertyName']}
    for config in ['1 BHK', '2 BHK', '3 BHK', '4 BHK', '5 BHK', '6 BHK', '1 RK', 'Land']:
        new_row.update({
            f'building type_{config}': features.get(f'building type_{config}'),
            f'area low {config}': features.get(f'area low {config}'),
            f'area high {config}': features.get(f'area high {config}'),
            f'price low {config}': features.get(f'price low {config}'),
            f'price high {config}': features.get(f'price high {config}'),
        })

    data_refined.append(new_row)

# One row per property, keyed by its name
df_final_refined_v2 = pd.DataFrame(data_refined).set_index('PropertyName')

In [None]:
# Where the Land building type was recorded as an empty string, label it 'Land';
# every other value in the column (including missing ones) is left untouched.
df_final_refined_v2['building type_Land'] = df_final_refined_v2['building type_Land'].replace({'':'Land'})

In [None]:
df['PriceDetails'][10]

In [None]:
df_final_refined_v2.iloc[10]

In [None]:
categorical_columns = df_final_refined_v2.select_dtypes(include=['object']).columns.tolist()

In [None]:
categorical_columns

In [None]:
ohe_df = pd.get_dummies(df_final_refined_v2, columns=categorical_columns, drop_first=True)

In [None]:
ohe_df

In [None]:
# Replace every remaining NaN with 0, in place. The area/price columns of a
# configuration that was not parsed for a property therefore read as 0.
ohe_df.fillna(0, inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
ohe_df_normalized = pd.DataFrame(scaler.fit_transform(ohe_df), columns=ohe_df.columns, index=ohe_df.index)

In [None]:
ohe_df_normalized.head()

In [None]:
# Compute the cosine similarity matrix
cosine_sim2 = cosine_similarity(ohe_df_normalized)

In [None]:
cosine_sim2.shape

In [None]:
def recommend_properties_with_scores(property_name, top_n=247, similarity=None, names=None):
    """Rank properties by price/area similarity to ``property_name``.

    Parameters
    ----------
    property_name : str
        Name of the query property.
    top_n : int, default 247
        Maximum number of recommendations to return.
    similarity : array-like, optional
        Square similarity matrix ordered like ``names``. Defaults to the
        notebook-level ``cosine_sim2``.
    names : sequence of str, optional
        Property names in matrix order. Defaults to
        ``ohe_df_normalized.index``.

    Returns
    -------
    DataFrame
        Columns ``PropertyName`` and ``SimilarityScore``, most similar first.
        The query property itself is never included.

    Raises
    ------
    KeyError
        If ``property_name`` is not among ``names``.

    Notes
    -----
    When a name occurs more than once, its first occurrence is used as the
    query row.
    """
    if similarity is None:
        similarity = cosine_sim2
    if names is None:
        names = ohe_df_normalized.index

    labels = pd.Index(names)
    matches = np.flatnonzero(labels == property_name)
    if matches.size == 0:
        raise KeyError(property_name)
    position = int(matches[0])

    scores = np.asarray(similarity)[position]

    # Stable descending order; drop the query by position instead of assuming
    # it sorts first (another row with an identical score could precede it).
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != position][:top_n]

    recommendations_df = pd.DataFrame({
        'PropertyName': labels[ranked].tolist(),
        'SimilarityScore': scores[ranked].tolist()
    })

    return recommendations_df


recommend_properties_with_scores('M3M Golf Hills')

In [None]:
def distance_to_meters(distance_str):
    """Convert a distance label such as '2.7 Km' or '99 Meter' to meters.

    The number is taken from the first whitespace-separated token (thousands
    separators are ignored); the unit is detected case-insensitively anywhere
    in the text.

    Parameters
    ----------
    distance_str : str
        Distance text. Non-string values (e.g. NaN or None) are accepted.

    Returns
    -------
    float or None
        The distance in meters, or None when the value is not a string, has no
        recognised unit, or does not start with a number.
    """
    if not isinstance(distance_str, str):
        return None

    lowered = distance_str.lower()
    # Test kilometres first: 'kilometer' also contains 'meter'.
    if 'km' in lowered or 'kilomet' in lowered:
        factor = 1000
    elif 'meter' in lowered or 'metre' in lowered:
        factor = 1
    else:
        return None

    try:
        return float(distance_str.split()[0].replace(',', '')) * factor
    except (IndexError, ValueError):
        # No leading token, or the leading token is not a number.
        return None

In [None]:
import ast

# For every property, evaluate its stringified LocationAdvantages dict and
# convert each landmark's distance text with distance_to_meters.
location_matrix = {}
for index, row in df.iterrows():
    advantages = ast.literal_eval(row['LocationAdvantages'])
    location_matrix[index] = {
        location: distance_to_meters(distance)
        for location, distance in advantages.items()
    }

# One row per property label, one column per landmark seen anywhere
location_df = pd.DataFrame.from_dict(location_matrix, orient='index')

# Preview the first rows
location_df.head()

In [None]:
location_df.columns[10:50]

In [None]:
location_df.index = df.PropertyName

In [None]:
location_df.head()

In [None]:
# Fill every missing distance with 54000, in place. A value is missing when the
# landmark is not listed for that property or its text could not be converted.
# NOTE(review): 54000 presumably stands for 54 km (distance_to_meters returns
# meters), i.e. "far away" — confirm the choice of sentinel.
location_df.fillna(54000, inplace=True)

In [None]:
location_df

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Apply the scaler to the entire dataframe
location_df_normalized = pd.DataFrame(scaler.fit_transform(location_df), columns=location_df.columns, index=location_df.index)

In [None]:
location_df_normalized

In [None]:
cosine_sim3 = cosine_similarity(location_df_normalized)

In [None]:
def recommend_properties_with_scores(property_name, top_n=246, weights=(30, 20, 8),
                                     matrices=None, names=None):
    """Rank properties by a weighted sum of several similarity matrices.

    Parameters
    ----------
    property_name : str
        Name of the query property.
    top_n : int, default 246
        Maximum number of recommendations to return.
    weights : sequence of numbers, default (30, 20, 8)
        One multiplier per matrix, in the same order as ``matrices``.
    matrices : sequence of array-like, optional
        Square similarity matrices (ndarray or DataFrame), all ordered like
        ``names``. Defaults to the notebook-level
        ``(cosine_sim1, cosine_sim2, cosine_sim3)``.
    names : sequence of str, optional
        Property names in matrix order. Defaults to
        ``location_df_normalized.index``.

    Returns
    -------
    DataFrame
        Columns ``PropertyName`` and ``SimilarityScore`` (the weighted sum),
        highest score first. The query property itself is never included.

    Raises
    ------
    KeyError
        If ``property_name`` is not among ``names``.
    ValueError
        If ``weights`` and ``matrices`` differ in length.

    Notes
    -----
    Rows are addressed by position, so every matrix must list the properties
    in the same order as ``names``.
    NOTE(review): ``cosine_sim1`` is trimmed by dropping label 246 while ``df``
    drops row 22 before the other two matrices are built — confirm the three
    defaults really share one row order.
    When a name occurs more than once, its first occurrence is used.
    """
    if matrices is None:
        matrices = (cosine_sim1, cosine_sim2, cosine_sim3)
    if names is None:
        names = location_df_normalized.index
    if len(weights) != len(matrices):
        raise ValueError("weights and matrices must have the same length")

    labels = pd.Index(names)
    matches = np.flatnonzero(labels == property_name)
    if matches.size == 0:
        raise KeyError(property_name)
    position = int(matches[0])

    # Only the query row is needed, so combine rows instead of whole matrices.
    # np.asarray makes DataFrame inputs positional like the ndarray ones.
    combined = sum(
        weight * np.asarray(matrix)[position]
        for weight, matrix in zip(weights, matrices)
    )

    # Stable descending order; drop the query by position instead of assuming
    # it sorts first.
    ranked = np.argsort(-combined, kind='stable')
    ranked = ranked[ranked != position][:top_n]

    # Create a dataframe with the results
    recommendations_df = pd.DataFrame({
        'PropertyName': labels[ranked].tolist(),
        'SimilarityScore': combined[ranked].tolist()
    })

    return recommendations_df

# Test the recommender function using a property name
recommend_properties_with_scores('Ireo Victory Valley')