<a href="https://colab.research.google.com/github/brdeleon/predicting_property_clicks/blob/main/property_click_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [None]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.4/572.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [None]:
pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import warnings
warnings.filterwarnings("ignore")

import joblib as jb
from joblib import dump, load

import json
import shap
import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import kaleido
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split

from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE, RFECV

from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour 
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

from google.colab import files, drive
drive.mount('/content/drive')

seed = 42

Mounted at /content/drive


#Functions

In [None]:
# defining an evaluation classification function for automation and evaluating subsequent models
def evaluate_classification(model, X_train, X_test, y_train, y_test, classes=None, label=''):
    # retrieve predictions for train and validation data 
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # retrieve probabilites for train and validation data 
    y_hat_train = model.predict_proba(X_train)
    y_hat_test = model.predict_proba(X_test)
    
    # retrieve probabilities for the positive class
    pos_probs_train = y_hat_train[:, 1]
    pos_probs_test = y_hat_test[:, 1]

    # save the trained model
    model_filename = f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl"
    dump(model, model_filename)
    
    # print training classification report 
    header = label + " Classification Report - Train"
    dashes = "---" * 20
    print(dashes, header, dashes, sep='\n')
    print(classification_report(y_train, y_pred_train, target_names=classes, digits=4))
    
    # display training figures 
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16,4))
    # adjust spacing between subplots
    fig.subplots_adjust(wspace=0.3, hspace=0.5) 
    
    # Plot a confusion matrix on the train data
    ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_train, y=y_train, display_labels=classes, ax=axes[0])
    axes[0].set(title='Confusion Matrix - Train')
    axes[0].set_xlabel('Predicted Label')
    axes[0].set_ylabel('True Label')
    
    # plot ROC curve 
    RocCurveDisplay.from_estimator(model, X_train, y_train, name=label, ax=axes[1])
    roc = axes[1]
    roc.plot([0,1], [0,1], ls=':', label='No Skill')
    roc.grid()
    roc.set_title('Receiving Operator Characteristic - Train')
    
    # plot Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_train, y_train, ax=axes[2], name=label)
    # y axis is Precision
    axes[2].set_ylabel('Precision')
    # x axis is Recall
    axes[2].set_xlabel('Recall')
    axes[2].set_title('Precision-Recall AUC - Train')

    fig.tight_layout()
    plt.savefig(f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Model_Evaluation_Train.png")
    plt.show();

    # print test classification report 
    header_ = label + " Classification Report - Test"
    print(dashes, header_, dashes, sep='\n')
    print(classification_report(y_test, y_pred_test, target_names=classes, digits=4))


    # display test figures 
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16,4))
    # adjust spacing between subplots
    fig.subplots_adjust(wspace=0.3, hspace=0.5) 
    
    # Plot a confusion matrix on the test data
    ConfusionMatrixDisplay.from_estimator(estimator=model, X=X_test, y=y_test, display_labels=classes, ax=axes[0])
    axes[0].set(title='Confusion Matrix - Test')
    axes[0].set_xlabel('Predicted Label')
    axes[0].set_ylabel('True Label')
    
    # plot ROC curve 
    RocCurveDisplay.from_estimator(model, X_test, y_test, name=label, ax=axes[1])
    axes[1].plot([0,1], [0,1], ls=':', label='No Skill')
    axes[1].grid()
    axes[1].set_title('Receiving Operator Characteristic - Test')
    
    # plot Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(model, X_test, y_test, ax=axes[2], name=label)
    # y axis is Precision
    axes[2].set_ylabel('Precision')
    # x axis is Recall
    axes[2].set_xlabel('Recall')
    axes[2].set_title('Precision-Recall AUC - Test')
    
    plt.legend()
    fig.tight_layout()
    plt.savefig(f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Model_Evaluation_Test.png")
    plt.show();

In [None]:
def evaluate_algorithms(X_train, X_test, y_train, y_test, random_state=seed, sampling=None):
    """
    Evaluate different sampling methods with a given classifier on the given data.
    
    Parameters:
        - algorithms: a list of tuples, each containing a string name for the algorithm 
          and the sampler instance.
        - X_train: the training set features.
        - X_val: the validation set features.
        - y_train: the training set target.
        - y_val: the validation set target.
        - classifier: the classifier pipeline to use.
        - random_state: the random state for the classifier.
    """

    # defining the algorithms
    algorithms = [
    ('Dummy Classifier', DummyClassifier(strategy='most_frequent')),
    ('Logistic Regression', LogisticRegression(random_state=seed)),
    # ('K Nearest Neighbors', KNeighborsClassifier()),
    ('Bagging Classifier', BaggingClassifier(random_state=seed)),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=seed)),
    ('Random Forest Classifier', RandomForestClassifier(random_state=seed)),
    ('Extra Trees Classifier', ExtraTreesClassifier(random_state=seed)),
    ('AdaBoost Classifier', AdaBoostClassifier(random_state=seed)),
    ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=seed))
]

    for name, algorithm in algorithms:
        # Fit the pipeline on the training data
        algorithm.fit(X_train, y_train)

        # # Predict the classes on the validation data
        # y_pred_val = pipe.predict(X_test)

        if sampling:
          # Evaluate the pipeline on the validation data
          evaluate_classification(algorithm, X_train, X_test, y_train, y_test, label=f"{sampling} {name}")
        else:
          evaluate_classification(algorithm, X_train, X_test, y_train, y_test, label=f"{name}")

In [None]:
def plot_feature_importances_plotly(X_train, y_train, label=None):
    # Load pickled pipeline model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")

    # Pull feature importances from the trained model
    importances = model.feature_importances_

    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Create a horizontal bar chart using Plotly with the most important features listed at the top
    fig = go.Figure(go.Bar(
                x=importances[indices],
                y=[X_train.columns[i] for i in indices],
                orientation='h',
                marker=dict(color=importances[indices],
                            colorbar=dict(title='Importance'))
                ))
    fig.update_layout(title=f"Feature Importances - {label}",
                      xaxis_title='Importance',
                      yaxis_title='',
                      width=1600,
                      height=1100)

    # Save the figure as png
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Feature_Importances.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Feature_Importances.html", auto_open=True)

    # Show the plot
    fig.show()


In [None]:
def plot_coefficients_plotly(X_train, y_train, label=None):
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")
    
    # Get the coefficients from the model and sort them by value
    # Create a dataframe with the names and values of the coefficients
    coefficients = pd.DataFrame({'Coefficient Name': X_train.columns, 'Coefficient Value': model.coef_.ravel().flatten()})
    
    # Keep only the features with non-zero coefficients
    coefficients = coefficients[coefficients['Coefficient Value'] != 0]
    
    # Sort the coefficients by value in descending order
    coefficients = coefficients.sort_values(by='Coefficient Value', ascending=False)

    # Create a horizontal bar chart using Plotly with a bigger figure size
    fig = go.Figure(go.Bar(
                x=coefficients['Coefficient Value'],
                y=coefficients['Coefficient Name'],
                orientation='h'))
    fig.update_layout(title=f"{label} Coefficients",
                      xaxis_title='Coefficient Value',
                      yaxis_title='',
                      width=1600,
                      height=1100)
    
    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Coefficients.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Coefficients.html", auto_open=True)

    # Show the plot
    fig.show()


In [None]:
def get_shapley_values_tree_plotly_violin(X_test, label=None, seed=42):
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")
    
    # Create a Shapley explainer object using the TreeExplainer class
    explainer = shap.TreeExplainer(model)
    
    # Calculate the sample size that is equal to 10% X_test
    sample_size = int(len(X_test) * 0.10)

    # Randomly sample 10% of X_test
    sample = X_test.sample(n=sample_size, random_state=seed)

    # Compute the Shapley values for your testing data using the shap_values() function
    shap_values = explainer.shap_values(sample)

    # Sort the features by descending mean Shapley value
    feature_order = np.argsort(np.abs(shap_values[1]).mean(axis=0))[::-1]

    # Create a violin plot of the Shapley values using Plotly
    fig = go.Figure()

    for i in feature_order:
        fig.add_trace(go.Violin(y=[sample.columns[i]] * len(shap_values[1][:, i]),
                                x=shap_values[1][:, i],
                                box_visible=True,
                                line_color='blue',
                                meanline_visible=True,
                                fillcolor='lightseagreen',
                                opacity=0.6,
                                x0=i,
                                y0=sample.columns[i],
                                name=sample.columns[i],
                                orientation='h'))

    # Customize the appearance of the plot
    fig.update_layout(title="SHAP Summary Plot", xaxis_title="Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=list(sample.columns[feature_order]),
                                 ticktext=list(sample.columns[feature_order]),
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))

    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Violin.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Violin.html", auto_open=True)

    fig.show()


In [None]:
def get_shapley_values_tree_plotly_scatter(X_test, label=None):
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")
    
    # Create a Shapley explainer object using the TreeExplainer class
    explainer = shap.TreeExplainer(model)
    
    # Calculate the sample size that is equal to 10% X_test
    sample_size = int(len(X_test) * 0.10)

    # Randomly sample 10% of X_test
    sample = X_test.sample(n=sample_size, random_state=seed)

    # Compute the Shapley values for your testing data using the shap_values() function
    shap_values = explainer.shap_values(sample)

    # Get the mean absolute Shapley values for each feature
    mean_shap = np.abs(shap_values[1]).mean(axis=0)

    # Sort the features by ascending mean Shapley value
    feature_order = np.argsort(mean_shap)

    # Add more spacing between features by increasing the jitter
    jitter = np.random.normal(0, 0.02, size=shap_values[1].shape)

    # Create a jittered dot cluster plot of the Shapley values using Plotly
    fig = go.Figure()
    for i in feature_order:
        # Create a scatter plot for each feature in the sample
        # The x-axis shows the Shapley values for each row with added jitter
        # The y-axis shows the feature name
        # Each marker's color represents the value of the Shapley value for the corresponding row and feature
        fig.add_trace(go.Scatter(x=shap_values[1][:, i] + jitter[:, i], y=[sample.columns[i]] * len(sample),
                                 mode='markers', marker=dict(color=shap_values[1][:, i], colorscale='RdBu_r'),
                                 showlegend=False, name=sample.columns[i]))

    # Untilt y-axis ticks and make figure size larger
    fig.update_layout(title="Shapley Values Summary Plot", xaxis_title="Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=list(sample.columns[feature_order]),
                                 ticktext=list(sample.columns[feature_order]),
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))
    
    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Scatter.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Scatter.html", auto_open=True)

    fig.show()


In [None]:
def get_shapley_values_tree_plotly_bar(X_test, label=None, seed=42):
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")
    
    # Create a Shapley explainer object using the TreeExplainer class
    explainer = shap.TreeExplainer(model)
    
    # Calculate the sample size that is equal to 10% X_test
    sample_size = int(len(X_test) * 0.10)

    # Randomly sample 10% of X_test
    sample = X_test.sample(n=sample_size, random_state=seed)

    # Compute the Shapley values for your testing data using the shap_values() function
    shap_values = explainer.shap_values(sample)

    # Get the mean absolute Shapley values for each feature
    mean_shap = np.abs(shap_values[1]).mean(axis=0)

    # Filter out features with zero mean absolute Shapley values
    nonzero_mean_shap = mean_shap[mean_shap > 0]

    # Sort the features by descending mean Shapley value
    feature_order = np.argsort(-nonzero_mean_shap)

    # Create a bar plot of the non-zero mean absolute Shapley values using Plotly
    fig = go.Figure()
    
    fig.add_trace(go.Bar(x=nonzero_mean_shap[feature_order],
                         y=sample.columns[feature_order],
                         orientation='h', 
                         marker=dict(color=nonzero_mean_shap[feature_order], colorscale='RdBu_r')))
    
    # Customize the appearance of the plot
    fig.update_layout(title="SHAP Summary Plot", xaxis_title="Mean Absolute Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=list(sample.columns[feature_order]),
                                 ticktext=list(sample.columns[feature_order]),
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))
    
    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Bar.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Bar.html", auto_open=True)

    fig.show()


#Exploratory Data Analysis

In [49]:
# read csv
prop = pd.read_csv('/content/drive/MyDrive/predicting_property_clicks/predicting_property_clicks_data/property_data_set.csv', parse_dates=['activation_date'], infer_datetime_format=True, dayfirst=True)

In [50]:
# preview columns, null, and data types
prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28888 entries, 0 to 28887
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   property_id      28888 non-null  object        
 1   type             28888 non-null  object        
 2   activation_date  28888 non-null  datetime64[ns]
 3   bathroom         28888 non-null  int64         
 4   floor            28888 non-null  int64         
 5   total_floor      28845 non-null  float64       
 6   furnishing       28888 non-null  object        
 7   gym              28888 non-null  int64         
 8   latitude         28888 non-null  float64       
 9   longitude        28888 non-null  float64       
 10  lease_type       28888 non-null  object        
 11  lift             28888 non-null  int64         
 12  locality         28888 non-null  object        
 13  parking          28888 non-null  object        
 14  property_age     28888 non-null  int64

In [51]:
prop.sample(n=5, random_state=seed)

Unnamed: 0,property_id,type,activation_date,bathroom,floor,total_floor,furnishing,gym,latitude,longitude,...,lift,locality,parking,property_age,property_size,swimming_pool,pin_code,rent,deposit,building_type
12189,ff8081815a21c03b015a260ec79953b1,BHK2,2017-02-10 11:38:00,2,2,2.0,SEMI_FURNISHED,0,12.967506,77.53577,...,0,Vijaya Nagar,TWO_WHEELER,0,800,0,560040.0,15000,150000,IF
15017,ff8081815a4c18d8015a50b2a91c4c83,BHK1,2017-02-18 18:08:00,1,1,4.0,SEMI_FURNISHED,0,12.909803,77.621848,...,0,Roopena Agrahara,BOTH,0,400,0,560068.0,8500,25000,IF
4165,ff808181548fae600154954c85194c94,BHK2,2017-03-04 18:09:00,2,2,2.0,SEMI_FURNISHED,0,13.031049,77.635838,...,0,Kalyan Nagar,FOUR_WHEELER,1,1000,0,560043.0,13000,130000,IF
5339,ff808181566e233701566e5819ab0cfc,BHK1,2017-02-13 23:02:00,1,0,0.0,SEMI_FURNISHED,0,12.971531,77.638183,...,0,Indiranagar,BOTH,20,600,0,560008.0,20000,200000,IH
9542,ff80818159f9589a0159f966754d0512,BHK2,2017-02-03 12:39:00,1,0,0.0,NOT_FURNISHED,0,12.988283,77.53794,...,0,Basaveshwar Nagar,BOTH,30,900,0,560079.0,10000,100000,IH


In [52]:
len(prop['property_id'].unique())

28888

In [53]:
# read csv
interactions = pd.read_csv('/content/drive/MyDrive/predicting_property_clicks/predicting_property_clicks_data/property_interactions.csv', parse_dates=['request_date'], infer_datetime_format=True, dayfirst=True)

In [54]:
# preview columns, null, and data types
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170611 entries, 0 to 170610
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   property_id   170611 non-null  object        
 1   request_date  170611 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 2.6+ MB


In [55]:
interactions.sample(n=5, random_state=seed)

Unnamed: 0,property_id,request_date
15236,ff80818151669a4001517a8bac9d1deb,2017-02-28 16:33:54
21769,ff80818152f447a60152f472a46f0319,2017-03-10 23:47:49
142430,ff8081815ab80870015ac147b8927a08,2017-04-02 20:34:20
115405,ff8081815a849126015a84c657061cd3,2017-03-16 12:37:33
69512,ff8081815a08262d015a092ddba37db2,2017-02-25 11:24:04


In [56]:
len(interactions['property_id'].unique())

22779

In [57]:
interactions['property_id'].value_counts()

ff80818159e061800159e4614bf4368d    193
ff8081815ae03cc6015aeb9caaf065ee    183
ff80818159c49a360159c600065a0c66    168
ff8081815a8d7af5015a8d85892d05ef    165
ff80818156b651790156b6cb4b360940    145
                                   ... 
ff808181553483e40155349397170173      1
ff8081815a52a425015a56934cc31473      1
ff80818155388516015538beb1fc043e      1
ff8081815ab805ff015abd5da64406f5      1
ff8081815470b1de0154765ab7ad5a9d      1
Name: property_id, Length: 22779, dtype: int64

In [60]:
# read csv
file_path = '/content/drive/MyDrive/predicting_property_clicks/predicting_property_clicks_data/property_photos.tsv'
photos = pd.read_table(file_path, sep='\t')

In [61]:
# preview columns, null, and data types
photos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28888 entries, 0 to 28887
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   property_id  28888 non-null  object
 1   photo_urls   17866 non-null  object
dtypes: object(2)
memory usage: 451.5+ KB


In [62]:
photos.sample(n=5, random_state=seed)

Unnamed: 0,property_id,photo_urls
12189,ff8081815a21c03b015a260ec79953b1,
15017,ff8081815a4c18d8015a50b2a91c4c83,
4165,ff808181548fae600154954c85194c94,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."
5339,ff808181566e233701566e5819ab0cfc,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."
9542,ff80818159f9589a0159f966754d0512,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."


In [63]:
def clean_photo_urls(s):
    if isinstance(s, float) or s == 'NaN':
        return []
    else:
        s = s.replace('\\', '').replace('{title', '{"title').replace(']"', ']').replace('],"', ']","')
        try:
            json_data = json.loads(s)
            return json_data
        except:
            return []

def photo_count(s):
    if isinstance(s, float) or s == 'NaN':
        return 0
    else:
        s = s.replace('\\', '').replace('{title', '{"title').replace(']"', ']').replace('],"', ']","')
        try:
            json_data = json.loads(s)
            return len(json_data)
        except:
            return 0

photos['photo_urls_clean'] = photos['photo_urls'].apply(clean_photo_urls)
photos['photos_count'] = photos['photo_urls'].apply(photo_count)

# Drop the original 'photo_urls' column and rename 'photo_urls_clean' to 'photo_urls'
photos.drop(columns=['photo_urls'], inplace=True)
photos.rename(columns={'photo_urls_clean': 'photo_urls'}, inplace=True)


In [64]:
photos.sample(n=5, random_state=seed)

Unnamed: 0,property_id,photo_urls,photos_count
12189,ff8081815a21c03b015a260ec79953b1,[],0
15017,ff8081815a4c18d8015a50b2a91c4c83,[],0
4165,ff808181548fae600154954c85194c94,"[{'title': '', 'name': 'files[]', 'imagesMap':...",8
5339,ff808181566e233701566e5819ab0cfc,"[{'title': '', 'name': 'files[]', 'imagesMap':...",7
9542,ff80818159f9589a0159f966754d0512,"[{'title': '', 'name': 'files[]', 'imagesMap':...",7


In [67]:
len(photos['property_id'].unique())

28888

In [68]:
photos['property_id'].value_counts()

ff808081469fd6e20146a5af948000ea    1
ff8081815a92d719015a92f5001506ca    1
ff8081815a92d438015a9323d0e82746    1
ff8081815a92d438015a932367102712    1
ff8081815a92d438015a931d5ed524d7    1
                                   ..
ff80818159fa0f0f0159fd411ce47d95    1
ff80818159fa0f0f0159fd3fd5b37ce9    1
ff80818159fa0f0f0159fd3fbd907cdf    1
ff80818159fa0f0f0159fd3bdb887a9f    1
ff8081815b2007fc015b201c77a20395    1
Name: property_id, Length: 28888, dtype: int64

In [65]:
# merging prop and interactions dataframes on property id with a left join
merged = pd.merge(prop, interactions, how='outer', left_on='property_id', right_on='property_id')

In [66]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176720 entries, 0 to 176719
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   property_id      176720 non-null  object        
 1   type             176720 non-null  object        
 2   activation_date  176720 non-null  datetime64[ns]
 3   bathroom         176720 non-null  int64         
 4   floor            176720 non-null  int64         
 5   total_floor      176554 non-null  float64       
 6   furnishing       176720 non-null  object        
 7   gym              176720 non-null  int64         
 8   latitude         176720 non-null  float64       
 9   longitude        176720 non-null  float64       
 10  lease_type       176720 non-null  object        
 11  lift             176720 non-null  int64         
 12  locality         176720 non-null  object        
 13  parking          176720 non-null  object        
 14  property_age     176

In [69]:
# create the main DataFrames by joining users and transactions on ID and USER_ID
df = pd.merge(merged, photos, how='outer', left_on='property_id', right_on='property_id')

In [70]:
df.sample(n=25, random_state=seed)

Unnamed: 0,property_id,type,activation_date,bathroom,floor,total_floor,furnishing,gym,latitude,longitude,...,property_age,property_size,swimming_pool,pin_code,rent,deposit,building_type,request_date,photo_urls,photos_count
74375,ff8081815a125469015a12607eec0c1a,BHK1,2017-02-06 14:06:00,1,2,2.0,SEMI_FURNISHED,0,12.959641,77.717308,...,4,650,0,560066.0,15000,100000,IF,2017-02-07 00:20:44,[],0
37378,ff80818156186f0001561c6eafad5f19,BHK1,2017-03-20 17:39:00,1,2,3.0,SEMI_FURNISHED,0,13.006698,77.636036,...,6,900,0,560033.0,8000,100000,IF,2017-04-01 12:20:06,"[{'title': '', 'name': 'files[]', 'imagesMap':...",5
20066,ff808181528481080152886f88331d5e,BHK3,2017-03-28 22:25:00,2,0,7.0,FULLY_FURNISHED,1,12.961435,77.524452,...,1,1200,1,560040.0,30000,300000,AP,2017-03-29 15:16:51,"[{'title': '', 'name': 'files[]', 'imagesMap':...",10
2862,ff8081814cb771fb014ccb09506b3b35,BHK1,2017-03-06 18:55:00,1,2,4.0,SEMI_FURNISHED,0,12.920241,77.668681,...,1,450,0,560102.0,14000,72000,AP,2017-04-11 12:32:09,"[{'title': 'Hall', 'name': 'photo (1).JPG', 'i...",7
52796,ff80818158f379580158f7a8a0af45e4,BHK2,2017-02-27 17:24:00,2,1,2.0,SEMI_FURNISHED,0,12.940798,77.584876,...,5,1000,0,560011.0,15000,150000,IF,2017-03-13 12:45:32,[],0
158714,ff8081815ae03cc6015ae4c1690f2b7e,BHK2,2017-03-19 18:30:00,2,2,4.0,SEMI_FURNISHED,0,12.949293,77.579718,...,0,800,0,560004.0,25000,275000,IF,2017-03-27 17:23:14,[],0
94707,ff8081815a458fc5015a464e1b1c1c34,BHK1,2017-02-25 16:43:00,1,2,2.0,SEMI_FURNISHED,0,12.8789,77.608987,...,4,500,0,560076.0,8000,80000,IF,2017-03-09 17:33:28,"[{'title': '', 'name': 'files[]', 'imagesMap':...",7
144052,ff8081815ab805ff015abd747404142c,BHK2,2017-03-11 19:31:00,2,0,2.0,SEMI_FURNISHED,0,13.011834,77.626064,...,0,750,0,560084.0,8000,80000,IF,2017-03-20 16:05:50,[],0
107734,ff8081815a69bafb015a6a31d88c4817,BHK2,2017-02-23 15:12:00,1,0,1.0,SEMI_FURNISHED,0,12.926063,77.544098,...,10,900,0,560085.0,9000,99000,IF,2017-03-14 09:50:58,[],0
127844,ff8081815a9514a4015a984b8c153cbf,BHK2,2017-03-04 17:51:00,2,2,2.0,SEMI_FURNISHED,0,12.915464,77.610448,...,5,1200,0,560076.0,16000,100000,IF,2017-03-28 22:41:02,[],0


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176720 entries, 0 to 176719
Data columns (total 24 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   property_id      176720 non-null  object        
 1   type             176720 non-null  object        
 2   activation_date  176720 non-null  datetime64[ns]
 3   bathroom         176720 non-null  int64         
 4   floor            176720 non-null  int64         
 5   total_floor      176554 non-null  float64       
 6   furnishing       176720 non-null  object        
 7   gym              176720 non-null  int64         
 8   latitude         176720 non-null  float64       
 9   longitude        176720 non-null  float64       
 10  lease_type       176720 non-null  object        
 11  lift             176720 non-null  int64         
 12  locality         176720 non-null  object        
 13  parking          176720 non-null  object        
 14  property_age     176

In [None]:
# bar chart for null observation
import missingno as msno
msno.bar(df, sort='descending');

In [72]:
# calculating nulls
nulls = df.isnull().sum()

In [73]:
# calculating null percentage of columns with nulls
null_percent = nulls[nulls>0] / len(df)
null_percent.sort_values(ascending=False).to_frame('% Null').style

Unnamed: 0,% Null
request_date,0.034569
pin_code,0.007554
building_type,0.0015
total_floor,0.000939


In [84]:
df['request_date'].value_counts(dropna=False)

0                      6109
2017-03-22 16:40:02       3
2017-03-30 11:42:23       3
2017-03-04 11:44:27       3
2017-02-11 17:41:12       3
                       ... 
2017-03-08 20:33:57       1
2017-03-22 13:37:42       1
2017-03-22 16:28:43       1
2017-03-20 10:26:20       1
2017-03-31 11:15:06       1
Name: request_date, Length: 166504, dtype: int64

In [85]:
df['pin_code'].value_counts(dropna=False)

560037.0    13490
560076.0     9641
560068.0     8557
560043.0     7095
560066.0     7033
            ...  
411060.0        1
560115.0        1
411014.0        1
600115.0        1
566222.0        1
Name: pin_code, Length: 213, dtype: int64

In [86]:
df['building_type'].value_counts(dropna=False)

IF    107164
AP     57911
IH     11380
0        265
Name: building_type, dtype: int64

In [87]:
df['total_floor'].value_counts(dropna=False)

2.0     48828
3.0     44835
4.0     34171
1.0     22922
5.0      9818
0.0      4476
6.0      1249
8.0      1206
14.0     1134
12.0     1094
7.0      1094
10.0      925
9.0       900
11.0      832
13.0      762
19.0      547
18.0      536
15.0      384
20.0      206
16.0      179
21.0      172
0         166
17.0      120
23.0       36
27.0       24
24.0       22
26.0       19
22.0       16
25.0       13
28.0       10
32.0        9
29.0        7
30.0        2
35.0        2
38.0        2
31.0        1
34.0        1
Name: total_floor, dtype: int64

In [82]:
# no excessive nulls
df.fillna('0', inplace=True)

In [83]:
print(df.isnull().sum())

property_id           0
type                  0
activation_date       0
bathroom              0
floor                 0
total_floor           0
furnishing            0
gym                   0
latitude              0
longitude             0
lease_type            0
lift                  0
locality              0
parking               0
property_age          0
property_size         0
swimming_pool         0
pin_code              0
rent                  0
deposit               0
building_type         0
request_date          0
photo_urls            0
photos_count          0
interactions_3days    0
interactions_7days    0
dtype: int64


### Identify/Explore Features
property_id


In [91]:
# checking column values and remaining nulls
df['property_id'].value_counts(dropna=False)

ff80818159e061800159e4614bf4368d    193
ff8081815ae03cc6015aeb9caaf065ee    183
ff80818159c49a360159c600065a0c66    168
ff8081815a8d7af5015a8d85892d05ef    165
ff80818156b651790156b6cb4b360940    145
                                   ... 
ff808181523fe1ba01524038344c05eb      1
ff808181568dbe1d01568e3ce95315cf      1
ff8081815a368c26015a36e120fa5108      1
ff8081815a368c26015a36d4757b44fc      1
ff80818153ccd7a70153d14403742a0e      1
Name: property_id, Length: 28888, dtype: int64

Notes: Some properties appear to have received over 150 interactions. 

In [92]:
# Counting number of unique properties
len(prop['property_id'].unique())

28888

type


In [93]:
# checking column values and remaining nulls
df['type'].value_counts(dropna=False)

BHK2        83360
BHK1        54579
BHK3        20057
RK1         17555
BHK4         1044
BHK4PLUS      125
Name: type, dtype: int64

### Engineer Target

We would like to have a predictive model that would say the number of interactions that a property would receive in a period of time. For simplicity let’s say we would like to predict the number of interactions that a property would receive within 3 days of its activation and 7 days of its activation. 

In [80]:
# Create a new column for the number of interactions within 3 days
df['interactions_3days'] = 0

# Create a new column for the number of interactions within 7 days
df['interactions_7days'] = 0

# Calculate the number of interactions within 3 days and 7 days of activation
for i, activation_date in enumerate(df['activation_date']):
    mask_3days = (df['request_date'] >= activation_date) & (df['request_date'] <= activation_date + pd.Timedelta(days=3))
    mask_7days = (df['request_date'] >= activation_date) & (df['request_date'] <= activation_date + pd.Timedelta(days=7))
    df.at[i, 'interactions_3days'] = mask_3days.sum()
    df.at[i, 'interactions_7days'] = mask_7days.sum()
