<a href="https://colab.research.google.com/github/brdeleon/predicting_property_clicks/blob/main/property_click_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
pip install shap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting shap
  Downloading shap-0.41.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (572 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m572.4/572.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.41.0 slicer-0.0.7


In [None]:
pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [None]:
pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import warnings
warnings.filterwarnings("ignore")

import joblib as jb
from joblib import dump, load

import json
import shap
import math
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import kaleido
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold, cross_val_score, train_test_split

from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE, RFECV

from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, CondensedNearestNeighbour 
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

from google.colab import files, drive
drive.mount('/content/drive')

seed = 42

Mounted at /content/drive


# Functions

In [None]:
def _plot_evaluation_figures(model, X, y, classes, label, split, image_filename):
    """Draw, save and show the three evaluation panels for one data split.

    The panels are a confusion matrix, a ROC curve with a diagonal
    'No Skill' reference line, and a precision-recall curve, all built
    with the scikit-learn ``*Display.from_estimator`` helpers.

    Parameters:
        - model: fitted classifier to evaluate.
        - X: features of the split.
        - y: true labels of the split.
        - classes: optional display labels for the confusion matrix.
        - label: model name shown in the curve legends.
        - split: name of the split ('Train' or 'Test'), used in the titles.
        - image_filename: path the figure is saved to.
    """
    fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4))
    # adjust spacing between subplots
    fig.subplots_adjust(wspace=0.3, hspace=0.5)

    # confusion matrix
    ConfusionMatrixDisplay.from_estimator(estimator=model, X=X, y=y, display_labels=classes, ax=axes[0])
    axes[0].set(title=f'Confusion Matrix - {split}')
    axes[0].set_xlabel('Predicted Label')
    axes[0].set_ylabel('True Label')

    # ROC curve with the chance-level diagonal for reference
    RocCurveDisplay.from_estimator(model, X, y, name=label, ax=axes[1])
    axes[1].plot([0, 1], [0, 1], ls=':', label='No Skill')
    axes[1].grid()
    axes[1].set_title(f'Receiver Operating Characteristic - {split}')
    # rebuild the legend so the 'No Skill' line, added after the display
    # drew its own legend, is listed as well
    axes[1].legend(loc='lower right')

    # precision-recall curve
    PrecisionRecallDisplay.from_estimator(model, X, y, ax=axes[2], name=label)
    axes[2].set_ylabel('Precision')
    axes[2].set_xlabel('Recall')
    axes[2].set_title(f'Precision-Recall AUC - {split}')

    fig.tight_layout()
    fig.savefig(image_filename)
    plt.show()
    # release the figure; this function is called once per model in a loop
    plt.close(fig)


def evaluate_classification(model, X_train, X_test, y_train, y_test, classes=None, label=''):
    """Save a fitted classifier and report its performance on train and test data.

    The model is pickled under /content/drive/MyDrive/fraud_models/. For
    each split a classification report is printed and a three-panel
    figure (confusion matrix, ROC curve, precision-recall curve) is shown
    and saved under /content/drive/MyDrive/fraud_images/. Spaces in
    ``label`` are replaced by underscores in the file names.

    Parameters:
        - model: fitted classifier exposing ``predict``.
        - X_train: the training set features.
        - X_test: the test set features.
        - y_train: the training set target.
        - y_test: the test set target.
        - classes: optional class names for the reports and confusion matrices.
        - label: model name used in headers, legends and file names.
    """
    import os

    # predictions for both splits, computed before anything is written to disk
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    file_stem = label.replace(' ', '_')

    # save the trained model
    model_filename = f"/content/drive/MyDrive/fraud_models/{file_stem}_Model.pkl"
    os.makedirs(os.path.dirname(model_filename), exist_ok=True)
    dump(model, model_filename)

    dashes = "---" * 20
    splits = (
        ('Train', X_train, y_train, y_pred_train),
        ('Test', X_test, y_test, y_pred_test),
    )
    for split, X, y, y_pred in splits:
        # print the classification report for this split
        header = f"{label} Classification Report - {split}"
        print(dashes, header, dashes, sep='\n')
        print(classification_report(y, y_pred, target_names=classes, digits=4))

        # draw, save and show the figures for this split
        image_filename = f"/content/drive/MyDrive/fraud_images/{file_stem}_Model_Evaluation_{split}.png"
        os.makedirs(os.path.dirname(image_filename), exist_ok=True)
        _plot_evaluation_figures(model, X, y, classes, label, split, image_filename)

In [None]:
def evaluate_algorithms(X_train, X_test, y_train, y_test, random_state=seed, sampling=None):
    """
    Fit a fixed set of baseline classifiers and evaluate each of them.

    Every classifier is fitted on the training data and then handed to
    evaluate_classification together with both data splits.

    Parameters:
        - X_train: the training set features.
        - X_test: the test set features.
        - y_train: the training set target.
        - y_test: the test set target.
        - random_state: the random state given to every classifier that accepts one.
        - sampling: optional text prepended to each classifier's name to form
          the label passed to evaluate_classification (for example the name
          of the resampling method applied to the training data).
    """

    # defining the algorithms
    algorithms = [
        ('Dummy Classifier', DummyClassifier(strategy='most_frequent')),
        ('Logistic Regression', LogisticRegression(random_state=random_state)),
        # ('K Nearest Neighbors', KNeighborsClassifier()),  # disabled; NOTE(review): reason not recorded
        ('Bagging Classifier', BaggingClassifier(random_state=random_state)),
        ('Gaussian Naive Bayes', GaussianNB()),
        ('Decision Tree Classifier', DecisionTreeClassifier(random_state=random_state)),
        ('Random Forest Classifier', RandomForestClassifier(random_state=random_state)),
        ('Extra Trees Classifier', ExtraTreesClassifier(random_state=random_state)),
        ('AdaBoost Classifier', AdaBoostClassifier(random_state=random_state)),
        ('Gradient Boosting Classifier', GradientBoostingClassifier(random_state=random_state)),
    ]

    for name, algorithm in algorithms:
        # fit the classifier on the training data
        algorithm.fit(X_train, y_train)

        # prefix the label with the sampling description when one is given
        label = f"{sampling} {name}" if sampling else name
        evaluate_classification(algorithm, X_train, X_test, y_train, y_test, label=label)

In [None]:
def plot_feature_importances_plotly(X_train, y_train, label=None):
    """Plot the feature importances of a saved model as a horizontal bar chart.

    The model is loaded from
    /content/drive/MyDrive/fraud_models/<label>_Model.pkl (spaces in
    ``label`` replaced by underscores). The chart is written as PNG and as
    interactive HTML under /content/drive/MyDrive/fraud_images/ and then shown.

    Parameters:
        - X_train: DataFrame whose columns name the model's features.
        - y_train: not used; kept so existing calls keep working.
        - label: the entire name of the algorithm, as used when the model was saved.

    Raises:
        - ValueError: if ``label`` is None.
    """
    if label is None:
        raise ValueError("label is required: it identifies the saved model file to load")
    file_stem = label.replace(' ', '_')

    # Load pickled model
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{file_stem}_Model.pkl")

    # Pull feature importances from the trained model
    importances = model.feature_importances_

    # Sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # Create a horizontal bar chart using Plotly
    fig = go.Figure(go.Bar(
                x=importances[indices],
                y=[X_train.columns[i] for i in indices],
                orientation='h',
                marker=dict(color=importances[indices],
                            colorbar=dict(title='Importance'))
                ))
    # Plotly places the first category at the bottom of the y axis, so the
    # axis is reversed to list the most important features at the top
    fig.update_layout(title=f"Feature Importances - {label}",
                      xaxis_title='Importance',
                      yaxis=dict(title='', autorange='reversed'),
                      width=1600,
                      height=1100)

    # Save the figure as png
    filename = f"/content/drive/MyDrive/fraud_images/{file_stem}_Feature_Importances.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{file_stem}_Feature_Importances.html", auto_open=True)

    # Show the plot
    fig.show()


In [None]:
def plot_coefficients_plotly(X_train, y_train, label=None):
    """Plot the non-zero coefficients of a saved model as a horizontal bar chart.

    The model is loaded from the fraud_models folder using ``label`` (spaces
    replaced by underscores); the chart is written as PNG and as interactive
    HTML to the fraud_images folder and then shown.

    Parameters:
        - X_train: DataFrame whose columns name the coefficients.
        - y_train: not used by this function.
        - label: the entire name of the algorithm, as used when the model was saved.
    """
    stem = label.replace(' ', '_')

    # Load the pickled model
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{stem}_Model.pkl")

    # One row per feature: its name and its flattened coefficient value
    coef_table = pd.DataFrame({
        'Coefficient Name': X_train.columns,
        'Coefficient Value': model.coef_.flatten(),
    })

    # Drop zero coefficients, then order the rest from largest to smallest
    is_nonzero = coef_table['Coefficient Value'] != 0
    coef_table = coef_table[is_nonzero].sort_values(by='Coefficient Value', ascending=False)

    # Horizontal bar chart on a large canvas
    bars = go.Bar(x=coef_table['Coefficient Value'],
                  y=coef_table['Coefficient Name'],
                  orientation='h')
    fig = go.Figure(bars)
    fig.update_layout(title=f"{label} Coefficients",
                      xaxis_title='Coefficient Value',
                      yaxis_title='',
                      width=1600,
                      height=1100)

    # Static image
    pio.write_image(fig, f"/content/drive/MyDrive/fraud_images/{stem}_Coefficients.png")

    # Interactive copy
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{stem}_Coefficients.html", auto_open=True)

    # Show the plot
    fig.show()


In [None]:
def get_shapley_values_tree_plotly_violin(X_test, label=None, seed=42):
    """Draw one horizontal violin per feature from the SHAP values of a saved model.

    The model is loaded from the fraud_models folder using ``label`` (spaces
    replaced by underscores), explained with shap.TreeExplainer on a random
    10% sample of ``X_test``, and the figure is written as PNG and HTML to
    the fraud_images folder before being shown.

    Parameters:
        - X_test: DataFrame to sample rows from.
        - label: the entire name of the algorithm, as used when the model was saved.
        - seed: random state for the row sample.
    """
    stem = label.replace(' ', '_')
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{stem}_Model.pkl")

    # Tree-based SHAP explainer for the loaded model
    explainer = shap.TreeExplainer(model)

    # Random 10% of the rows of X_test
    n_rows = int(len(X_test) * 0.10)
    sample = X_test.sample(n=n_rows, random_state=seed)

    # Element [1] of the shap_values() result is used throughout.
    # NOTE(review): presumably the positive-class values - confirm for the
    # installed shap version and model type.
    class_values = explainer.shap_values(sample)[1]

    # Feature indices ordered by descending mean absolute SHAP value
    ranking = np.argsort(np.abs(class_values).mean(axis=0))[::-1]

    fig = go.Figure()
    for col_idx in ranking:
        feature = sample.columns[col_idx]
        feature_values = class_values[:, col_idx]
        violin = go.Violin(y=[feature] * len(feature_values),
                           x=feature_values,
                           box_visible=True,
                           line_color='blue',
                           meanline_visible=True,
                           fillcolor='lightseagreen',
                           opacity=0.6,
                           x0=col_idx,
                           y0=feature,
                           name=feature,
                           orientation='h')
        fig.add_trace(violin)

    # Layout: one tick per feature name, horizontal tick labels, large canvas
    ranked_names = sample.columns[ranking]
    fig.update_layout(title="SHAP Summary Plot", xaxis_title="Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=list(ranked_names),
                                 ticktext=list(ranked_names),
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))

    # Static image
    pio.write_image(fig, f"/content/drive/MyDrive/fraud_images/{stem}_Shapley_Values_Violin.png")

    # Interactive copy
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{stem}_Shapley_Values_Violin.html", auto_open=True)

    fig.show()


In [None]:
def get_shapley_values_tree_plotly_scatter(X_test, label=None, seed=42):
    """Draw a jittered dot plot of the SHAP values of a saved model, one row of dots per feature.

    The model is loaded from the fraud_models folder using ``label`` (spaces
    replaced by underscores), explained with shap.TreeExplainer on a random
    10% sample of ``X_test``, and the figure is written as PNG and HTML to
    the fraud_images folder before being shown.

    Parameters:
        - X_test: DataFrame to sample rows from.
        - label: the entire name of the algorithm, as used when the model was saved.
        - seed: random state for both the row sample and the jitter, so the
          figure is the same on every run.
    """
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")

    # Create a Shapley explainer object using the TreeExplainer class
    explainer = shap.TreeExplainer(model)

    # Calculate the sample size that is equal to 10% X_test
    sample_size = int(len(X_test) * 0.10)

    # Randomly sample 10% of X_test
    sample = X_test.sample(n=sample_size, random_state=seed)

    # Compute the Shapley values for the sampled rows
    shap_values = explainer.shap_values(sample)

    # Get the mean absolute Shapley values for each feature
    mean_shap = np.abs(shap_values[1]).mean(axis=0)

    # Sort the features by ascending mean Shapley value
    feature_order = np.argsort(mean_shap)

    # Small horizontal jitter to separate overlapping dots, drawn from a
    # generator seeded with `seed` rather than the global NumPy state
    rng = np.random.default_rng(seed)
    jitter = rng.normal(0, 0.02, size=shap_values[1].shape)

    # Create a jittered dot cluster plot of the Shapley values using Plotly
    fig = go.Figure()
    for i in feature_order:
        # Create a scatter plot for each feature in the sample
        # The x-axis shows the Shapley values for each row with added jitter
        # The y-axis shows the feature name
        # Each marker's color represents the Shapley value for the corresponding row and feature
        fig.add_trace(go.Scatter(x=shap_values[1][:, i] + jitter[:, i], y=[sample.columns[i]] * len(sample),
                                 mode='markers', marker=dict(color=shap_values[1][:, i], colorscale='RdBu_r'),
                                 showlegend=False, name=sample.columns[i]))

    # Untilt y-axis ticks and make figure size larger
    fig.update_layout(title="Shapley Values Summary Plot", xaxis_title="Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=list(sample.columns[feature_order]),
                                 ticktext=list(sample.columns[feature_order]),
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))

    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Scatter.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Scatter.html", auto_open=True)

    fig.show()


In [None]:
def get_shapley_values_tree_plotly_bar(X_test, label=None, seed=42):
    """Plot the mean absolute SHAP value of each feature of a saved model as horizontal bars.

    The model is loaded from the fraud_models folder using ``label`` (spaces
    replaced by underscores) and explained with shap.TreeExplainer on a
    random 10% sample of ``X_test``. Features whose mean absolute value is
    zero are left out. The figure is written as PNG and HTML to the
    fraud_images folder before being shown.

    Parameters:
        - X_test: DataFrame to sample rows from.
        - label: the entire name of the algorithm, as used when the model was saved.
        - seed: random state for the row sample.
    """
    # Load the pickled model; label should be the entire name of the algorithm
    model = jb.load(f"/content/drive/MyDrive/fraud_models/{label.replace(' ', '_')}_Model.pkl")

    # Create a Shapley explainer object using the TreeExplainer class
    explainer = shap.TreeExplainer(model)

    # Calculate the sample size that is equal to 10% X_test
    sample_size = int(len(X_test) * 0.10)

    # Randomly sample 10% of X_test
    sample = X_test.sample(n=sample_size, random_state=seed)

    # Compute the Shapley values for the sampled rows
    shap_values = explainer.shap_values(sample)

    # Get the mean absolute Shapley values for each feature
    mean_shap = np.abs(shap_values[1]).mean(axis=0)

    # Rank ALL features by descending mean value first, then drop the zero
    # ones. The indices therefore always refer to positions in
    # sample.columns, so every bar keeps its own feature name.
    feature_order = np.argsort(-mean_shap)
    feature_order = feature_order[mean_shap[feature_order] > 0]

    ranked_values = mean_shap[feature_order]
    ranked_names = list(sample.columns[feature_order])

    # Create a bar plot of the non-zero mean absolute Shapley values using Plotly
    fig = go.Figure()

    fig.add_trace(go.Bar(x=ranked_values,
                         y=ranked_names,
                         orientation='h',
                         marker=dict(color=ranked_values, colorscale='RdBu_r')))

    # Customize the appearance of the plot
    fig.update_layout(title="SHAP Summary Plot", xaxis_title="Mean Absolute Shapley Value", yaxis_title="Feature",
                      yaxis=dict(tickmode='array', tickvals=ranked_names,
                                 ticktext=ranked_names,
                                 tickfont=dict(size=12), tickangle=0),
                      height=1200, width=1600, margin=dict(l=100, r=100, t=100, b=100))

    # Save the figure to a file with a descriptive name
    filename = f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Bar.png"
    pio.write_image(fig, filename)

    # Save the figure as html with interactive figure
    pio.write_html(fig, file=f"/content/drive/MyDrive/fraud_images/{label.replace(' ', '_')}_Shapley_Values_Bar.html", auto_open=True)

    fig.show()


# Exploratory Data Analysis

In [None]:
# load the property listings CSV; activation_date is parsed as a datetime, reading ambiguous dates day-first
prop = pd.read_csv('/content/drive/MyDrive/property_click_prediction_data/property_data_set.csv', parse_dates=['activation_date'], infer_datetime_format=True, dayfirst=True)

In [None]:
# preview columns, null, and data types
prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28888 entries, 0 to 28887
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   property_id      28888 non-null  object        
 1   type             28888 non-null  object        
 2   activation_date  28888 non-null  datetime64[ns]
 3   bathroom         28888 non-null  int64         
 4   floor            28888 non-null  int64         
 5   total_floor      28845 non-null  float64       
 6   furnishing       28888 non-null  object        
 7   gym              28888 non-null  int64         
 8   latitude         28888 non-null  float64       
 9   longitude        28888 non-null  float64       
 10  lease_type       28888 non-null  object        
 11  lift             28888 non-null  int64         
 12  locality         28888 non-null  object        
 13  parking          28888 non-null  object        
 14  property_age     28888 non-null  int64

In [None]:
prop.sample(n=5, random_state=seed)

Unnamed: 0,property_id,type,activation_date,bathroom,floor,total_floor,furnishing,gym,latitude,longitude,...,lift,locality,parking,property_age,property_size,swimming_pool,pin_code,rent,deposit,building_type
12189,ff8081815a21c03b015a260ec79953b1,BHK2,2017-02-10 11:38:00,2,2,2.0,SEMI_FURNISHED,0,12.967506,77.53577,...,0,Vijaya Nagar,TWO_WHEELER,0,800,0,560040.0,15000,150000,IF
15017,ff8081815a4c18d8015a50b2a91c4c83,BHK1,2017-02-18 18:08:00,1,1,4.0,SEMI_FURNISHED,0,12.909803,77.621848,...,0,Roopena Agrahara,BOTH,0,400,0,560068.0,8500,25000,IF
4165,ff808181548fae600154954c85194c94,BHK2,2017-03-04 18:09:00,2,2,2.0,SEMI_FURNISHED,0,13.031049,77.635838,...,0,Kalyan Nagar,FOUR_WHEELER,1,1000,0,560043.0,13000,130000,IF
5339,ff808181566e233701566e5819ab0cfc,BHK1,2017-02-13 23:02:00,1,0,0.0,SEMI_FURNISHED,0,12.971531,77.638183,...,0,Indiranagar,BOTH,20,600,0,560008.0,20000,200000,IH
9542,ff80818159f9589a0159f966754d0512,BHK2,2017-02-03 12:39:00,1,0,0.0,NOT_FURNISHED,0,12.988283,77.53794,...,0,Basaveshwar Nagar,BOTH,30,900,0,560079.0,10000,100000,IH


In [None]:
# load the property interactions CSV; request_date is parsed as a datetime, reading ambiguous dates day-first
interactions = pd.read_csv('/content/drive/MyDrive/property_click_prediction_data/property_interactions.csv', parse_dates=['request_date'], infer_datetime_format=True, dayfirst=True)

In [None]:
# preview columns, null, and data types
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170611 entries, 0 to 170610
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   property_id   170611 non-null  object        
 1   request_date  170611 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 2.6+ MB


In [None]:
interactions.sample(n=5, random_state=seed)

Unnamed: 0,property_id,request_date
15236,ff80818151669a4001517a8bac9d1deb,2017-02-28 16:33:54
21769,ff80818152f447a60152f472a46f0319,2017-03-10 23:47:49
142430,ff8081815ab80870015ac147b8927a08,2017-04-02 20:34:20
115405,ff8081815a849126015a84c657061cd3,2017-03-16 12:37:33
69512,ff8081815a08262d015a092ddba37db2,2017-02-25 11:24:04


In [None]:
# load the property photos file; it is tab-separated (TSV), not comma-separated
file_path = '/content/drive/MyDrive/property_click_prediction_data/property_photos.tsv'
photos = pd.read_table(file_path, sep='\t')

In [None]:
# preview columns, null, and data types
photos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28888 entries, 0 to 28887
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   property_id  28888 non-null  object
 1   photo_urls   17866 non-null  object
dtypes: object(2)
memory usage: 451.5+ KB


In [None]:
photos.sample(n=5, random_state=seed)

Unnamed: 0,property_id,photo_urls
12189,ff8081815a21c03b015a260ec79953b1,
15017,ff8081815a4c18d8015a50b2a91c4c83,
4165,ff808181548fae600154954c85194c94,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."
5339,ff808181566e233701566e5819ab0cfc,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."
9542,ff80818159f9589a0159f966754d0512,"[{\title\"":\""\"",\""name\"":\""files[]\"",\""imagesM..."


In [None]:
def _repair_photo_json(s):
    """Return the raw photo_urls text with its malformed JSON patched up.

    Backslashes are removed, the missing opening quote of the first
    "title" key is restored, and the quote that follows a closing
    bracket is moved so that bracketed values stay valid JSON strings.
    """
    return s.replace('\\', '').replace('{title', '{"title').replace(']"', ']').replace('],"', ']","')


def clean_photo_urls(s):
    """Parse one raw photo_urls cell into Python data.

    Parameters:
        - s: a cell of the photo_urls column. Missing values (float NaN,
          the text 'NaN', or anything that is not a string) give an empty
          list; a list is treated as already parsed and returned unchanged,
          so applying this function twice is harmless.

    Returns:
        The decoded JSON payload, or an empty list when the text cannot
        be decoded.
    """
    if isinstance(s, list):
        return s
    if not isinstance(s, str) or s == 'NaN':
        return []
    try:
        return json.loads(_repair_photo_json(s))
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError: undecodable text counts as "no photos"
        return []


def photo_count(s):
    """Count the photos described by one raw photo_urls cell.

    Parameters:
        - s: a cell of the photo_urls column; handled as in clean_photo_urls.

    Returns:
        The length of the decoded payload, or 0 when the cell is missing,
        cannot be decoded, or decodes to something that has no length.
    """
    if isinstance(s, list):
        return len(s)
    if not isinstance(s, str) or s == 'NaN':
        return 0
    try:
        return len(json.loads(_repair_photo_json(s)))
    except (ValueError, TypeError, RecursionError):
        # TypeError: the payload decoded to a scalar (e.g. a number), which has no len()
        return 0

# Take the raw 'photo_urls' column out of the frame, then put back the parsed
# payloads under the same name, followed by the number of photos per property
raw_photo_urls = photos.pop('photo_urls')
photos['photo_urls'] = raw_photo_urls.apply(clean_photo_urls)
photos['photos_count'] = raw_photo_urls.apply(photo_count)


In [None]:
photos.sample(n=5, random_state=seed)

Unnamed: 0,property_id,photo_urls,photos_count
12189,ff8081815a21c03b015a260ec79953b1,[],0
15017,ff8081815a4c18d8015a50b2a91c4c83,[],0
4165,ff808181548fae600154954c85194c94,"[{'title': '', 'name': 'files[]', 'imagesMap':...",8
5339,ff808181566e233701566e5819ab0cfc,"[{'title': '', 'name': 'files[]', 'imagesMap':...",7
9542,ff80818159f9589a0159f966754d0512,"[{'title': '', 'name': 'files[]', 'imagesMap':...",7


In [None]:
# create the main DataFrame by joining users and transactions on ID and USER_ID
# NOTE(review): `users` and `transactions` are not defined in the cells shown here, which
# load `prop`, `interactions` and `photos` keyed by property_id - presumably this cell is
# left over from another notebook; confirm the intended inputs and join keys.
df = pd.merge(users, transactions, left_on='ID', right_on='USER_ID')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 638742 entries, 0 to 638741
Data columns (total 23 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID_x                     638742 non-null  object 
 1   HAS_EMAIL                638742 non-null  int64  
 2   PHONE_COUNTRY            638742 non-null  object 
 3   IS_FRAUDSTER             638742 non-null  bool   
 4   TERMS_VERSION            613084 non-null  object 
 5   CREATED_DATE_x           638742 non-null  object 
 6   STATE_x                  638742 non-null  object 
 7   COUNTRY                  638742 non-null  object 
 8   BIRTH_YEAR               638742 non-null  int64  
 9   KYC                      638742 non-null  object 
 10  FAILED_SIGN_IN_ATTEMPTS  638742 non-null  int64  
 11  CURRENCY                 638742 non-null  object 
 12  AMOUNT                   638742 non-null  int64  
 13  STATE_y                  638742 non-null  object 
 14  CREA

In [None]:
df.sample(n=25, random_state=seed)

Unnamed: 0,ID_x,HAS_EMAIL,PHONE_COUNTRY,IS_FRAUDSTER,TERMS_VERSION,CREATED_DATE_x,STATE_x,COUNTRY,BIRTH_YEAR,KYC,...,STATE_y,CREATED_DATE_y,MERCHANT_CATEGORY,MERCHANT_COUNTRY,ENTRY_METHOD,USER_ID,TYPE,SOURCE,ID_y,AMOUNT_USD
320630,0e0f16a9-c589-444d-b06d-0e8c1d99fdab,1,GB||JE||IM||GG,False,2018-09-20,2016-09-25 17:54:50.354000,ACTIVE,GB,1991,PASSED,...,COMPLETED,2018-06-28 17:01:36.03,restaurant,NLD,chip,0e0f16a9-c589-444d-b06d-0e8c1d99fdab,CARD_PAYMENT,GAIA,2f8e9d45-9b66-4558-b7b9-eb1db98c6e59,4567.0
348581,b21decec-723f-4c5b-a279-d3243a727d37,1,PL,False,2018-05-25,2018-03-13 11:47:49.646000,ACTIVE,PL,1986,PASSED,...,COMPLETED,2018-07-06 23:59:08.829,,NLD,manu,b21decec-723f-4c5b-a279-d3243a727d37,CARD_PAYMENT,GAIA,775b814a-d60f-4c42-accf-68f56ec9245b,
182260,8ffe11e3-b414-4915-8ea4-70f5b384242d,1,GR,False,2018-09-20,2016-12-25 15:42:20.158000,ACTIVE,GR,1971,PASSED,...,COMPLETED,2018-06-04 20:52:54.126,,,misc,8ffe11e3-b414-4915-8ea4-70f5b384242d,P2P,INTERNAL,1d47654d-9660-4e4e-858b-93710474dbc0,10142.0
243368,3240b44b-a36f-4121-9597-2663152287f5,1,GB||JE||IM||GG,False,2017-02-02,2016-10-28 20:31:46.411000,ACTIVE,GB,1975,PASSED,...,COMPLETED,2018-03-12 19:33:51.576,,GBR,chip,3240b44b-a36f-4121-9597-2663152287f5,ATM,GAIA,80478bb1-0300-4dc4-ad63-2b78b544443b,1533.0
385547,c08d9b31-5795-475e-8c00-1ae53d3f2a69,1,GB||JE||IM||GG,False,2018-09-20,2018-06-07 21:15:57.685000,ACTIVE,GB,1981,PASSED,...,COMPLETED,2018-06-20 18:48:11.31,,,misc,c08d9b31-5795-475e-8c00-1ae53d3f2a69,P2P,INTERNAL,3cd4688e-0534-48f0-b130-3c0b01a2edf5,172.0
4759,c2c667e2-bc43-4e85-86e2-a6fad62d3086,1,LT,False,2018-01-13,2016-12-06 17:38:35.872000,ACTIVE,LT,1997,PASSED,...,FAILED,2017-10-24 10:07:59.63,,,misc,c2c667e2-bc43-4e85-86e2-a6fad62d3086,TOPUP,HERA,03e774af-eb47-4fce-85f0-9782b7465985,100.0
553055,ed8913ba-8f6e-4e72-89c8-65e1725c24bb,1,GB||JE||IM||GG,False,2018-09-20,2017-07-04 05:08:56.646000,ACTIVE,GB,1996,PASSED,...,COMPLETED,2017-11-04 17:26:28.558,,USA,chip,ed8913ba-8f6e-4e72-89c8-65e1725c24bb,CARD_PAYMENT,GAIA,15c385b8-3519-43d8-a551-7102ece5fe5a,538.0
366592,d5e696b5-5d00-46b8-9117-d515a82ee384,1,GB||JE||IM||GG,False,2018-05-25,2018-03-14 10:28:16.725000,ACTIVE,GB,1963,PASSED,...,COMPLETED,2018-05-27 09:46:03.338,point_of_interest,ITA,chip,d5e696b5-5d00-46b8-9117-d515a82ee384,CARD_PAYMENT,GAIA,e295eefe-1985-44d0-826f-d74002bd33a8,845.0
176009,8250bc09-841c-42cc-af55-435e963c792d,1,GB||JE||IM||GG,False,2018-09-20,2016-11-28 11:38:52.220000,ACTIVE,GB,1987,PASSED,...,FAILED,2018-07-16 07:32:38.467,,,misc,8250bc09-841c-42cc-af55-435e963c792d,TOPUP,HERA,916a211e-cad0-4e80-a0fe-f9c9853cb295,29954.0
297615,c069a845-b843-475e-af4f-a27e379f1c76,1,GB||JE||IM||GG,False,2018-03-20,2016-03-09 13:10:05.142000,ACTIVE,GB,1967,PASSED,...,COMPLETED,2018-02-18 14:10:49.114,,,misc,c069a845-b843-475e-af4f-a27e379f1c76,TOPUP,HERA,17c4afb9-132b-4512-a4ff-2c1035cffd3b,3698.0
