In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re 
import pandas as pd
import json

In [2]:
# Load the raw export. The loop below iterates `data` and indexes every element
# by "_id", "appOpens" and "conversationsMeta", so it must be a sequence of dicts.
json_path = "tinder_json_og.json"
# JSON text is UTF-8 by specification (RFC 8259). Pin the encoding so the
# platform's locale default (e.g. cp1252 on Windows) cannot garble or reject
# non-ASCII characters in the file.
with open(json_path, 'r', encoding="utf-8") as file:
    data = json.load(file)

# Keys copied verbatim from each record's "conversationsMeta" mapping. This one
# list drives both the row extraction and the column labels, so the two cannot
# drift out of order.
conversation_fields = [
    "nrOfConversations",
    "longestConversation",
    "longestConversationInDays",
    "averageConversationLength",
    "averageConversationLengthInDays",
    "medianConversationLength",
    "medianConversationLengthInDays",
    "nrOfOneMessageConversations",
    "percentOfOneMessageConversations",
    "nrOfGhostingsAfterInitialMessage",
]

# Build one tuple per record: identifier, app-open aggregates, then the
# conversation statistics in `conversation_fields` order.
rows = []
for item in data:
    _id = item["_id"]
    app_opens = item["appOpens"]
    sum_app_opens = sum(app_opens.values())  # total of every value in appOpens
    # Number of appOpens entries (presumably one entry per day - confirm
    # against the export format).
    no_of_days = len(app_opens)
    conversations_meta = item["conversationsMeta"]
    row = (_id, sum_app_opens, no_of_days) + tuple(
        conversations_meta[field] for field in conversation_fields
    )
    rows.append(row)

# Column labels, in the same order as the tuple built above.
columns = ["_id", "sum_app_opens", "no_of_days"] + conversation_fields

# Create a DataFrame with the extracted information
df = pd.DataFrame(rows, columns=columns)



In [3]:
# Preview the first rows of the app-open / conversation summary frame.
df.head()

Unnamed: 0,_id,sum_app_opens,no_of_days,nrOfConversations,longestConversation,longestConversationInDays,averageConversationLength,averageConversationLengthInDays,medianConversationLength,medianConversationLengthInDays,nrOfOneMessageConversations,percentOfOneMessageConversations,nrOfGhostingsAfterInitialMessage
0,00b74e27ad1cbb2ded8e907fcc49eaaf,6839,477,739,133,683.557442,8.560217,10.23662,3,0.081134,226,30.581867,66
1,024610702baf540af5637873cd1534e9,26280,1197,464,36,622.466528,4.49569,3.97398,2,6.9e-05,56,12.068966,16
2,0a5e3dd8489fe67485ddb7d6adb26ebd,3196,249,303,33,295.283414,4.254125,3.465479,2,0.033715,106,34.983498,17
3,048dd37565ad9cbc24c163ffedffbf58,2077,158,47,28,130.03441,7.893617,5.878985,6,0.643727,3,6.382979,0
4,0eb998fdde77f9c123c07eace18a5cc1,11946,715,809,444,198.390972,6.824475,1.924996,3,0.091771,296,36.588381,13


In [4]:
def calculate_matches(data):
    """Summarise the "matches" mapping of every record.

    Args:
        data: Iterable of dicts, each holding an "_id" entry and a "matches"
            mapping whose values can be summed.

    Returns:
        A list with one ``(_id, no_of_days, no_of_matches)`` tuple per record,
        in input order. ``no_of_days`` is the number of entries in "matches"
        and ``no_of_matches`` is the sum of its values.
    """

    def _summarise(record):
        # Read the identifier first, then the mapping, then aggregate.
        identifier = record["_id"]
        per_entry_counts = record["matches"]
        return identifier, len(per_entry_counts), sum(per_entry_counts.values())

    return [_summarise(record) for record in data]

# `data` is reused from the earlier json.load of tinder_json_og.json. This cell
# already needs `df` from that same cell, so parsing the file a second time
# only repeated the I/O without changing the result.
matches_rows = calculate_matches(data)

# Define the columns
matches_columns = ["_id", "no_of_days", "no_of_matches"]

# Create a DataFrame with the extracted information
matches_df = pd.DataFrame(matches_rows, columns=matches_columns)

# Left-join the match totals onto the existing frame by `_id`.
# NOTE: `df` already has a `no_of_days` column, so pandas applies its default
# suffixes and the result carries `no_of_days_x` (from `df`) and
# `no_of_days_y` (from `matches_df`). The names are kept as-is because the
# exported CSV exposes them.
final_df = pd.merge(df, matches_df, on="_id", how="left")




In [5]:
# Inspect the merge result. Both inputs had a `no_of_days` column, so expect a
# `no_of_days_x` / `no_of_days_y` pair alongside the new `no_of_matches`.
final_df.head()

Unnamed: 0,_id,sum_app_opens,no_of_days_x,nrOfConversations,longestConversation,longestConversationInDays,averageConversationLength,averageConversationLengthInDays,medianConversationLength,medianConversationLengthInDays,nrOfOneMessageConversations,percentOfOneMessageConversations,nrOfGhostingsAfterInitialMessage,no_of_days_y,no_of_matches
0,00b74e27ad1cbb2ded8e907fcc49eaaf,6839,477,739,133,683.557442,8.560217,10.23662,3,0.081134,226,30.581867,66,477,3408
1,024610702baf540af5637873cd1534e9,26280,1197,464,36,622.466528,4.49569,3.97398,2,6.9e-05,56,12.068966,16,1197,888
2,0a5e3dd8489fe67485ddb7d6adb26ebd,3196,249,303,33,295.283414,4.254125,3.465479,2,0.033715,106,34.983498,17,249,548
3,048dd37565ad9cbc24c163ffedffbf58,2077,158,47,28,130.03441,7.893617,5.878985,6,0.643727,3,6.382979,0,158,94
4,0eb998fdde77f9c123c07eace18a5cc1,11946,715,809,444,198.390972,6.824475,1.924996,3,0.091771,296,36.588381,13,715,1905


In [6]:
def calculate_messages(data):
    """Total the sent and received message counts of every record.

    Args:
        data: Iterable of dicts, each holding an "_id" entry and a "messages"
            dict with "sent" and "received" mappings whose values can be
            summed.

    Returns:
        A list with one ``(_id, no_of_msgs_sent, no_of_msgs_received)`` tuple
        per record, in input order.
    """
    totals = []
    for record in data:
        identifier, traffic = record["_id"], record["messages"]
        # "sent" is summed before "received".
        sent_total, received_total = (
            sum(traffic[direction].values()) for direction in ("sent", "received")
        )
        totals.append((identifier, sent_total, received_total))
    return totals

# Column labels for the per-`_id` message totals.
messages_columns = ["_id", "no_of_msgs_sent", "no_of_msgs_received"]

# One (_id, sent, received) tuple per record in `data`.
messages_rows = calculate_messages(data)

# Wrap the tuples in a DataFrame.
messages_df = pd.DataFrame(data=messages_rows, columns=messages_columns)

# Left join on `_id`, so every row already in final_df is kept.
final_df = final_df.merge(right=messages_df, how="left", on="_id")

In [7]:
# Confirm that `no_of_msgs_sent` and `no_of_msgs_received` were appended.
final_df.head()

Unnamed: 0,_id,sum_app_opens,no_of_days_x,nrOfConversations,longestConversation,longestConversationInDays,averageConversationLength,averageConversationLengthInDays,medianConversationLength,medianConversationLengthInDays,nrOfOneMessageConversations,percentOfOneMessageConversations,nrOfGhostingsAfterInitialMessage,no_of_days_y,no_of_matches,no_of_msgs_sent,no_of_msgs_received
0,00b74e27ad1cbb2ded8e907fcc49eaaf,6839,477,739,133,683.557442,8.560217,10.23662,3,0.081134,226,30.581867,66,477,3408,3360,3261
1,024610702baf540af5637873cd1534e9,26280,1197,464,36,622.466528,4.49569,3.97398,2,6.9e-05,56,12.068966,16,1197,888,2088,1509
2,0a5e3dd8489fe67485ddb7d6adb26ebd,3196,249,303,33,295.283414,4.254125,3.465479,2,0.033715,106,34.983498,17,249,548,1291,1152
3,048dd37565ad9cbc24c163ffedffbf58,2077,158,47,28,130.03441,7.893617,5.878985,6,0.643727,3,6.382979,0,158,94,371,224
4,0eb998fdde77f9c123c07eace18a5cc1,11946,715,809,444,198.390972,6.824475,1.924996,3,0.091771,296,36.588381,13,715,1905,5527,6052


In [8]:
def calculate_swipes(data):
    """Total the like and pass swipes of every record.

    Args:
        data: Iterable of dicts, each holding "_id", "swipeLikes" and
            "swipePasses" entries; the latter two are mappings whose values
            can be summed.

    Returns:
        A list with one ``(_id, total_swipe_likes, total_swipe_passes)`` tuple
        per record, in input order.
    """
    # The inner generator fetches all three fields of a record before either
    # mapping is summed.
    fetched = (
        (record["_id"], record["swipeLikes"], record["swipePasses"])
        for record in data
    )
    return [
        (identifier, sum(likes.values()), sum(passes.values()))
        for identifier, likes, passes in fetched
    ]

# Column labels for the per-`_id` swipe totals.
swipes_columns = ["_id", "swipe_likes", "swipe_passes"]

# One (_id, likes, passes) tuple per record in `data`.
swipes_rows = calculate_swipes(data)

# Wrap the tuples in a DataFrame.
swipes_df = pd.DataFrame(data=swipes_rows, columns=swipes_columns)

# Left join on `_id`, so every row already in final_df is kept.
final_df = final_df.merge(right=swipes_df, how="left", on="_id")



In [9]:
# Display (not print) the first rows to confirm that `swipe_likes` and
# `swipe_passes` were appended.
final_df.head()

Unnamed: 0,_id,sum_app_opens,no_of_days_x,nrOfConversations,longestConversation,longestConversationInDays,averageConversationLength,averageConversationLengthInDays,medianConversationLength,medianConversationLengthInDays,nrOfOneMessageConversations,percentOfOneMessageConversations,nrOfGhostingsAfterInitialMessage,no_of_days_y,no_of_matches,no_of_msgs_sent,no_of_msgs_received,swipe_likes,swipe_passes
0,00b74e27ad1cbb2ded8e907fcc49eaaf,6839,477,739,133,683.557442,8.560217,10.23662,3,0.081134,226,30.581867,66,477,3408,3360,3261,23341,24229
1,024610702baf540af5637873cd1534e9,26280,1197,464,36,622.466528,4.49569,3.97398,2,6.9e-05,56,12.068966,16,1197,888,2088,1509,45068,49559
2,0a5e3dd8489fe67485ddb7d6adb26ebd,3196,249,303,33,295.283414,4.254125,3.465479,2,0.033715,106,34.983498,17,249,548,1291,1152,6679,5500
3,048dd37565ad9cbc24c163ffedffbf58,2077,158,47,28,130.03441,7.893617,5.878985,6,0.643727,3,6.382979,0,158,94,371,224,6724,7899
4,0eb998fdde77f9c123c07eace18a5cc1,11946,715,809,444,198.390972,6.824475,1.924996,3,0.091771,296,36.588381,13,715,1905,5527,6052,60169,96673


In [14]:
def extract_user_info(data):
    """Pull selected profile attributes out of each record's "user" mapping.

    Args:
        data: Iterable of dicts, each holding an "_id" entry and a "user"
            dict.

    Returns:
        A list with one 15-element tuple per record, in input order:
        ``_id``, the twelve directly-read user attributes ("birthDate" through
        "spotify"), the "title" of the first entry in "jobs", and
        "educationLevel". Any attribute missing from "user" comes back as
        ``None``; the job title is also ``None`` when "jobs" is absent or
        empty.
    """
    # Attributes copied straight from "user", in output order. The job title
    # is inserted after these and before "educationLevel".
    direct_keys = (
        "birthDate",
        "ageFilterMin",
        "ageFilterMax",
        "cityName",
        "country",
        "createDate",
        "education",
        "gender",
        "interestedIn",
        "genderFilter",
        "instagram",
        "spotify",
    )

    extracted = []
    for record in data:
        identifier = record["_id"]
        profile = record["user"]
        leading_values = [profile.get(key) for key in direct_keys]

        # Only the first job entry is consulted; later entries are ignored.
        job_entries = profile.get("jobs", [])
        if job_entries:
            first_job_title = job_entries[0].get("title")
        else:
            first_job_title = None

        extracted.append(
            (identifier, *leading_values, first_job_title, profile.get("educationLevel"))
        )
    return extracted

# Column labels, in the same order as the tuples from extract_user_info.
user_columns = [
    "_id",
    "birthDate",
    "ageFilterMin",
    "ageFilterMax",
    "cityName",
    "country",
    "createDate",
    "education",
    "gender",
    "interestedIn",
    "genderFilter",
    "instagram",
    "spotify",
    "jobTitle",
    "educationLevel",
]

# One tuple of profile attributes per record in `data`.
user_rows = extract_user_info(data)

# Wrap the tuples in a DataFrame.
user_df = pd.DataFrame(data=user_rows, columns=user_columns)


In [15]:
# Attach the profile attributes with a left join on `_id`, so every row
# already in final_df is kept.
final_df = final_df.merge(right=user_df, how="left", on="_id")

In [16]:
# Display (not print) the first rows to confirm that the user profile columns
# (`birthDate` ... `educationLevel`) were appended.
final_df.head()

Unnamed: 0,_id,sum_app_opens,no_of_days_x,nrOfConversations,longestConversation,longestConversationInDays,averageConversationLength,averageConversationLengthInDays,medianConversationLength,medianConversationLengthInDays,...,country,createDate,education,gender,interestedIn,genderFilter,instagram,spotify,jobTitle,educationLevel
0,00b74e27ad1cbb2ded8e907fcc49eaaf,6839,477,739,133,683.557442,8.560217,10.23662,3,0.081134,...,Norway,2016-01-01T09:30:07.551Z,Has high school and/or college education,M,F,F,False,False,,Has high school and/or college education
1,024610702baf540af5637873cd1534e9,26280,1197,464,36,622.466528,4.49569,3.97398,2,6.9e-05,...,VA,2016-07-12T02:26:46.774Z,Has high school and/or college education,M,F,F,False,False,,Has high school and/or college education
2,0a5e3dd8489fe67485ddb7d6adb26ebd,3196,249,303,33,295.283414,4.254125,3.465479,2,0.033715,...,,2019-07-01T19:17:54.560Z,Has no high school or college education,M,F,F,False,False,,Has no high school or college education
3,048dd37565ad9cbc24c163ffedffbf58,2077,158,47,28,130.03441,7.893617,5.878985,6,0.643727,...,Alberta,2019-09-25T03:28:20.920Z,Has no high school or college education,M,F,F,False,False,,Has no high school or college education
4,0eb998fdde77f9c123c07eace18a5cc1,11946,715,809,444,198.390972,6.824475,1.924996,3,0.091771,...,,2017-11-17T23:30:37.231Z,Has no high school or college education,M,F,F,True,False,Research Assistant,Has no high school or college education


In [17]:
# Take a copy of the fully merged frame.
# NOTE: this rebinds `df`, replacing the 13-column conversation summary that
# was built under the same name at the top of the notebook.
df = final_df.copy()

In [18]:
# Sanity-check the (rows, columns) dimensions of the merged frame.
df.shape

(1209, 33)

In [19]:
# Write the merged frame to CSV in the working directory; index=False leaves
# the DataFrame's row index out of the file.
df.to_csv('Tinder_Data_v2.csv', index=False)
