In [1]:
import json
import csv

# Function to extract the data from JSON and save to CSV
def json_to_csv(json_path, csv_path):
    # Read the JSON file
    with open(json_path, "r") as file:
        data = json.load(file)

    # Extract the required data
    csv_data = []

    for item in data:
        _id = item.get('_id', '')
        conversations = item.get('conversations', [])
    
        for convo in conversations:
            match_id = convo.get('match_id', '')
            messages = convo.get('messages', [])
        
            for msg in messages:
                message_text = msg.get('message')
                # Ensure the message text is valid and not None
                if message_text:
                    csv_data.append([_id, match_id, message_text])

    # Write the data to a CSV file
    with open(csv_path, "w", newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(['_id', 'match_id', 'message'])
        # Write the data
        writer.writerows(csv_data)
# Usage
json_path = "tinder_json_og.json"
csv_output_path = "Tinder_Conversation_Data.csv"
json_to_csv(json_path, csv_output_path)

**Conversation Data - Cleaning**

In [2]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re 
import pandas as pd

pd.set_option('display.max_columns', None)

In [3]:
df = pd.read_csv('Tinder_Conversation_Data.csv')

In [4]:
df.head()

Unnamed: 0,_id,match_id,message
0,024610702baf540af5637873cd1534e9,Match 464,"Hello again, so now that we matched again, wha..."
1,024610702baf540af5637873cd1534e9,Match 463,"Most of the girls on this app scare me, but fo..."
2,024610702baf540af5637873cd1534e9,Match 463,"Dear diary, cute girl vanished... should I sen..."
3,024610702baf540af5637873cd1534e9,Match 462,"Most of the girls on this app scare me, but fo..."
4,024610702baf540af5637873cd1534e9,Match 462,"Well not many people have photos of dogs, and ..."


In [5]:
df.shape

(2209979, 3)

In [6]:
from collections import Counter
import string


# Extract the 'message' column and concatenate the text
text = " ".join(df['message'].dropna())

# Tokenize the text (split by whitespace and remove punctuation)
translator = str.maketrans("", "", string.punctuation)
tokens = text.translate(translator).lower().split()

# Count the occurrences of each word
word_counts = Counter(tokens)



In [7]:
# Get the top 50 words
top_10_words = word_counts.most_common(10)

top_10_df = pd.DataFrame(top_10_words, columns=['word', 'count'])
top_10_df

Unnamed: 0,word,count
0,i,468459
1,you,378310
2,a,325890
3,to,304424
4,the,246500
5,in,186071
6,and,184023
7,it,133189
8,of,129062
9,so,124408


In [8]:
import re

# Function to check if a character is an emoji
def is_emoji(s):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"  
                               "]+", flags=re.UNICODE)
    return re.search(emoji_pattern, s) is not None

# Extract emojis from the text
emojis = [char for char in text if is_emoji(char)]

# Count the occurrences of each emoji
emoji_counts = Counter(emojis)



In [9]:
top_emojis = emoji_counts.most_common(10)
top_emojis_df = pd.DataFrame(top_emojis, columns=['emoji', 'count'])
top_emojis_df

Unnamed: 0,emoji,count
0,😂,82073
1,😅,30137
2,️,27690
3,😉,25775
4,😊,25448
5,😁,18728
6,😄,17414
7,🤔,12664
8,🙂,12411
9,😍,11920


Let's add gender to our dataset.

In [10]:
df2 = pd.read_csv('Tinder_Data_v3_Clean_Edition.csv')

In [11]:
# Merge the gender column from df2 into df using the _id column
df = df.merge(df2[['gender', '_id']], on='_id', how='left')

In [12]:
df.tail()

Unnamed: 0,_id,match_id,message,gender
2209974,81a844f3889e3859b777ef2ea4a07bb0,Match 2,Muss jetzt jeden Tag kalorien zählen haha,M
2209975,81a844f3889e3859b777ef2ea4a07bb0,Match 2,Habe jetzt aber ein home gym,M
2209976,81a844f3889e3859b777ef2ea4a07bb0,Match 1,https&colon;&sol;&sol;media.tenor.com&sol;imag...,M
2209977,81a844f3889e3859b777ef2ea4a07bb0,Match 1,Hey!,M
2209978,81a844f3889e3859b777ef2ea4a07bb0,Match 1,Na wie gehts dir? 😃 ich war bis eben in der Bib,M


Calculating length of characters of first message sent. 

In [13]:
# Group by '_id' and 'match_id' and get the first message for each group
first_messages = df.groupby(['_id', 'match_id']).first().reset_index()

# Calculate the length of the first message for each group
first_messages['Opener'] = first_messages['message'].str.len()

# Merge the 'Opener' column back to the original df
df = df.merge(first_messages[['_id', 'match_id', 'Opener']], on=['_id', 'match_id'], how='left')

df.head()


Unnamed: 0,_id,match_id,message,gender,Opener
0,024610702baf540af5637873cd1534e9,Match 464,"Hello again, so now that we matched again, wha...",M,81
1,024610702baf540af5637873cd1534e9,Match 463,"Most of the girls on this app scare me, but fo...",M,119
2,024610702baf540af5637873cd1534e9,Match 463,"Dear diary, cute girl vanished... should I sen...",M,119
3,024610702baf540af5637873cd1534e9,Match 462,"Most of the girls on this app scare me, but fo...",M,121
4,024610702baf540af5637873cd1534e9,Match 462,"Well not many people have photos of dogs, and ...",M,121


In [15]:
df['Basic Opener'] = df['Opener'] <= 18



In [16]:
df.head()

Unnamed: 0,_id,match_id,message,gender,Opener,Basic Opener
0,024610702baf540af5637873cd1534e9,Match 464,"Hello again, so now that we matched again, wha...",M,81,False
1,024610702baf540af5637873cd1534e9,Match 463,"Most of the girls on this app scare me, but fo...",M,119,False
2,024610702baf540af5637873cd1534e9,Match 463,"Dear diary, cute girl vanished... should I sen...",M,119,False
3,024610702baf540af5637873cd1534e9,Match 462,"Most of the girls on this app scare me, but fo...",M,121,False
4,024610702baf540af5637873cd1534e9,Match 462,"Well not many people have photos of dogs, and ...",M,121,False


In [17]:
# Count the number of messages for each '_id' and 'match_id' group
conv_length = df.groupby(['_id', 'match_id']).size().reset_index(name='Conv Length')

# Merge the 'Conv Length' column back to the original df
df = df.merge(conv_length, on=['_id', 'match_id'], how='left')


In [18]:
df.head()

Unnamed: 0,_id,match_id,message,gender,Opener,Basic Opener,Conv Length
0,024610702baf540af5637873cd1534e9,Match 464,"Hello again, so now that we matched again, wha...",M,81,False,1
1,024610702baf540af5637873cd1534e9,Match 463,"Most of the girls on this app scare me, but fo...",M,119,False,2
2,024610702baf540af5637873cd1534e9,Match 463,"Dear diary, cute girl vanished... should I sen...",M,119,False,2
3,024610702baf540af5637873cd1534e9,Match 462,"Most of the girls on this app scare me, but fo...",M,121,False,4
4,024610702baf540af5637873cd1534e9,Match 462,"Well not many people have photos of dogs, and ...",M,121,False,4


In [19]:
data = df.drop_duplicates(subset='match_id', keep='first')


In [20]:
data['Gif Opener'] = data['message'].str.contains('giphy.com', case=False, na=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Gif Opener'] = data['message'].str.contains('giphy.com', case=False, na=False)


In [21]:
data['question'] = data['message'].str.endswith('?', na=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['question'] = data['message'].str.endswith('?', na=False)


In [22]:
data['Pickup Line'] = (~data['message'].str.endswith('?', na=False)) & (data['Opener'] >= 19)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Pickup Line'] = (~data['message'].str.endswith('?', na=False)) & (data['Opener'] >= 19)


In [23]:
data.head(50)

Unnamed: 0,_id,match_id,message,gender,Opener,Basic Opener,Conv Length,Gif Opener,question,Pickup Line
0,024610702baf540af5637873cd1534e9,Match 464,"Hello again, so now that we matched again, wha...",M,81,False,1,False,True,False
1,024610702baf540af5637873cd1534e9,Match 463,"Most of the girls on this app scare me, but fo...",M,119,False,2,False,False,True
3,024610702baf540af5637873cd1534e9,Match 462,"Most of the girls on this app scare me, but fo...",M,121,False,4,False,False,True
7,024610702baf540af5637873cd1534e9,Match 460,https&colon;&sol;&sol;media2.giphy.com&sol;med...,M,168,False,2,True,False,True
9,024610702baf540af5637873cd1534e9,Match 459,https&colon;&sol;&sol;media3.giphy.com&sol;med...,M,168,False,5,True,False,True
14,024610702baf540af5637873cd1534e9,Match 458,https&colon;&sol;&sol;media3.giphy.com&sol;med...,M,168,False,7,True,False,True
21,024610702baf540af5637873cd1534e9,Match 457,"Yeah, I was there for this summer",M,33,False,2,False,False,True
23,024610702baf540af5637873cd1534e9,Match 456,https&colon;&sol;&sol;media3.giphy.com&sol;med...,M,168,False,2,True,False,True
25,024610702baf540af5637873cd1534e9,Match 454,https&colon;&sol;&sol;media0.giphy.com&sol;med...,M,168,False,7,True,False,True
32,024610702baf540af5637873cd1534e9,Match 453,https&colon;&sol;&sol;media2.giphy.com&sol;med...,M,101,False,5,True,False,True


In [24]:
true_count = data['Basic Opener'].sum()
print(true_count)

6712


In [25]:
data.shape

(12464, 10)

In [26]:
#data.to_csv('Tinder_Conv_Data_v4.csv', index=False)