In [1]:
import re

import numpy as np
import pandas as pd

from googletrans import Translator

from sklearn.model_selection import train_test_split

# !pip install googletrans==3.1.0a0

# Scammer profiles

In [2]:
scammers_df = pd.read_csv('../data/interim/scam_profiles.csv')

In [3]:
# replace "–" with NaN
scammers_df.replace("–", np.nan, inplace=True)
scammers_df.head()

Unnamed: 0,username,name,age,location,ethnicity,occupation,status,phone,inet,email,description,messages,justifications
0,fredJ,Fred Millestone,49,"San Diego, California, United States",white,military,,,178.238.213.161,fred.millestone@yahoo.com,"I am a simple and easy going man,I do not like...",hello beauty can we chat? add me fred.millesto...,IP is a proxy\nIP doesn’t correspond location\...
1,donald200,Donald Steve,46,Ireland,white,Self Employed,widowed,widowed,164.82.146.3,donaldsteve200@yahoo.com,"Am smart, organized, intelligent, honest, cari...","How are you doing,I’m Donald from Ireland… am ...",IP is a proxy\nIP doesn’t correspond location\...
2,huddleman,Peter,45,"Arlington, or Andrews, or Dallas, Texas, Unite...",white,military,widowed,widowed,69.115.19.232,frankiess8928@gmail.com,"I am sweet, thoughtful, kind, intelligent, and...",,IP is a proxy\nIP doesn’t correspond location\...
3,jannycutelove,Jane Douglas,33,"Vancouver, Canada",white,student,single,single,67.191.26.153,janedouglas231@yahoo.com,"I am an attractive, physically and mentally he...",,IP is a proxy\nIP doesn’t correspond location\...
4,Richardholdy,David Holdridge,47,"Bedford, Pennsylvania, United states",white,military,widowed,widowed,199.15.250.244,david_holdy@yahoo.com,"I am a loyal,friendly ,and always supportive ....",Its really a pleasure to hear from back from u...,IP is a proxy\nIP doesn’t correspond location\...


In [4]:
# drop non overlapping columns
scammers_df = scammers_df.drop(["username", "name", "phone", "inet", "email", "messages", "justifications"], axis=1)
scammers_df = scammers_df.dropna().reset_index(drop=True)

In [5]:
# add "scam" feature
scammers_df.loc[:, "scam"] = 1

# Real profiles

In [6]:
real_df = pd.read_csv('../data/interim/real_profiles.csv')

In [7]:
real_df.replace("-", np.nan, inplace=True)
real_df.head()

Unnamed: 0,gender,age,location,status,username,ethnicity,occupation,description,match_age,children,orientation,religion,smoking,drinking,intent
0,male,50 y.o.,"East Chicago, IN, USA",widowed,Royer6873,hispanic,,,from 19 to 86,want children,Straight,Other,non-smoker,occasional drinker,Serious Relationship
1,female,42 y.o.,"Mykolaiv, Mykolaiv Oblast, Ukraine, 54000",single,uaola1981,white,,,from 19 to 75,no children,Straight,Other,non-smoker,social drinker,"Friendship, Romance, Serious Relationship, Mar..."
2,female,58 y.o.,"Tulsa, OK, USA",single,Dessarono55,black,Logistics,,from 50 to 60,1-2 living elsewhere,Straight,Spiritual,non-smoker,social drinker,"Serious Relationship, Marriage"
3,male,60 y.o.,"Algiers [El Djazaïr], Algeria",single,maliktabib1964,white,,,from 19 to 50,no children,Straight,Muslim,non-smoker,never,Friendship
4,male,28 y.o.,"Santo Domingo de los Tsáchilas, Ecuador",single,Jaron26,hispanic,Ingeniero Agropecuario,,from 25 to 30,no children,Straight,Christian,non-smoker,never,"Fun, Friendship, Romance, Serious Relationship..."


In [8]:
# extract just the age from the age column
real_df["age"] = real_df["age"].str.extract(r'(\d+)')

In [9]:
# drop non overlapping columns
real_df = real_df.drop(["gender", "username", "children", "orientation", "religion", "smoking", "drinking", "intent", "match_age"], axis=1)

In [10]:
# Add "scam" feature
real_df.loc[:, "scam"] = 0

# Combine dataframes

In [11]:
df_combined = pd.concat([scammers_df, real_df], ignore_index=True)
df_combined = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined = df_combined.dropna().reset_index(drop=True)
df_combined

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam
0,45,"New York, United states",white,construct engineering,divorced,"I’m balanced: secure enough to be vulnerable, ...",1
1,22,"Debry, United Kingdom",mixed,self employed,single,easygoing young girl looking for a nice partner,1
2,49,"Providence, Utah, United States",white,military,divorced,I enjoy a variety of things. I am a 49 years o...,1
3,48,"Castrop-Rauxel, Germany, or New York, or Los A...",white,banker,widowed,I am an optimistic person who has different in...,1
4,48,"Miami, Florida, United States",white,business woman,widowed,"am a gentle woman, i love going to church, wil...",1
...,...,...,...,...,...,...,...
6238,65,"South Carolina, United States",white,military,single,My name is Frank j Grass from Anorld Missouri ...,1
6239,53,"Austin, Texas, United States",mixed,construction,divorced,I am michalis from Greece living and working a...,1
6240,50,"London, United Kingdom",mixed,contractor,widowed,I believe that you should treat others the way...,1
6241,37,"Durban, South Africa",black,Police man,single,I love being with my women n spend time with her,0


In [12]:
# find statuses that are only present once
df_combined.loc[:, "status"] = df_combined["status"].apply(lambda x : x.lower())
statuses = np.asarray(df_combined["status"].value_counts().index)
idx_to_remove = np.where(np.asarray(df_combined["status"].value_counts().where(lambda x : x == 1, 0)))[0]
status_to_remove = statuses[idx_to_remove]

# remove statuses that only have 1 count
df_combined = df_combined[~df_combined["status"].isin(status_to_remove)]
df_combined = df_combined[df_combined["status"] != "single or divorced"]

# check counts
df_combined["status"].value_counts()

status
single             3466
divorced           1186
widowed            1091
separated           381
married              51
in relationship      40
widower              10
widow                 4
Name: count, dtype: int64

In [13]:
# standardize status
df_combined.loc[:, "status"] = df_combined["status"].apply(lambda x : "widowed" if x == "widower" else x)
df_combined.loc[:, "status"] = df_combined["status"].apply(lambda x : "widowed" if x == "widow" else x)

In [14]:
# drop rows in scammers_df if "age" column contains more than just numbers
df_combined = df_combined[df_combined["age"].apply(lambda x : re.match("^\d+$", str(x)) is not None)]

# convert "age" column to integer
df_combined.loc[:, "age"] = df_combined["age"].astype(int)

# bin the ages value in scam dataframe
bins = [0, 20, 30, 40, 50, 60, 70, 80, 90, 100]
labels = ['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100']
df_combined['age_group'] = pd.cut(df_combined['age'], bins=bins, labels=labels, right=False)
df_combined['age_group'] = df_combined['age_group'].astype('object')

In [15]:
# extract country
df_combined['country'] = df_combined['location'].apply(lambda x: x.split(',')[-1].strip())

# Additional work to make the data more readable

Translating Descriptions to English 

In [16]:
import time
import pandas as pd
from requests.exceptions import ReadTimeout

def translate_text(text, target_language='en'):
    translator = Translator()
    translated_text = translator.translate(text, dest=target_language)
    return translated_text.text

def translate_chunk(chunk, target_language='en'):
    print(chunk)
    translated_chunk = chunk.apply(lambda x: translate_text(x, target_language))
    return translated_chunk

def translate_column(df, columns_to_translate, target_language='en', chunk_size=100, pause_duration=5, max_retries=3):
    for column_name in columns_to_translate:
        num_rows = len(df)
        for i in range(0, num_rows, chunk_size):
            retries = 0
            while retries < max_retries:
                try:
                    chunk = df.iloc[i:i+chunk_size][column_name]
                    translated_chunk = translate_chunk(chunk, target_language)
                    df.iloc[i:i+chunk_size, df.columns.get_loc(column_name)] = translated_chunk
                    print(f"Translated {len(translated_chunk)} rows. Pausing for {pause_duration} seconds...")
                    time.sleep(pause_duration)
                    break  # Exit the retry loop if translation succeeds
                except ReadTimeout:
                    print(f"Timeout error occurred. Retrying...")
                    retries += 1
            else:
                print(f"Max retries reached. Skipping chunk.")
                continue

    # Save the translated dataframe to a new CSV file
    # df.to_csv(output_csv_path, index=False)
    return df

In [17]:
df_combined = translate_column(df_combined, ["description"])

0      I’m balanced: secure enough to be vulnerable, ...
1        easygoing young girl looking for a nice partner
2      I enjoy a variety of things. I am a 49 years o...
3      I am an optimistic person who has different in...
4      am a gentle woman, i love going to church, wil...
                             ...                        
100                 Alegre, sincero, hogareño, romantico
101    My friends consider me to be an affectionate, ...
102    I’m simple, kind, loyal, serious, focus, depen...
103    Cariñoso,tranquilo, entregado a mi pareja,resp...
104    Hola, soy tranquilo, educado, amigable, educac...
Name: description, Length: 100, dtype: object


Translated 100 rows. Pausing for 5 seconds...
105    God Fearing & Disciplined/Oriented Person. I a...
106    Soy una persona alegre trabajadora y emprendedora
107    am an easy going man, good to be with,sociable...
108    Me gusta cocinar y la musica me encanta bailar...
109    am steve johnson am searching for a good and h...
                             ...                        
201    I’m open, sensual, kind, gentle, romantic…I’m ...
202    Me gusta pasarla bien, disfrutar buenos moment...
203    I would rather describe myself as a straight p...
204    I want a serious woman who want to be married....
205    Soy una persona sencilla, busco un compañero, ...
Name: description, Length: 100, dtype: object
Translated 100 rows. Pausing for 5 seconds...
206    Me gusta conocer nuevas culturas , compartir c...
207    I am easy-going and enjoy doing just about any...
208    cumplo mis metas, soy muy alegre, soy una chic...
209              Me gustan mayores, quiero pasarla rico.
210    

# Scaling down the categories

Employment type

In [None]:
def employ(level):
    level = level.lower()
    if any(x in level for x in ['army', 'milit', 'marine', 'soldier', 'captain', 'general', 'solda', 'force']):
        return "military"
    elif any(x in level for x in ['stud?ent', 'studi', 'coll']):
        return "student"
    elif any(x in level for x in ['self', 'own', 'independ', 'entre', 'freelanc', 'propia', 'autonomo']):
        return "self-employed"
    elif any(x in level for x in ['engin', 'ingenier', 'mechanic', 'mecanic', 'automot']):
        return "engineering"
    elif any(x in level for x in ['gover', 'civil', 'public', '^un[$ ]']):
        return "government"
    elif any(x in level for x in ['academ', 'profes', 'research', 'lectur', 'universi', 'ologist', 'phd']):
        return "academic"
    elif any(x in level for x in ['nurs', 'enfermer', 'care', 'trainer', 'nanny', 'baby', 'niñera', 'social']):
        return "carer"
    elif any(x in level for x in ['construc', 'carpent', 'roof', 'build', 'survey', 'ass?es', 'crane', 'equipment']):
        return "construction"
    elif any(x in level for x in ['secur', 'detect', 'polic', 'investig', 'guard', 'custod', 'correct']):
        return "security"
    elif any(x in level for x in ['econom', 'analy']):
        return "analyst"
    elif any(x in level for x in ['farm', 'agri']):
        return "agriculture"
    elif any(x in level for x in ['sail', 'sea', 'fish']):
        return "naval"
    elif any(x in level for x in ['weld', 'factory', 'manufact', 'machin', 'industr']):
        return "manufacturing"
    elif any(x in level for x in ['tech', 'inform', '^it[$ ]', 'telecom', 'téch', 'software', 'sistem', 'system', 'tecnico', 'técnico', 'program', 'network', 'comput', 'electro', 'teck', 'develop']):
        return "technology"
    elif any(x in level for x in ['retail', 'comerci', 'shop', 'clerk', 'store', 'wait', 'vend', 'sell', 'cashier', 'assist', 'tender', 'customer', 'asist', 'mesero', 'restaur', 'camarer']):
        return "service"
    elif any(x in level for x in ['tour', 'holiday', 'vacat', 'steward', 'flight', 'travel', 'turis', 'hotel']):
        return "tourism"
    elif any(x in level for x in ['sale', 'market', 'ventas']):
        return "sales"
    elif any(x in level for x in ['writ', 'journal', 'period']):
        return "writing"
    elif any(x in level for x in ['handy', 'repair', 'repare', 'maint', 'plumb', 'electr', 'manteni', 'hvac']):
        return "repair"
    elif 'estat' in level:
        return "real estate"
    elif any(x in level for x in ['teach', 'educa', 'docen', 'maestr', 'lehr']):
        return "teacher"
    elif any(x in level for x in ['manag', 'supervis']):
        return "manager"
    elif 'contra' in level:
        return "contractor"
    elif any(x in level for x in ['ama de casa', 'wife', 'mother', 'mom', 'home', 'hogar']):
        return "housewife"
    elif any(x in level for x in ['unemploy', 'desempl', 'not work']):
        return "unemployed"
    elif any(x in level for x in ['financ', 'bank', 'insur', 'trad', 'negoci', 'cajero']):
        return "finance"
    elif any(x in level for x in ['chef', 'cook', 'bake', 'co[cs]iner', 'hospitali', 'food']):
        return "hospitality"
    elif any(x in level for x in ['secret', 'admin', 'recep', 'office', 'human resources', 'clerical', 'profec', 'entry']):
        return "clerical"
    elif any(x in level for x in ['driver', 'transport', 'deliver', 'ship', 'chofer', 'pilot', 'logist', 'cargo']):
        return "transport"
    elif any(x in level for x in ['housekeep', 'clean', 'limpi', 'janitor']):
        return "cleaner"
    elif any(x in level for x in ['architec', 'arquitec']):
        return "architect"
    elif any(x in level for x in ['account', 'contad']):
        return "accounting"
    elif any(x in level for x in ['law', 'judge', 'solicitor', 'barrister', 'legal', 'attorney', 'abogad']):
        return "legal"
    elif any(x in level for x in ['music', 'sport', 'play', 'produc', 'músico', 'deporti', 'conduc', 'soccer']):
        return "entertainment"
    elif any(x in level for x in ['artist', 'art', 'paint', 'sculpt', 'boutique', 'photo', 'foto', 'choreo']):
        return "artist"
    elif any(x in level for x in ['jewel', 'antiq', 'print']):
        return "specialist"
    elif any(x in level for x in ['doctor', 'physic', 'ician', 'medic', 'psicolog', 'terap', 'therap', 'salud', 'health', 'médic', 'surgeon', 'denti', 'pharma']):
        return "medical"
    elif any(x in level for x in ['beaut', 'styl', 'estili', 'peluquer', 'hair', 'salon', 'manic']):
        return "beauty"
    elif any(x in level for x in ['fashion', 'model']):
        return "fashion"
    elif any(x in level for x in ['design', 'decor', 'flower', 'desiñ', 'deisñ']):
        return "designer"
    elif any(x in level for x in ['warehouse', 'work', 'opera', 'obrer', 'labor', 'labour', 'landscap', 'mining', 'mine', 'load', 'trabajo', 'pack', 'foreman']):
        return "manual"
    elif any(x in level for x in ['bus[iy]?nes', 'empresa', 'execut', 'direct', 'ceo', 'ejecut']):
        return "business"
    elif 'consult' in level:
        return "consultant"
    elif any(x in level for x in ['retir', 'jubilad', 'pension']):
        return "retired"
    elif 'disab' in level:
        return "disabled"
    elif pd.isnull(level):
        return None
    else:
        return "other"

Ethnic type

In [None]:
def ethnise(level):
    if level.lower() not in ['asian', 'black', 'hispanic', 'middle eastern', 'mixed', 'native american', 'pacific islander', 'white']:
        return "other"
    else:
        return level

In [None]:
df_combined.loc[:, "occupation"] = df_combined["occupation"].apply(employ)
df_combined.loc[:, "ethnicity"] = df_combined["ethnicity"].apply(ethnise)

In [None]:
df_combined.dtypes
cols_to_categorical = ['ethnicity', 'occupation', 'status', 'country', 'age_group']
df_combined[cols_to_categorical] = df_combined[cols_to_categorical].astype('category')

df_combined.dtypes

age              object
location         object
ethnicity      category
occupation     category
status         category
description      object
scam              int64
age_group      category
country        category
dtype: object

In [None]:
df_combined.to_csv('../data/interim/combined_profiles.csv', index=False)

# Split into train and test

In [None]:
df_all = pd.read_excel("../data/interim/combined_profiles_sorted.xlsx")

In [None]:
df_all

Unnamed: 0,age,location,ethnicity,occupation,status,description,scam,age_group,country
0,35,"Kabul, Afghanistan",native american,military,single,I love to laugh and to enjoy the simple things...,1,31-40,Afghanistan
1,41,"Phoenix, Arizona, United States or Kabul, Afgh...",white,military,divorced,"I?€?m a honest, loving, and trust worthy man l...",1,41-50,Afghanistan
2,46,"Charlotte, North Carolina, United States, or K...",native american,military,widowed,My heart is like an open book it?€?s depend on...,1,41-50,Afghanistan
3,49,"Rockville, Maryland, United States, or Kabul, ...",white,government,widowed,"Hello,i am here to look for a good relationshi...",1,41-50,Afghanistan
4,47,"Kabul, Afghanistan",white,military,widowed,"My name is Jeffery Yates, I am 47 years old an...",1,41-50,Afghanistan
...,...,...,...,...,...,...,...,...,...
5964,30,"Ho Chi Minh City, Ho Chi Minh, Vietnam",asian,self-employed,single,"An open-minded, friendly and helpful girl! A t...",0,31-40,Vietnam
5965,28,"Zabid, Yemen",white,military,single,I am single definitely no kid. My personality ...,1,21-30,Yemen
5966,61,"Mpika, Zambia",white,self-employed,separated,"Loves life with plenty of energy, zeal and ent...",0,61-70,Zambia
5967,55,"Lusaka, Zambia",black,artist,separated,"easy going, adventurous, creative. Loves to so...",0,51-60,Zambia


In [None]:
train_df, test_df = train_test_split(df_all, test_size=0.2, random_state=42)

In [None]:
train_df.to_csv('../data/processed/train_profiles.csv', index=False)
test_df.to_csv('../data/processed/test_profiles.csv', index=False)