In [22]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [40]:
df = pd.read_csv("Dataset_con_Localizaci_n_Madrid.csv")

df.head()


Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,likes_received,mutual_matches,profile_pics_count,bio_length,message_sent_count,emoji_usage_rate,last_active_hour,swipe_time_of_day,match_outcome,madrid_location
0,Prefer Not to Say,Gay,Urban,High,Bachelor’s,"Fitness, Politics, Traveling",52,Moderate,0.6,Optimistic,173,23,4,44,75,0.36,13,Early Morning,Mutual Match,Chamberí
1,Male,Bisexual,Suburban,Upper-Middle,No Formal Education,"Languages, Fashion, Parenting",279,Extreme User,0.56,Optimistic,107,7,3,301,35,0.42,0,Morning,Chat Ignored,Las Rozas
2,Non-binary,Pansexual,Suburban,Low,Master’s,"Movies, Reading, DIY",49,Moderate,0.41,Optimistic,91,27,2,309,33,0.41,1,After Midnight,Date Happened,Parla
3,Genderfluid,Gay,Metro,Very Low,Postdoc,"Coding, Podcasts, History",185,Extreme User,0.32,Balanced,147,6,5,35,5,0.07,21,Morning,No Action,Lavapiés
4,Male,Bisexual,Urban,Middle,Bachelor’s,"Clubbing, Podcasts, Cars",83,High,0.32,Balanced,94,11,1,343,34,0.11,22,After Midnight,One-sided Like,Usera


In [41]:
# reviso el tamaño del df

print(f"El número de filas que tenemos es {df.shape[0]}, y el número de columnas es {df.shape[1]}")

El número de filas que tenemos es 50000, y el número de columnas es 20


In [42]:
df["gender"].unique()

array(['Prefer Not to Say', 'Male', 'Non-binary', 'Genderfluid', 'Female',
       'Transgender'], dtype=object)

In [43]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
app_usage_time_min,50000.0,149.9124,86.990521,0.0,74.0,150.0,225.0,300.0
swipe_right_ratio,50000.0,0.500655,0.197468,0.0,0.37,0.5,0.64,1.0
likes_received,50000.0,99.52604,57.996799,0.0,49.0,100.0,150.0,200.0
mutual_matches,50000.0,13.87028,9.105615,0.0,6.0,13.0,22.0,30.0
profile_pics_count,50000.0,2.98772,1.99678,0.0,1.0,3.0,5.0,6.0
bio_length,50000.0,250.1744,144.800996,0.0,125.0,250.0,376.0,500.0
message_sent_count,50000.0,50.07194,29.168,0.0,25.0,50.0,75.0,100.0
emoji_usage_rate,50000.0,0.286205,0.160042,0.0,0.16,0.27,0.39,0.94
last_active_hour,50000.0,11.5218,6.920474,0.0,5.0,12.0,18.0,23.0


In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   gender                50000 non-null  object 
 1   sexual_orientation    50000 non-null  object 
 2   location_type         50000 non-null  object 
 3   income_bracket        50000 non-null  object 
 4   education_level       50000 non-null  object 
 5   interest_tags         50000 non-null  object 
 6   app_usage_time_min    50000 non-null  int64  
 7   app_usage_time_label  50000 non-null  object 
 8   swipe_right_ratio     50000 non-null  float64
 9   swipe_right_label     50000 non-null  object 
 10  likes_received        50000 non-null  int64  
 11  mutual_matches        50000 non-null  int64  
 12  profile_pics_count    50000 non-null  int64  
 13  bio_length            50000 non-null  int64  
 14  message_sent_count    50000 non-null  int64  
 15  emoji_usage_rate   

In [45]:
df.duplicated().sum()

np.int64(0)

In [46]:
df.nunique()

gender                      6
sexual_orientation          8
location_type               6
income_bracket              7
education_level             9
interest_tags           40206
app_usage_time_min        301
app_usage_time_label        7
swipe_right_ratio         101
swipe_right_label           4
likes_received            201
mutual_matches             31
profile_pics_count          7
bio_length                501
message_sent_count        101
emoji_usage_rate           93
last_active_hour           24
swipe_time_of_day           6
match_outcome              10
madrid_location            21
dtype: int64

In [47]:
df.isna().sum() / df.shape[0] *100

gender                  0.0
sexual_orientation      0.0
location_type           0.0
income_bracket          0.0
education_level         0.0
interest_tags           0.0
app_usage_time_min      0.0
app_usage_time_label    0.0
swipe_right_ratio       0.0
swipe_right_label       0.0
likes_received          0.0
mutual_matches          0.0
profile_pics_count      0.0
bio_length              0.0
message_sent_count      0.0
emoji_usage_rate        0.0
last_active_hour        0.0
swipe_time_of_day       0.0
match_outcome           0.0
madrid_location         0.0
dtype: float64

In [48]:
duplicados = df.duplicated().sum()
print(f"Filas duplicadas encontradas: {duplicados}")

Filas duplicadas encontradas: 0


In [49]:
for columna in df:
    print(f'La columna {columna} tiene estos valores únicos:')
    print(df[columna].unique())
    print('----------------------------')

La columna gender tiene estos valores únicos:
['Prefer Not to Say' 'Male' 'Non-binary' 'Genderfluid' 'Female'
 'Transgender']
----------------------------
La columna sexual_orientation tiene estos valores únicos:
['Gay' 'Bisexual' 'Pansexual' 'Lesbian' 'Asexual' 'Queer' 'Straight'
 'Demisexual']
----------------------------
La columna location_type tiene estos valores únicos:
['Urban' 'Suburban' 'Metro' 'Small Town' 'Remote Area' 'Rural']
----------------------------
La columna income_bracket tiene estos valores únicos:
['High' 'Upper-Middle' 'Low' 'Very Low' 'Middle' 'Lower-Middle'
 'Very High']
----------------------------
La columna education_level tiene estos valores únicos:
['Bachelor’s' 'No Formal Education' 'Master’s' 'Postdoc' 'Associate’s'
 'High School' 'Diploma' 'PhD' 'MBA']
----------------------------
La columna interest_tags tiene estos valores únicos:
['Fitness, Politics, Traveling' 'Languages, Fashion, Parenting'
 'Movies, Reading, DIY' ... 'Traveling, Fitness, Stand-up

In [50]:
# Detecto las columnas objects:

col_objects = df.select_dtypes(include='object').columns.tolist()

col_objects

['gender',
 'sexual_orientation',
 'location_type',
 'income_bracket',
 'education_level',
 'interest_tags',
 'app_usage_time_label',
 'swipe_right_label',
 'swipe_time_of_day',
 'match_outcome',
 'madrid_location']

In [51]:
# hago una funcion de limpieza de texto.

def limpiar_texto(col):
    return col.str.strip().str.title() # .strip -> elimino espacios  // paso a Title para respetar los nombres propios.

In [52]:
# aplico la funcion a mis columnas categoricas.

for col in col_objects:
    df[col] = limpiar_texto(df[col])

col_objects

['gender',
 'sexual_orientation',
 'location_type',
 'income_bracket',
 'education_level',
 'interest_tags',
 'app_usage_time_label',
 'swipe_right_label',
 'swipe_time_of_day',
 'match_outcome',
 'madrid_location']

In [53]:
# Generar edades entre 18 y 50
np.random.seed(42)  # para que sea reproducible
df["age"] = np.random.randint(18, 51, size=len(df))

In [54]:
df["age"].head()

0    46
1    32
2    25
3    38
4    36
Name: age, dtype: int64

In [56]:
df.sample(3)

Unnamed: 0,gender,sexual_orientation,location_type,income_bracket,education_level,interest_tags,app_usage_time_min,app_usage_time_label,swipe_right_ratio,swipe_right_label,likes_received,mutual_matches,profile_pics_count,bio_length,message_sent_count,emoji_usage_rate,last_active_hour,swipe_time_of_day,match_outcome,madrid_location,age
22481,Male,Pansexual,Urban,Low,Phd,"Skating, Traveling, Makeup",117,High,0.44,Optimistic,65,9,1,341,88,0.18,10,Early Morning,One-Sided Like,Tetuán,33
19500,Non-Binary,Queer,Remote Area,Lower-Middle,No Formal Education,"Investing, Coding, Parenting",299,Extreme User,0.57,Optimistic,106,5,4,464,0,0.21,4,Morning,Chat Ignored,Otro,49
47412,Non-Binary,Pansexual,Metro,Middle,Associate’S,"Spirituality, Pets, Anime",118,High,0.84,Swipe Maniac,158,17,4,58,92,0.38,19,Early Morning,Date Happened,Lavapiés,32


In [57]:
df["madrid_location"].unique()

array(['Chamberí', 'Las Rozas', 'Parla', 'Lavapiés', 'Usera', 'Otro',
       'Centro', 'Majadahonda', 'Tetuán', 'La Latina', 'Pinto', 'Getafe',
       'Alcorcón', 'Arganzuela', 'Salamanca', 'Fuenlabrada',
       'Carabanchel', 'Moncloa', 'Retiro', 'Pozuelo De Alarcón',
       'Leganés'], dtype=object)

In [58]:
df["madrid_location"].value_counts()

madrid_location
Otro                  25193
Tetuán                 1987
Usera                  1943
Carabanchel            1910
La Latina              1687
Centro                 1677
Lavapiés               1671
Arganzuela             1640
Moncloa                1618
Parla                  1587
Pinto                  1584
Fuenlabrada            1549
Salamanca               815
Majadahonda             805
Las Rozas               802
Pozuelo De Alarcón      800
Chamberí                790
Retiro                  773
Leganés                 402
Getafe                  387
Alcorcón                380
Name: count, dtype: int64