In [17]:
# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar la normalidad de las distribuciones de las variables
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Read the CSV file (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="okcupid.csv")

# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks,profile_completeness,essay_word_count,profile_views_last_month,messages_sent_last_week,likes_received,mutual_matches,time_spent_daily,swipe_right_ratio,swipe_right_label
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english,100,450,1176,20,147,39,52,0.69,Optimistic
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (...",60,268,1509,7,76,32,41,0.56,Balanced
2,38,available,m,straight,thin,anything,socially,,graduated from masters program,,68.0,-1,,2012-06-27-09-10,"san francisco, california",,has cats,,pisces but it doesn&rsquo;t matter,no,"english, french, c++",90,856,910,21,221,129,29,0.65,Optimistic
3,23,single,m,straight,thin,vegetarian,socially,,working on college/university,white,71.0,20000,student,2012-06-28-14-22,"berkeley, california",doesn't want kids,likes cats,,pisces,no,"english, german (poorly)",70,75,1344,19,57,15,67,0.61,Optimistic
4,29,single,m,straight,athletic,,socially,never,graduated from college/university,"asian, black, other",66.0,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",,likes dogs and likes cats,,aquarius,no,english,50,105,1180,12,57,25,29,0.36,Balanced


In [19]:
# Check the size of the DataFrame: df.shape[0] is the row count, df.shape[1] the column count.

print(f"El número de filas que tenemos es {df.shape[0]}, y el número de columnas es {df.shape[1]}")

El número de filas que tenemos es 59946, y el número de columnas es 30


In [20]:
# Summary statistics of the numeric columns, transposed so each variable is a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,59946.0,32.34029,9.452779,18.0,26.0,30.0,37.0,110.0
height,59943.0,68.295281,3.994803,1.0,66.0,68.0,71.0,95.0
income,59946.0,20033.222534,97346.192104,-1.0,-1.0,-1.0,-1.0,1000000.0
profile_completeness,59946.0,81.047776,27.988025,0.0,70.0,90.0,100.0,100.0
essay_word_count,59946.0,356.896106,295.830482,0.0,159.0,299.0,481.0,10602.0
profile_views_last_month,59946.0,1029.695726,561.287949,50.0,547.0,1033.0,1515.75,1999.0
messages_sent_last_week,59946.0,11.573733,8.537646,0.0,4.0,11.0,18.0,29.0
likes_received,59946.0,111.627031,67.180995,0.0,71.0,105.0,144.0,2174.0
mutual_matches,59946.0,38.718764,30.169834,0.0,18.0,32.0,53.0,665.0
time_spent_daily,59946.0,61.770794,20.187392,5.0,48.0,62.0,75.0,142.0


In [21]:
# Print the per-column non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       59946 non-null  int64  
 1   status                    59946 non-null  object 
 2   sex                       59946 non-null  object 
 3   orientation               59946 non-null  object 
 4   body_type                 54650 non-null  object 
 5   diet                      35551 non-null  object 
 6   drinks                    56961 non-null  object 
 7   drugs                     45866 non-null  object 
 8   education                 53318 non-null  object 
 9   ethnicity                 54266 non-null  object 
 10  height                    59943 non-null  float64
 11  income                    59946 non-null  int64  
 12  job                       51748 non-null  object 
 13  last_online               59946 non-null  object 
 14  locati

In [22]:
# Collect the names of the object-dtype columns into a plain list.

col_objects = list(df.select_dtypes(include=['object']).columns)

col_objects

['status',
 'sex',
 'orientation',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'job',
 'last_online',
 'location',
 'offspring',
 'pets',
 'religion',
 'sign',
 'smokes',
 'speaks',
 'swipe_right_label']

In [24]:
# Collect the names of the numeric columns into a plain list.
# Selecting with include='number' (instead of exclude='object') keeps bool,
# datetime or category columns out of the list if the frame ever contains any.

col_num = df.select_dtypes(include='number').columns.tolist()

col_num

['age',
 'height',
 'income',
 'profile_completeness',
 'essay_word_count',
 'profile_views_last_month',
 'messages_sent_last_week',
 'likes_received',
 'mutual_matches',
 'time_spent_daily',
 'swipe_right_ratio']

In [25]:
# Count fully duplicated rows (every occurrence after the first is flagged).
df.duplicated(keep='first').sum()

np.int64(0)

In [26]:
# Number of distinct non-missing values in each column.
df.nunique(dropna=True)

age                            54
status                          5
sex                             2
orientation                     3
body_type                      12
diet                           18
drinks                          6
drugs                           3
education                      32
ethnicity                     217
height                         60
income                         13
job                            21
last_online                 30123
location                      199
offspring                      15
pets                           15
religion                       45
sign                           48
smokes                          5
speaks                       7647
profile_completeness           11
essay_word_count             1786
profile_views_last_month     1950
messages_sent_last_week        30
likes_received                541
mutual_matches                270
time_spent_daily              134
swipe_right_ratio              91
swipe_right_la

In [27]:
# Print the distinct values of every column, one delimited section per column.
for columna in df.columns:
    print(
        f'La columna {columna} tiene estos valores únicos:',
        df[columna].unique(),
        '----------------------------',
        sep='\n',
    )

La columna age tiene estos valores únicos:
[ 22  35  38  23  29  32  31  24  37  28  30  39  33  26  27  20  25  40
  36  21  34  43  46  41  42  45  18  55  50  59  44  48  54  51  62  52
  19  58  66  53  63  47  49  61  60  57  56  65  64  68 110  69  67 109]
----------------------------
La columna status tiene estos valores únicos:
['single' 'available' 'seeing someone' 'married' 'unknown']
----------------------------
La columna sex tiene estos valores únicos:
['m' 'f']
----------------------------
La columna orientation tiene estos valores únicos:
['straight' 'bisexual' 'gay']
----------------------------
La columna body_type tiene estos valores únicos:
['a little extra' 'average' 'thin' 'athletic' 'fit' nan 'skinny' 'curvy'
 'full figured' 'jacked' 'rather not say' 'used up' 'overweight']
----------------------------
La columna diet tiene estos valores únicos:
['strictly anything' 'mostly other' 'anything' 'vegetarian' nan
 'mostly anything' 'mostly vegetarian' 'strictly vegan'


In [28]:
# Percentage of missing values per column (the mean of the null mask is the null fraction).
df.isna().mean().mul(100)

age                          0.000000
status                       0.000000
sex                          0.000000
orientation                  0.000000
body_type                    8.834618
diet                        40.694959
drinks                       4.979482
drugs                       23.487806
education                   11.056618
ethnicity                    9.475194
height                       0.005005
income                       0.000000
job                         13.675641
last_online                  0.000000
location                     0.000000
offspring                   59.321723
pets                        33.231575
religion                    33.740366
sign                        18.443266
smokes                       9.194942
speaks                       0.083408
profile_completeness         0.000000
essay_word_count             0.000000
profile_views_last_month     0.000000
messages_sent_last_week      0.000000
likes_received               0.000000
mutual_match

In [29]:
# Text-cleaning helper for string columns.

def limpiar_texto(col):
    """Strip surrounding whitespace and title-case every string in ``col``.

    Parameters
    ----------
    col : pandas.Series
        Series of strings (must support the ``.str`` accessor). Missing
        values are passed through as missing.

    Returns
    -------
    pandas.Series
        A new Series with the same index; each word starts with an uppercase
        letter and continues in lowercase.

    Notes
    -----
    ``str.title()`` treats an apostrophe as a word boundary, so
    ``"doesn't"`` becomes ``"Doesn'T"``. Words are therefore matched with a
    regular expression that allows apostrophes (``'`` or ``’``) inside a word,
    and each match is capitalised as a whole. HTML entities such as
    ``&rsquo;`` are not decoded here.
    """
    # A word is a run of letters, optionally continued by apostrophe + letters.
    # [^\W\d_] matches any Unicode letter (word chars minus digits/underscore),
    # so digits and punctuation still act as word boundaries, as in str.title().
    patron_palabra = r"[^\W\d_]+(?:['\u2019][^\W\d_]+)*"

    def _capitalizar(coincidencia):
        # First letter upper, remainder lower - same casing rule as str.title().
        return coincidencia.group(0).capitalize()

    return col.str.strip().str.replace(patron_palabra, _capitalizar, regex=True)

In [30]:
# Run the text cleaner over every object-dtype column, column by column,
# and write the cleaned columns back into df.

df[col_objects] = df[col_objects].apply(limpiar_texto)

col_objects

['status',
 'sex',
 'orientation',
 'body_type',
 'diet',
 'drinks',
 'drugs',
 'education',
 'ethnicity',
 'job',
 'last_online',
 'location',
 'offspring',
 'pets',
 'religion',
 'sign',
 'smokes',
 'speaks',
 'swipe_right_label']