# Cervical Cancer Risk Factors
- Sara Echeverría 21371
- Melissa Pérez 21385

Repositorio de github: https://github.com/bl33h/cervicalCancerRiskFactors

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import shapiro
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
df = pd.read_csv('data/risk_factors_cervical_cancer.csv', encoding='latin1')
df.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [3]:
df.describe()

Unnamed: 0,Age,STDs: Number of diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0,858.0
mean,26.820513,0.087413,0.020979,0.01049,0.020979,0.027972,0.040793,0.086247,0.051282,0.064103
std,8.497948,0.302545,0.143398,0.101939,0.143398,0.164989,0.197925,0.280892,0.220701,0.245078
min,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
dataTypes = df.dtypes
print(dataTypes)

Age                                    int64
Number of sexual partners             object
First sexual intercourse              object
Num of pregnancies                    object
Smokes                                object
Smokes (years)                        object
Smokes (packs/year)                   object
Hormonal Contraceptives               object
Hormonal Contraceptives (years)       object
IUD                                   object
IUD (years)                           object
STDs                                  object
STDs (number)                         object
STDs:condylomatosis                   object
STDs:cervical condylomatosis          object
STDs:vaginal condylomatosis           object
STDs:vulvo-perineal condylomatosis    object
STDs:syphilis                         object
STDs:pelvic inflammatory disease      object
STDs:genital herpes                   object
STDs:molluscum contagiosum            object
STDs:AIDS                             object
STDs:HIV  

In [9]:
dfN = pd.read_csv('data/risk_factors_cervical_cancer.csv', encoding='latin1')

convertColumns = ['Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 
                  'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 
                  'IUD (years)', 'STDs (number)', 'STDs: Number of diagnosis', 
                  'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis']
for col in convertColumns:
    dfN[col] = pd.to_numeric(dfN[col], errors='coerce')

# Corrección: Usar dfN.columns en lugar de df.columns
for col in dfN.columns:
    if dfN[col].dtype == 'object':
        dfN[col].fillna(dfN[col].mode()[0], inplace=True)
    else:
        dfN[col].fillna(dfN[col].median(), inplace=True)

# Convertir específicamente las columnas que deben ser enteras
intColumns = ['Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies']
for col in intColumns:
    # Primero rellenar NaN con -1 para indicar claramente un valor faltante
    dfN[col] = dfN[col].fillna(-1).astype(int)

binaryCols = ['Smokes', 'Hormonal Contraceptives', 'IUD', 'STDs', 'STDs:condylomatosis', 
              'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 
              'STDs:pelvic inflammatory disease', 'STDs:genital herpes', 'STDs:molluscum contagiosum', 
              'STDs:AIDS', 'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV']
for col in binaryCols:
    dfN[col] = dfN[col].apply(lambda x: 1 if x == 'Yes' or x == 'True' else 0)

dfN.to_csv('data/cleanedCervicalCancerRiskFactors.csv', index=False)

In [11]:
dataTypesN = dfN.dtypes
print(dataTypesN)

Age                                     int64
Number of sexual partners               int32
First sexual intercourse                int32
Num of pregnancies                      int32
Smokes                                  int64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives                 int64
Hormonal Contraceptives (years)       float64
IUD                                     int64
IUD (years)                           float64
STDs                                    int64
STDs (number)                         float64
STDs:condylomatosis                     int64
STDs:cervical condylomatosis           object
STDs:vaginal condylomatosis             int64
STDs:vulvo-perineal condylomatosis      int64
STDs:syphilis                           int64
STDs:pelvic inflammatory disease        int64
STDs:genital herpes                     int64
STDs:molluscum contagiosum              int64
STDs:AIDS                         