In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [8]:
# Relative paths to the training and test CSV files loaded in the next cell.
filename = "../data/train.csv"
filetest = "../data/test.csv"

In [9]:
# Load the training and test splits from the configured CSV paths.
df_train = pd.read_csv(filename)
df_test = pd.read_csv(filetest)

# Summarise the training frame: its dimensions first, then the column
# labels and a five-row preview.
data_overview = dict(zip(("Number of Rows", "Number of Columns"), df_train.shape))
data_overview["Column Names"] = df_train.columns.tolist()
data_overview["First 5 Rows"] = df_train.head()

data_overview

{'Number of Rows': 7905,
 'Number of Columns': 20,
 'Column Names': ['id',
  'N_Days',
  'Drug',
  'Age',
  'Sex',
  'Ascites',
  'Hepatomegaly',
  'Spiders',
  'Edema',
  'Bilirubin',
  'Cholesterol',
  'Albumin',
  'Copper',
  'Alk_Phos',
  'SGOT',
  'Tryglicerides',
  'Platelets',
  'Prothrombin',
  'Stage',
  'Status'],
 'First 5 Rows':    id  N_Days             Drug    Age Sex Ascites Hepatomegaly Spiders Edema  \
 0   0     999  D-penicillamine  21532   M       N            N       N     N   
 1   1    2574          Placebo  19237   F       N            N       N     N   
 2   2    3428          Placebo  13727   F       N            Y       Y     Y   
 3   3    2576          Placebo  18460   F       N            N       N     N   
 4   4     788          Placebo  16658   F       N            Y       N     N   
 
    Bilirubin  Cholesterol  Albumin  Copper  Alk_Phos    SGOT  Tryglicerides  \
 0        2.3          316     3.35     172    1601.0  179.80             63   
 1        

In [10]:
# Show the dtype pandas assigned to each column of the training frame.
df_train.dtypes

id                 int64
N_Days             int64
Drug              object
Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol        int64
Albumin          float64
Copper             int64
Alk_Phos         float64
SGOT             float64
Tryglicerides      int64
Platelets          int64
Prothrombin      float64
Stage              int64
Status            object
dtype: object

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
df_train.describe()

Unnamed: 0,id,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
count,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0,7905.0
mean,3952.0,2030.173308,18373.14649,2.594485,350.561923,3.548323,83.902846,1816.74525,114.604602,115.340164,265.228969,10.629462,3.032511
std,2282.121272,1094.233744,3679.958739,3.81296,195.379344,0.346171,75.899266,1903.750657,48.790945,52.530402,87.465579,0.781735,0.866511
min,0.0,41.0,9598.0,0.3,120.0,1.96,4.0,289.0,26.35,33.0,62.0,9.0,1.0
25%,1976.0,1230.0,15574.0,0.7,248.0,3.35,39.0,834.0,75.95,84.0,211.0,10.0,2.0
50%,3952.0,1831.0,18713.0,1.1,298.0,3.58,63.0,1181.0,108.5,104.0,265.0,10.6,3.0
75%,5928.0,2689.0,20684.0,3.0,390.0,3.77,102.0,1857.0,137.95,139.0,316.0,11.0,4.0
max,7904.0,4795.0,28650.0,28.0,1775.0,4.64,588.0,13862.4,457.25,598.0,563.0,18.0,4.0


In [12]:
# Finding the range of values for each column, considering different types of data

# Function to determine the range for numerical and categorical columns
def determine_range(column):
    """Summarise the values found in a single column.

    Parameters
    ----------
    column : pandas.Series
        The column to summarise.

    Returns
    -------
    list
        For text-like columns (``object`` dtype or a pandas string dtype),
        the distinct values as returned by ``Series.unique()``. For every
        other dtype, the two-element list ``[min, max]``.
    """
    dtype = column.dtype
    # Text columns are not always stored as ``object``: pandas also has a
    # dedicated string dtype. A bare ``dtype == 'object'`` comparison sends
    # those columns down the min/max branch, which reports lexicographic
    # extremes instead of the list of categories.
    is_text = pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype)
    if is_text:  # For categorical data
        return list(column.unique())
    # For numerical data
    return [column.min(), column.max()]

# Build a {column name: range summary} mapping by walking the frame's
# (label, Series) pairs and summarising each Series with determine_range.
column_ranges = {
    name: determine_range(series)
    for name, series in df_train.items()
}

column_ranges



{'id': [0, 7904],
 'N_Days': [41, 4795],
 'Drug': ['D-penicillamine', 'Placebo'],
 'Age': [9598, 28650],
 'Sex': ['M', 'F'],
 'Ascites': ['N', 'Y'],
 'Hepatomegaly': ['N', 'Y'],
 'Spiders': ['N', 'Y'],
 'Edema': ['N', 'Y', 'S'],
 'Bilirubin': [0.3, 28.0],
 'Cholesterol': [120, 1775],
 'Albumin': [1.96, 4.64],
 'Copper': [4, 588],
 'Alk_Phos': [289.0, 13862.4],
 'SGOT': [26.35, 457.25],
 'Tryglicerides': [33, 598],
 'Platelets': [62, 563],
 'Prothrombin': [9.0, 18.0],
 'Stage': [1, 4],
 'Status': ['D', 'C', 'CL']}

In [13]:
# Identify object-type columns
object_columns = df_train.select_dtypes(include=['object']).columns

# Initialize LabelEncoder (kept so the name is always bound, even when
# there are no object-type columns to encode).
label_encoder = LabelEncoder()

# One fitted encoder per column, keyed by column name. Refitting a single
# shared encoder would discard every mapping except the last column's,
# leaving no way to decode the other columns or to apply the same
# encoding to another frame.
label_encoders = {}

# Convert object-type columns to numerical
for col in object_columns:
    label_encoder = LabelEncoder()
    df_train[col] = label_encoder.fit_transform(df_train[col])
    label_encoders[col] = label_encoder

# After the loop, `label_encoder` is the encoder fitted on the last
# object-type column, matching what a single shared encoder ended up as.
# NOTE: df_train is overwritten in place; re-running this cell finds no
# object columns and resets `label_encoders` to an empty dict.
df_train.head()

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,0,21532,1,0,0,0,0,2.3,316,3.35,172,1601.0,179.8,63,394,9.7,3,2
1,1,2574,1,19237,0,0,0,0,0,0.9,364,3.54,63,1440.0,134.85,88,361,11.0,3,0
2,2,3428,1,13727,0,0,1,1,2,3.3,299,3.55,131,1029.0,119.35,50,199,11.7,4,2
3,3,2576,1,18460,0,0,0,0,0,0.6,256,3.5,58,1653.0,71.3,96,269,10.7,3,0
4,4,788,1,16658,0,0,1,0,0,1.1,346,3.65,63,1181.0,125.55,96,298,10.6,4,0


0


0       1
1       0
2       0
3       0
4       0
       ..
7900    0
7901    0
7902    0
7903    1
7904    0
Name: Sex, Length: 7905, dtype: int32