In [99]:
import pandas as pd

train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

In [100]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11879 entries, 0 to 11878
Data columns (total 27 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Id                                        11879 non-null  int64  
 1   Sex                                       10449 non-null  object 
 2   Sex Code                                  10426 non-null  object 
 3   State                                     10482 non-null  object 
 4   State Code                                11879 non-null  float64
 5   Year                                      7407 non-null   float64
 6   Year Code                                 7352 non-null   float64
 7   Ten-Year Age Groups                       10379 non-null  object 
 8   Ten-Year Age Groups Code                  10423 non-null  object 
 9   % of Total Deaths                         11879 non-null  float64
 10  Population                        

In [101]:
print(f"Rows Before: {len(train_data)}")

Rows Before: 11879


In [102]:
train_data.dropna(subset=["Target"] , inplace=True)

In [103]:
print(f"Rows After: {len(train_data)}")

Rows After: 7459


In [104]:

drop_cols = [
    'Id',
    'Sex',
    'temp_sensor_readout',
    'qc_flag_batch_3',
    'legacy_index_offset',
    'adjusted_pop_trend',
    'confidence_spread_metric',
    'Crude Rate Lower 95% Confidence Interval',
    'Crude Rate Upper 95% Confidence Interval',
    'Crude Rate Standard Error',
    'Year_dt',
    'State',
    'Crude Rate',
    'Population', 
    'Year Code',
    'Ten-Year Age Groups',
    'Contributing_Cause',
    'Underlying_Cause'
]

# Drop from train (includes temporal_alignment_proxy)
train_data.drop(columns=drop_cols + ['temporal_alignment_proxy'], inplace=True)

# Drop from test (no temporal_alignment_proxy)
test_data.drop(columns=drop_cols, inplace=True)

print(f"Train: {train_data.shape}")
print(f"Test: {test_data.shape}")

Train: (7459, 8)
Test: (5092, 7)


In [105]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7459 entries, 2 to 11878
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sex Code                  6579 non-null   object 
 1   State Code                7459 non-null   float64
 2   Year                      4661 non-null   float64
 3   Ten-Year Age Groups Code  6531 non-null   object 
 4   % of Total Deaths         7459 non-null   float64
 5   State_Age_Combo           7459 non-null   object 
 6   Manner_of_Death           7459 non-null   int64  
 7   Target                    7459 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 524.5+ KB


In [110]:
year_median = train_data['Year'].median()
train_data['Year'].fillna(year_median, inplace=True)
test_data['Year'].fillna(year_median, inplace=True)

# Impute object columns with MODE
sex_code_mode = train_data['Sex Code'].mode()[0]
train_data['Sex Code'].fillna(sex_code_mode, inplace=True)
test_data['Sex Code'].fillna(sex_code_mode, inplace=True)

sex_code_mode = train_data['Ten-Year Age Groups Code'].mode()[0]
train_data['Ten-Year Age Groups Code'].fillna(sex_code_mode, inplace=True)
test_data['Ten-Year Age Groups Code'].fillna(sex_code_mode, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Year'].fillna(year_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Year'].fillna(year_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set

In [111]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7459 entries, 2 to 11878
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Sex Code                  7459 non-null   object 
 1   State Code                7459 non-null   float64
 2   Year                      7459 non-null   float64
 3   Ten-Year Age Groups Code  7459 non-null   object 
 4   % of Total Deaths         7459 non-null   float64
 5   State_Age_Combo           7459 non-null   object 
 6   Manner_of_Death           7459 non-null   int64  
 7   Target                    7459 non-null   float64
dtypes: float64(4), int64(1), object(3)
memory usage: 524.5+ KB


In [113]:
from sklearn.preprocessing import LabelEncoder


encoder_sex = LabelEncoder()
encoder_age = LabelEncoder()
encoder_combo = LabelEncoder()

# Encode Sex Code
train_data['Sex Code'] = encoder_sex.fit_transform(train_data['Sex Code'].astype(str))
test_data['Sex Code'] = encoder_sex.transform(test_data['Sex Code'].astype(str))

# Encode Ten-Year Age Groups Code
train_data['Ten-Year Age Groups Code'] = encoder_age.fit_transform(train_data['Ten-Year Age Groups Code'].astype(str))
test_data['Ten-Year Age Groups Code'] = encoder_age.transform(test_data['Ten-Year Age Groups Code'].astype(str))
