<a href="https://colab.research.google.com/github/amnasyed1/datasci_9_data_prep/blob/main/datasci_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

#**First Dataset**

In [12]:
df_dle = pd.read_csv('death_and_life_expectancy.csv')
df_dle

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,1900,All Races,Both Sexes,47.3,2518.0
1,1901,All Races,Both Sexes,49.1,2473.1
2,1902,All Races,Both Sexes,51.5,2301.3
3,1903,All Races,Both Sexes,50.5,2379.0
4,1904,All Races,Both Sexes,47.6,2502.5
...,...,...,...,...,...
1066,2014,White,Male,76.7,853.4
1067,2015,White,Male,76.6,861.9
1068,2016,White,Male,76.4,858.9
1069,2017,White,Male,76.4,863.4


In [13]:
df_dle.columns

Index(['Year', 'Race', 'Sex', 'Average Life Expectancy (Years)',
       'Age-adjusted Death Rate'],
      dtype='object')

In [18]:
df_dle.size

5355

In [14]:
missing_values = df_dle.isnull().sum()
missing_values

Year                               0
Race                               0
Sex                                0
Average Life Expectancy (Years)    6
Age-adjusted Death Rate            0
dtype: int64

In [17]:
df_dle = df_dle.fillna(0)
df_dle

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,1900,All Races,Both Sexes,47.3,2518.0
1,1901,All Races,Both Sexes,49.1,2473.1
2,1902,All Races,Both Sexes,51.5,2301.3
3,1903,All Races,Both Sexes,50.5,2379.0
4,1904,All Races,Both Sexes,47.6,2502.5
...,...,...,...,...,...
1066,2014,White,Male,76.7,853.4
1067,2015,White,Male,76.6,861.9
1068,2016,White,Male,76.4,858.9
1069,2017,White,Male,76.4,863.4


In [21]:
len(df_dle)

1071

In [24]:
df_dle.dtypes

Year                                 int64
Race                                object
Sex                                 object
Average Life Expectancy (Years)    float64
Age-adjusted Death Rate            float64
dtype: object

In [22]:
df_dle.sample(5)

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
222,2003,All Races,Female,79.7,715.2
154,1935,All Races,Female,63.9,1690.6
206,1987,All Races,Female,78.3,773.8
72,1972,All Races,Both Sexes,71.2,1214.8
1037,1985,White,Male,71.8,1249.5


In [34]:
# Assuming 'year' is the column you want to encode
encoder = OrdinalEncoder()
df_dle[['Year']] = encoder.fit_transform(df_dle[['Year']])

# Create a DataFrame with the ordinal encoding mapping
df_mapping_year = pd.DataFrame(encoder.categories_[0], columns=['Year'])
df_mapping_year['year_ordinal'] = df_mapping_year.index

# Display the DataFrame with the ordinal encoding mapping
df_mapping_year

Unnamed: 0,Year,year_ordinal
0,0.0,0
1,1.0,1
2,2.0,2
3,3.0,3
4,4.0,4
...,...,...
114,114.0,114
115,115.0,115
116,116.0,116
117,117.0,117


In [36]:
#'Year' to categorical
df_dle['Year'] = df_dle['Year'].astype('category')

encoder = OrdinalEncoder()
df_dle['Year'] = encoder.fit_transform(df_dle[['Year']])
df_dle.head()

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,0.0,0.0,0.0,47.3,2518.0
1,1.0,0.0,0.0,49.1,2473.1
2,2.0,0.0,0.0,51.5,2301.3
3,3.0,0.0,0.0,50.5,2379.0
4,4.0,0.0,0.0,47.6,2502.5


In [38]:
#'Race' to categorical
df_dle['Race'] = df_dle['Race'].astype('category')

encoder = OrdinalEncoder()
df_dle['Race'] = encoder.fit_transform(df_dle[['Race']])
df_dle.head()

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,0.0,0.0,0.0,47.3,2518.0
1,1.0,0.0,0.0,49.1,2473.1
2,2.0,0.0,0.0,51.5,2301.3
3,3.0,0.0,0.0,50.5,2379.0
4,4.0,0.0,0.0,47.6,2502.5


In [40]:
#columns that will be encoded
categ_columns = ['Year', 'Race', 'Sex']

# Initialize and fit OrdinalEncoder
encoder = OrdinalEncoder()
df_dle[categ_columns] = encoder.fit_transform(df_dle[categ_columns])

# Inspect the encoded DataFrame
df_dle.head(10)

Unnamed: 0,Year,Race,Sex,Average Life Expectancy (Years),Age-adjusted Death Rate
0,0.0,0.0,0.0,47.3,2518.0
1,1.0,0.0,0.0,49.1,2473.1
2,2.0,0.0,0.0,51.5,2301.3
3,3.0,0.0,0.0,50.5,2379.0
4,4.0,0.0,0.0,47.6,2502.5
5,5.0,0.0,0.0,48.7,2423.7
6,6.0,0.0,0.0,48.7,2399.0
7,7.0,0.0,0.0,47.6,2494.4
8,8.0,0.0,0.0,51.1,2298.9
9,9.0,0.0,0.0,52.1,2249.2


### Dataset Splitting

In [23]:
# Define the features and the target variable
X = df_dle.drop('Average Life Expectancy (Years)', axis=1)  # Features
y = df_dle['Average Life Expectancy (Years)']               # Target variable

In [41]:
# Split the scaled data into training, validation, and testing sets (70%, 15%, 15%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [43]:
# Check the size of each set
(X_train.shape, X_val.shape, X_test.shape)

((749, 4), (161, 4), (161, 4))