# Data Preprocessing

## Converting the fixed-width TXT file to CSV and adding column names

In [1]:
import pandas as pd

In [2]:
# Fixed-width record layout, one entry per field:
# (column name, start offset, end offset). Offsets are 0-based and the end
# offset is exclusive, so each field's end is the next field's start.
_field_layout = [
    ('Year', 0, 4),
    ('State Abbreviation', 4, 6),
    ('State FIPS Code', 6, 8),
    ('County FIPS Code', 8, 11),
    ('Registry', 11, 13),
    ('Race', 13, 14),
    ('Origin', 14, 15),
    ('Sex', 15, 16),
    ('Age', 16, 18),
    ('Population', 18, 26),
]

# Split the layout into the two parallel lists the reader expects
col_specs = [(start, stop) for _, start, stop in _field_layout]
col_names = [name for name, _, _ in _field_layout]

In [3]:
# Read the fixed-width file into a DataFrame: the file has no header row, so
# column names come from col_names and field boundaries from col_specs.
#
# The two FIPS code columns are read as strings. They are zero-padded
# identifiers (2- and 3-character fields, e.g. state '01', county '001');
# default type inference would parse them as integers and silently drop the
# leading zeros.
df = pd.read_fwf(
    'demographic_data.txt',
    colspecs=col_specs,
    header=None,
    names=col_names,
    dtype={'State FIPS Code': str, 'County FIPS Code': str},
)

In [4]:
# Export the parsed table as CSV; index=False leaves out the row index so the
# file holds only the named columns
df.to_csv(path_or_buf='output.csv', index=False)

## Separating datasets by Age and Sex

In [5]:
import pandas as pd

# Read the CSV version of the data back in so its contents can be checked
file_path = 'output.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows together with the column index
(data.head(), data.columns)

(   Year State Abbreviation  State FIPS Code  County FIPS Code  Registry  Race  \
 0  1990                 AL                1                 1        99     1   
 1  1990                 AL                1                 1        99     1   
 2  1990                 AL                1                 1        99     1   
 3  1990                 AL                1                 1        99     1   
 4  1990                 AL                1                 1        99     1   
 
    Origin  Sex  Age  Population  
 0       0    1    0         239  
 1       0    1    1         821  
 2       0    1    2        1089  
 3       0    1    3        1144  
 4       0    1    4        1046  ,
 Index(['Year', 'State Abbreviation', 'State FIPS Code', 'County FIPS Code',
        'Registry', 'Race', 'Origin', 'Sex', 'Age', 'Population'],
       dtype='object'))

In [6]:
# Build two narrower datasets from the full table, one per demographic
# attribute. Both keep the same year/state/county identifier columns and the
# Population count.
location_columns = ['Year', 'State Abbreviation', 'State FIPS Code', 'County FIPS Code']

# 1. Dataset by Sex
dataset_by_sex = data[location_columns + ['Sex', 'Population']]

# 2. Dataset by Age
dataset_by_age = data[location_columns + ['Age', 'Population']]

In [7]:
# Output locations for the two per-attribute datasets
sex_csv_path, age_csv_path = 'dataset_by_sex.csv', 'dataset_by_age.csv'

# Write each dataset to its own CSV file, without the row index
dataset_by_sex.to_csv(sex_csv_path, index=False)
dataset_by_age.to_csv(age_csv_path, index=False)

# Show where the files were written
(sex_csv_path, age_csv_path)

('dataset_by_sex.csv', 'dataset_by_age.csv')

## Aggregating population totals by Sex and by Age

In [8]:
# Start from the per-sex dataset stored on disk
dataset_by_sex = pd.read_csv('dataset_by_sex.csv')

# Columns that identify one output row: a year/county/sex combination
sex_group_keys = ['Year', 'State Abbreviation', 'State FIPS Code', 'County FIPS Code', 'Sex']

# Sum Population within each key combination, then turn the group keys back
# into ordinary columns
aggregated_data = (
    dataset_by_sex
    .groupby(sex_group_keys)
    .agg(Total_Population=('Population', 'sum'))
    .reset_index()
)

# Preview the aggregated result
aggregated_data.head()

Unnamed: 0,Year,State Abbreviation,State FIPS Code,County FIPS Code,Sex,Total_Population
0,1990,AK,2,10,1,7943
1,1990,AK,2,10,2,4133
2,1990,AK,2,20,1,117036
3,1990,AK,2,20,2,110578
4,1990,AK,2,50,1,7170


In [9]:
# Persist the per-sex population totals to CSV, leaving out the row index
aggregated_csv_path = 'aggregated_dataset_by_sex.csv'
aggregated_data.to_csv(path_or_buf=aggregated_csv_path, index=False)

# Echo the output location
aggregated_csv_path

'aggregated_dataset_by_sex.csv'

In [10]:
# Start from the per-age dataset stored on disk
dataset_by_age = pd.read_csv('dataset_by_age.csv')

# One group per year/county/age combination
age_groups = dataset_by_age.groupby(
    ['Year', 'State Abbreviation', 'State FIPS Code', 'County FIPS Code', 'Age']
)

# Total the Population column within each group ...
age_totals = age_groups.agg(Total_Population=('Population', 'sum'))

# ... and move the group keys out of the index into ordinary columns
aggregated_data_by_age = age_totals.reset_index()

# Preview the aggregated result
aggregated_data_by_age.head()

Unnamed: 0,Year,State Abbreviation,State FIPS Code,County FIPS Code,Age,Total_Population
0,1990,AK,2,10,0,211
1,1990,AK,2,10,1,739
2,1990,AK,2,10,2,727
3,1990,AK,2,10,3,515
4,1990,AK,2,10,4,749


In [11]:
# Persist the per-age population totals to CSV, leaving out the row index
aggregated_by_age_csv_path = 'aggregated_dataset_by_age.csv'
aggregated_data_by_age.to_csv(path_or_buf=aggregated_by_age_csv_path, index=False)

# Echo the output location
aggregated_by_age_csv_path

'aggregated_dataset_by_age.csv'