In [2]:
import pandas as pd
import os

In [3]:
# The dataset is expected to sit in the directory the notebook is run from
csv_filename = "cost-of-living.csv"
current_directory = os.getcwd()

# Absolute location of the CSV: <working directory>/<file name>
dataset_path = os.path.join(current_directory, csv_filename)

# Load the raw data into a DataFrame
df = pd.read_csv(dataset_path)

In [6]:
# "Rank" and "Cost of Living Plus Rent Index" are not used below; discard both
df = df.drop(['Rank', 'Cost of Living Plus Rent Index'], axis='columns')

# Preview the first rows of the trimmed frame
print(df.head())

                  City  Cost of Living Index  Rent Index  Groceries Index  \
0    Hamilton, Bermuda                149.02       96.10           157.89   
1  Zurich, Switzerland                131.24       69.26           136.14   
2   Basel, Switzerland                130.93       49.38           137.07   
3     Zug, Switzerland                128.13       72.12           132.61   
4  Lugano, Switzerland                123.99       44.99           129.17   

   Restaurant Price Index  Local Purchasing Power Index  
0                  155.22                         79.43  
1                  132.52                        129.79  
2                  130.95                        111.53  
3                  130.93                        143.40  
4                  119.80                        111.96  


In [7]:
# Keep only rows whose "City" looks like "<city>, <ST>, United States".
# na=False treats a missing city name as "no match" instead of producing a
# NaN entry in the mask (which boolean indexing rejects), and .copy() makes
# the result an independent frame so that adding columns to it afterwards
# does not trigger SettingWithCopyWarning.
us_city_mask = df['City'].str.contains(", [A-Z]{2}, United States", na=False)
df = df[us_city_mask].copy()

print(df.head())

                                City  Cost of Living Index  Rent Index  \
10       Honolulu, HI, United States                103.65       65.07   
13       New York, NY, United States                100.00      100.00   
18  Santa Barbara, CA, United States                 95.01       78.42   
20       Berkeley, CA, United States                 94.36       88.22   
21  San Francisco, CA, United States                 93.91      108.42   

    Groceries Index  Restaurant Price Index  Local Purchasing Power Index  
10           114.92                   94.28                         89.24  
13           100.00                  100.00                        100.00  
18            99.53                   99.41                         93.86  
20           106.23                   78.85                         85.78  
21            97.05                   93.40                        133.16  


In [8]:
# Break "City" ("<city>, <state>, <country>") apart, splitting from the right
# on at most two ", " separators, and store the pieces as three new columns
city_parts = df['City'].str.rsplit(', ', n=2, expand=True)
df[['city', 'state', 'country']] = city_parts

                                City  Cost of Living Index  Rent Index  \
10       Honolulu, HI, United States                103.65       65.07   
13       New York, NY, United States                100.00      100.00   
18  Santa Barbara, CA, United States                 95.01       78.42   
20       Berkeley, CA, United States                 94.36       88.22   
21  San Francisco, CA, United States                 93.91      108.42   

    Groceries Index  Restaurant Price Index  Local Purchasing Power Index  \
10           114.92                   94.28                         89.24   
13           100.00                  100.00                        100.00   
18            99.53                   99.41                         93.86   
20           106.23                   78.85                         85.78   
21            97.05                   93.40                        133.16   

             city state        country  
10       Honolulu    HI  United States  
13       N

In [9]:
# "City" has already been split into its parts, and "country" is no longer
# needed, so remove both columns
df = df.drop(['City', 'country'], axis='columns')

# Preview the result
print(df.head())

    Cost of Living Index  Rent Index  Groceries Index  Restaurant Price Index  \
10                103.65       65.07           114.92                   94.28   
13                100.00      100.00           100.00                  100.00   
18                 95.01       78.42            99.53                   99.41   
20                 94.36       88.22           106.23                   78.85   
21                 93.91      108.42            97.05                   93.40   

    Local Purchasing Power Index           city state  
10                         89.24       Honolulu    HI  
13                        100.00       New York    NY  
18                         93.86  Santa Barbara    CA  
20                         85.78       Berkeley    CA  
21                        133.16  San Francisco    CA  


In [10]:
# Put the identifying columns first; every other column keeps its current
# relative order behind them
leading = ['city', 'state']
columns = leading + [name for name in df.columns if name not in leading]
df = df[columns]

# Preview the reordered frame
print(df.head())

             city state  Cost of Living Index  Rent Index  Groceries Index  \
10       Honolulu    HI                103.65       65.07           114.92   
13       New York    NY                100.00      100.00           100.00   
18  Santa Barbara    CA                 95.01       78.42            99.53   
20       Berkeley    CA                 94.36       88.22           106.23   
21  San Francisco    CA                 93.91      108.42            97.05   

    Restaurant Price Index  Local Purchasing Power Index  
10                   94.28                         89.24  
13                  100.00                        100.00  
18                   99.41                         93.86  
20                   78.85                         85.78  
21                   93.40                        133.16  


In [11]:
# State abbreviations to full names mapping: the 50 states plus the District
# of Columbia, whose "DC" code also satisfies the two-letter pattern used to
# select US cities.
state_mapping = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'DC': 'District of Columbia',
    'FL': 'Florida', 'GA': 'Georgia',
    'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
    'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire',
    'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
    'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania',
    'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee',
    'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington',
    'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'
}

# Replace state abbreviations with full names. A value that is not a key of
# the mapping keeps its current text instead of becoming NaN; this also makes
# the cell safe to re-run once the column already holds full names.
df['state'] = df['state'].map(state_mapping).fillna(df['state'])

print(df.head())

             city       state  Cost of Living Index  Rent Index  \
10       Honolulu      Hawaii                103.65       65.07   
13       New York    New York                100.00      100.00   
18  Santa Barbara  California                 95.01       78.42   
20       Berkeley  California                 94.36       88.22   
21  San Francisco  California                 93.91      108.42   

    Groceries Index  Restaurant Price Index  Local Purchasing Power Index  
10           114.92                   94.28                         89.24  
13           100.00                  100.00                        100.00  
18            99.53                   99.41                         93.86  
20           106.23                   78.85                         85.78  
21            97.05                   93.40                        133.16  


In [13]:
# Save the modified dataframe to a new CSV file in the same directory as the
# source data. Writing back to dataset_path would overwrite the raw input, and
# the notebook could then no longer be re-run from the top (the 'Rank' and
# 'City' columns that earlier cells rely on would be gone).
output_path = os.path.join(os.path.dirname(dataset_path), "cost-of-living-us.csv")
df.to_csv(output_path, index=False)