In [1]:
import numpy as np
import pandas as pd

from utils import check_duplicates

In [2]:
# Maps a boolean check result to the label printed in the notebook output.
checks = {True:"OK", False: "NOK"}

### Step 1. Merge all CSV files into a single table

In [3]:
# Maps full U.S. state and territory names to their two-letter abbreviations.
# Needed to plot states on a map, since the map only recognizes two-letter codes.
STATE2ABBREV = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}

In [4]:
def clean_raw_weather(city_name:str, info_cities:pd.DataFrame, date_min:str, date_max:str)-> pd.DataFrame:
    '''
    Load the raw daily weather file of one city, tag every row with the city's
    latitude, longitude and name, and keep only the rows within [date_min, date_max].

    The file is read from "./data/1_raw/cities/<ID>.csv", where <ID> is the "ID" of the
    first row of info_cities whose "Name" matches city_name. The file is expected to hold
    the daily max temperature, min temperature and precipitation (not checked here).

    Input:
    city_name (str) : Name of the city to process (case insensitive)
    info_cities (pd.DataFrame) : DataFrame with the city information; the columns
        "Name", "ID", "Lat" and "Lon" are read
    date_min (str) : minimum date for filtering (inclusive)
    date_max (str) : maximum date for filtering (inclusive)

    Output:
    (pd.DataFrame) : Rows of the raw file within the date range, with the added
        columns "Lat", "Lon" and "City_name"

    Raises:
    IndexError : if no row of info_cities matches city_name

    '''
    # Vectorised, case-insensitive match on the city name.
    name_matches = info_cities["Name"].str.lower() == city_name.lower()
    if not name_matches.any():
        # Same exception type as the bare `.iloc[0]` would raise, but with a usable message.
        raise IndexError(f"City {city_name!r} not found in info_cities['Name']")
    file_name = info_cities.loc[name_matches, "ID"].iloc[0]

    df = pd.read_csv(f"./data/1_raw/cities/{file_name}.csv", index_col=0, parse_dates=["Date"])

    # Add attributes: look the city row up once (first row carrying this ID)
    # instead of filtering info_cities again for every attribute.
    city_row = info_cities.loc[info_cities["ID"] == file_name].iloc[0]
    df["Lat"] = city_row["Lat"]
    df["Lon"] = city_row["Lon"]
    df["City_name"] = city_row["Name"]

    # filter on max and min dates (both bounds inclusive)
    in_range = (df["Date"] >= date_min) & (df["Date"] <= date_max)
    return df.loc[in_range]

def prepare_weather(info_cities:pd.DataFrame, date_min:str, date_max:str)->pd.DataFrame:
    '''
    Build a single weather table covering every distinct city name in info_cities.

    Each city is processed with clean_raw_weather (which applies the date filter),
    the per-city tables are stacked, and the result is sorted by date, then city name.

    Input:
    info_cities (pd.DataFrame) : DataFrame with the city information
    date_min (str) : minimum date for filtering
    date_max (str) : maximum date for filtering

    Output:
    (pd.DataFrame) : Stacked and sorted dataframe

    '''
    per_city_frames = []
    for name in info_cities["Name"].unique():
        per_city_frames.append(clean_raw_weather(name, info_cities, date_min, date_max))

    stacked = pd.concat(per_city_frames, axis=0)
    return stacked.sort_values(by=["Date","City_name"])

def prepare_demography(df:pd.DataFrame)->pd.DataFrame:
    '''
    Reduce the demographics dataset to one row per city and add the state code.

    Input:
    df (pd.DataFrame) : Demographics dataset; the columns "City", "State", "Median Age",
        "Total Population" and "Average Household Size" are read

    Output:
    (pd.DataFrame) : Dataframe indexed by "City", holding the kept columns
        plus "STATE_CODE"

    '''
    # Only these columns are carried forward; everything else is dropped.
    kept_columns = ["City", "State", "Median Age", "Total Population", "Average Household Size"]

    # A city spans several rows in the raw table; racial demographics are not needed,
    # so the rows are collapsed to one (first() keeps the first non-null value of each column).
    # NOTE(review): grouping on "City" alone also merges same-named cities from
    # different states -- confirm this is acceptable for the merge downstream.
    per_city = df[kept_columns].groupby('City').first()

    # Two-letter state code; states missing from STATE2ABBREV map to NaN.
    per_city["STATE_CODE"] = per_city["State"].map(STATE2ABBREV)
    return per_city

def get_merge_name(names:pd.Series)->np.ndarray:
    '''
    Uniformize city names for simpler merging by lowercasing them and removing all whitespace.

    Input:
    names (pd.Series) : Raw city names (strings)

    Output:
    (np.ndarray) : "Clean" city names, in the same order as the input. A plain array
        (not a Series) is returned on purpose: assigning it to a DataFrame column is
        then positional and does not try to align on the index of `names`.

    '''
    # split() with no argument cuts on any whitespace run, so joining the pieces
    # removes leading, trailing and inner whitespace alike.
    return names.apply(lambda raw_name: "".join(raw_name.lower().split())).to_numpy()


def feature_engineering_external_data(external_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Placeholder for the feature-engineering step on external_data.

    No new feature is computed yet: the input is handed back untouched.

    Input:
    external_data (pd.DataFrame): input DataFrame

    Output:
    (pd.DataFrame): the very same DataFrame object that was passed in
    '''
    # TODO: derive new features from the original columns of external_data.
    return external_data

In [5]:
# inputs: inclusive date window applied to every city's weather series
date_min = "2011-01-01"
date_max = "2015-12-31"

# read files
info_cities = pd.read_csv("./data/1_raw/cities/city_info.csv", index_col=0)
demography_raw = pd.read_csv("./data/1_raw/demographics/us-cities-demographics.csv", delimiter=";")

# cleaning temperature and precipitations
all_cities = prepare_weather(info_cities, date_min, date_max)

# cleaning demographics
demography = prepare_demography(demography_raw)

# merge external data into a single dataframe
# MergeName is a temporary normalized key (lowercase, no whitespace); it is dropped after the merge.
# demography is indexed by city name, hence the index is wrapped in a Series before normalizing.
all_cities["MergeName"] = get_merge_name(all_cities.City_name)
demography["MergeName"] = get_merge_name(pd.Series(demography.index))
# Left merge: every weather row is kept, cities without a demographic match get NaN.
external_data = pd.merge(left=all_cities, right=demography, how="left", on=["MergeName"]).drop(columns=["MergeName"])

# feature engineering
external_data = feature_engineering_external_data(external_data)

# check duplicates
# Fall back to "NOK" (not the bare value False) so that an unexpected check result
# is still reported with one of the two known labels.
c = checks.get(check_duplicates(external_data, ["City_name","Date"]), "NOK")
print(f"Check duplicates: {c}")

# save to csv format
external_data.to_csv("./data/2_clean/external_data.csv", index=False)

Check duplicates: OK
