In [29]:
import numpy as np
import pandas as pd
import pathlib

In [30]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sudalairajkumar/daily-temperature-of-major-cities")

print("Path to dataset files:", path)

Path to dataset files: /Users/abhiramsingireddy/.cache/kagglehub/datasets/sudalairajkumar/daily-temperature-of-major-cities/versions/1


*NOTE: low_memory=False forces pandas to read the whole file into memory and then convert it.  This is done because there are mixed data types in the columns.  Rather than pandas chunk the data and guess the data type and cause an issue when reading, pandas is forced to wait until all the data is there and then make the decision.*

EX: daily_temps_df = pd.read_csv("city_temperature.csv", low_memory=False)

## Step 1 (1 point): Find and clean bad data
* Using one of the techniques discussed this week (query, where):
    * Create a new data set that only contains the Year 2012
    * Identify any bad data in the *Day* column and delete the rows from the DataFrame


In [31]:
file_path = pathlib.Path(path) / "city_temperature.csv"
daily_temps_df = pd.read_csv(file_path, low_memory=False)
daily_temps_df.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
0,Africa,Algeria,,Algiers,1,1,1995,64.2
1,Africa,Algeria,,Algiers,1,2,1995,49.4
2,Africa,Algeria,,Algiers,1,3,1995,48.8
3,Africa,Algeria,,Algiers,1,4,1995,46.4
4,Africa,Algeria,,Algiers,1,5,1995,47.9


In [32]:
temps_2012 = daily_temps_df.query("Year == 2012")
temps_2012 = temps_2012.where((temps_2012['Day'] >= 1) & (temps_2012['Day'] <= 31))
temps_2012_clean = temps_2012.dropna(subset=['Day'])
temps_2012_clean.head()

Unnamed: 0,Region,Country,State,City,Month,Day,Year,AvgTemperature
6209,Africa,Algeria,,Algiers,1.0,1.0,2012.0,50.4
6210,Africa,Algeria,,Algiers,1.0,2.0,2012.0,52.3
6211,Africa,Algeria,,Algiers,1.0,3.0,2012.0,52.8
6212,Africa,Algeria,,Algiers,1.0,4.0,2012.0,51.0
6213,Africa,Algeria,,Algiers,1.0,5.0,2012.0,52.9


## Step 1 (2 points):  Using the three date-related fields, create a new Column called "Date".
Using the cleaned dataset:
* Use the apply() method.
* The data type needs to be datetime.

In [33]:
temps_2012_clean = temps_2012_clean.copy()
temps_2012_clean['Date'] = temps_2012_clean.apply(lambda row: pd.to_datetime(f"{int(row['Year'])}-{int(row['Month'])}-{int(row['Day'])}"), axis=1)
temps_2012_clean[['Year', 'Month', 'Day', 'Date']].head()

Unnamed: 0,Year,Month,Day,Date
6209,2012.0,1.0,1.0,2012-01-01
6210,2012.0,1.0,2.0,2012-01-02
6211,2012.0,1.0,3.0,2012-01-03
6212,2012.0,1.0,4.0,2012-01-04
6213,2012.0,1.0,5.0,2012-01-05


## Step 3 (1 point): Map the Day's name based on the date
Using the Date value created in step two:
* Create a new column called "Day Name"
* Use the map() method to populate the value from the date using a pandas date function that returns the day name based on a date

In [34]:
temps_2012_clean['Day Name'] = temps_2012_clean['Date'].map(lambda x: x.day_name())
temps_2012_clean[['Date', 'Day Name']].head()

Unnamed: 0,Date,Day Name
6209,2012-01-01,Sunday
6210,2012-01-02,Monday
6211,2012-01-03,Tuesday
6212,2012-01-04,Wednesday
6213,2012-01-05,Thursday


## Step 4 (1point): What was the average temperature on Mondays in January in the US? 
Use all your knowledge to return the answer.

In [36]:
monday_january_us = temps_2012_clean.query("Country == 'US' and Month == 1 and `Day Name` == 'Monday'")
average_temp = monday_january_us['AvgTemperature'].mean()
print(f"The average temperature on Mondays in January in the US was {average_temp:.2f}°F")

The average temperature on Mondays in January in the US was 40.15°F
