# Presentation

In [1]:
import pandas as pd
import numpy as np
import urllib
import os
import gzip
import json
import re
from math import radians, cos, sin, asin, sqrt, pi, isnan
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Allston.Fojas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Step 1: Get recent NOAA Climate Data from the NOAA website to be able to automatically update the weather data twice a day

In [2]:
def get_noaa_data(file_name):
    """
    Gets the NOAA data from their website
    
    :param file_name: The name of the data file
    :returns: Output data filepath
    """
    
    # get the base URL and outfile path
    base_url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/"
    out_file_path = file_name[:-3]
    
    # get data file from URL
    response = urllib.request.urlopen(base_url + file_name)
    with open(out_file_path, "wb") as out_file:
        out_file.write(gzip.decompress(response.read()))
    print("NOAA data saved to %s" % out_file_path)
    return out_file_path

In [3]:
file_name = "2020.csv.gz"
out_file_path = get_noaa_data(file_name)

NOAA data saved to 2020.csv


## Step 2: Merge the NOAA Station Observation Data with the NOAA Station Lat/Long Data

### Read NOAA Observation Locally that we just Downloaded & Unzipped

In [4]:
def read_observation_data(noaa_data):
    """
    Reads in the NOAA data as a DataFrame
    
    :param noaa_data: The name of the NOAA file
    :returns: Current NOAA data as a DataFrame
    """
    
    # date column and column names
    date_col = "Date"
    col_names = ["Station ID", "Date", "Observation Type", "Observation Value", 
                 "Measurement Flag", "Quality Flag", "Source Flag", "Observation Time"]
    
    # get relevant weather data
    noaa_current_year = pd.read_csv(noaa_data, header=None, names=col_names)
    current_date = noaa_current_year.iloc[-1][date_col]
    noaa_current_day = noaa_current_year[noaa_current_year[date_col] == current_date].reset_index(drop=True)
    return noaa_current_day

In [5]:
observation_data = read_observation_data(out_file_path)

In [6]:
display(observation_data.shape)
observation_data.head()

(29353, 8)

Unnamed: 0,Station ID,Date,Observation Type,Observation Value,Measurement Flag,Quality Flag,Source Flag,Observation Time
0,ASN00001006,20200817,TMIN,210,,,a,
1,ASN00001006,20200817,PRCP,0,,,a,
2,ASN00001007,20200817,TMIN,251,,,a,
3,ASN00001007,20200817,PRCP,0,,,a,
4,ASN00001019,20200817,TMIN,171,,,a,


### Read Lat/Long Data from URL

In [7]:
def read_url_latlong_data():
    """
    Reads in the NOAA stations data as a DataFrame
    
    :returns: NOAA stations and lat/long data as a DataFrame
    """
    
    # stations URL and column names
    stations_url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt"
    col_names = ["Station ID", "Latitude", "Longitude", "Elevation"]
    station_latlong = pd.read_csv(stations_url, 
        sep="\s+",  # Fields are separated by one or more spaces
        usecols=[0, 1, 2, 3],  # Grab only the first 4 columns, plus the Name column
        na_values=[-999.9],  # Missing elevation is noted as -999.9
        header=None,
        names=col_names,
        engine="python")
    return station_latlong

### Read Lat/Long Data Locally

In [8]:
def get_stations_data(file_name):
    """
    Gets the NOAA stations data from their website
    
    :param file_name: The name of the data file
    :returns: Output data filepath
    """
    
    # base URL and outfile path
    base_url = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/"
    out_file_path = file_name
    
    # checks if data file is already downloaded or not
    does_out_file_exist = os.path.exists(out_file_path)
    if not does_out_file_exist:
        response = urllib.request.urlopen(base_url + file_name)
        with open(out_file_path, "wb") as out_file:
            out_file.write(response.read())
        print("NOAA station data saved to %s" % out_file_path)
    else:
        print("NOAA station data already downloaded")
    return out_file_path

In [9]:
def read_local_latlong_data():
    """
    Reads in the local NOAA stations data as a DataFrame
    
    :returns: NOAA stations and lat/long data as a DataFrame
    """
    
    # file name, outfile path, and column names
    file_name = "ghcnd-stations.txt"
    out_file_path = get_stations_data(file_name)
    col_names = ["Station ID", "Latitude", "Longitude", "Elevation"]
    station_latlong = pd.read_csv(out_file_path, 
        sep="\s+",  # Fields are separated by one or more spaces
        usecols=[0, 1, 2, 3],  # Grab only the first 4 columns, plus the Name column
        na_values=[-999.9],  # Missing elevation is noted as -999.9
        header=None,
        names=col_names,
        engine="python")
    return station_latlong

### Function to Read Lat/Long Data from URL (or Locally if the URL fails)

In [10]:
def read_latlong_data():
    """
    Reads in the NOAA stations data from either URL (or Locally if the URL fails) as a DataFrame
    
    :returns: NOAA stations and lat/long data as a DataFrame
    """

    # try to read data from the URL. if it fails, read data locally
    try:
        print("trying to read lat long data from the URL..")
        latlong_data = read_url_latlong_data()
        print("reading data from URL successful")
    except HTTPError as e:
        print("reading data from URL failed. reading data locally instead..")
        latlong_data = read_local_latlong_data()
        print("reading data locally successful")
        
    return latlong_data

In [11]:
latlong_data = read_latlong_data()

trying to read lat long data from the URL..
reading data from URL successful


In [12]:
display(latlong_data.shape)
latlong_data.head()

(115082, 4)

Unnamed: 0,Station ID,Latitude,Longitude,Elevation
0,ACW00011604,17.1167,-61.7833,10.1
1,ACW00011647,17.1333,-61.7833,19.2
2,AE000041196,25.333,55.517,34.0
3,AEM00041194,25.255,55.364,10.4
4,AEM00041217,24.433,54.651,26.8


### Function to Merge NOAA Observation and Lat/Long Data

In [13]:
def merge_noaa_data(observation_data, latlong_data):
    """
    Merges the NOAA Station Observation data with the NOAA Lat/Long data into one DataFrame
    
    :param observation_data: The name of the NOAA station observation file
    :param latlong_data: The name of the NOAA lat/long file
    :returns: Merged NOAA data as a DataFrame
    """
    
    merged_data = observation_data.merge(latlong_data, left_on="Station ID", right_on="Station ID")
    return merged_data

In [14]:
merged_data = merge_noaa_data(observation_data, latlong_data)

In [15]:
display(merged_data.shape)
merged_data.head()

(29353, 11)

Unnamed: 0,Station ID,Date,Observation Type,Observation Value,Measurement Flag,Quality Flag,Source Flag,Observation Time,Latitude,Longitude,Elevation
0,ASN00001006,20200817,TMIN,210,,,a,,-15.51,128.1503,3.8
1,ASN00001006,20200817,PRCP,0,,,a,,-15.51,128.1503,3.8
2,ASN00001007,20200817,TMIN,251,,,a,,-13.7542,126.1485,6.0
3,ASN00001007,20200817,PRCP,0,,,a,,-13.7542,126.1485,6.0
4,ASN00001019,20200817,TMIN,171,,,a,,-14.2964,126.6453,23.0


### Function to Make the NOAA Lat/Long List

In [16]:
def make_noaa_latlon_list(merged_data):
    """
    Makes the NOAA lat/long list
    
    :param merged_data: Merged NOAA data
    :returns: List of NOAA lat/long data
    """
    
    # relevant columns and new columns
    relevant_cols = ["Station ID", "Observation Type", "Observation Value", "Latitude", "Longitude", "Elevation"]
    new_cols = ["Station ID", "Observation Type", "Observation Value", "lat", "lon", "Elevation"]
    
    # get relevant columns, set new column names, and change to dictionary
    noaa_latlon_data = merged_data[relevant_cols]
    noaa_latlon_data.columns = new_cols
    noaa_latlon_data_list = noaa_latlon_data.to_dict(orient="records")
    return noaa_latlon_data_list

In [17]:
noaa_latlon_data_list = make_noaa_latlon_list(merged_data)
noaa_latlon_data_list[:2]

[{'Station ID': 'ASN00001006',
  'Observation Type': 'TMIN',
  'Observation Value': 210,
  'lat': -15.51,
  'lon': 128.1503,
  'Elevation': 3.8},
 {'Station ID': 'ASN00001006',
  'Observation Type': 'PRCP',
  'Observation Value': 0,
  'lat': -15.51,
  'lon': 128.1503,
  'Elevation': 3.8}]

## Step 3: Read in DMA Data and Change Data Layout to a List of Dictionaries

In [18]:
def get_dma_data():
    """
    Gets the DMA and lat/long data from their website
    
    :returns: DMA and lat/long data as a DataFrame
    """
    
    dma_url = "https://gist.githubusercontent.com/perrydc/9ee4abc14dd5590434fe273f59cdf07f/raw/a9a61694e1612455e4c2c96c7039378757fed21e/dma.json"
    response = urllib.request.urlopen(dma_url)
    data = json.loads(response.read())
    dma = pd.DataFrame(data)
    return dma

In [19]:
dma = get_dma_data()

In [20]:
display(dma.shape)
dma.head()

(210, 6)

Unnamed: 0,dma,lat,lon,tvHomes,popShare,timezone
0,New York,40.6943,-73.9249,7348620,6.407,America/New_York
1,Los Angeles,34.114,-118.4068,5476830,4.775,America/Los_Angeles
2,Chicago,41.8373,-87.6861,3463060,3.019,America/Chicago
3,Philadelphia,40.0076,-75.134,2942800,2.566,America/New_York
4,Dallas-Ft. Worth,32.7938,-96.7659,2713380,2.366,America/Chicago


### Change the Layout of the DMA Lat/Long Data to Find Closest Station by Calculating Haversine Distance

In [21]:
def make_dma_latlon_list(dma):
    """
    Makes the DMA lat/long data into a list of dictionaries
    
    :param dma: The DMA data
    :returns: DMA lat/long data as a list of dictionaries
    """
    
    dma_latlon_list = dma[["dma", "lat", "lon"]].to_dict(orient="records")
    return dma_latlon_list

In [22]:
dma_latlon_list = make_dma_latlon_list(dma)
dma_latlon_list[:5]

[{'dma': 'New York', 'lat': 40.6943, 'lon': -73.9249},
 {'dma': 'Los Angeles', 'lat': 34.114, 'lon': -118.4068},
 {'dma': 'Chicago', 'lat': 41.8373, 'lon': -87.6861},
 {'dma': 'Philadelphia', 'lat': 40.0076, 'lon': -75.134},
 {'dma': 'Dallas-Ft. Worth', 'lat': 32.7938, 'lon': -96.7659}]

## Step 4.1: Filter NOAA stations to get just US Locations and Find the Closest DMA for each Station

In [23]:
# http://en.wikipedia.org/wiki/Extreme_points_of_the_United_States#Westernmost
top = 49.3457868 # north lat
left = -124.7844079 # west long
right = -66.9513812 # east long
bottom =  24.7433195 # south lat

def get_us_noaa_locations(noaa_latlon_data_list):
    """
    Gets the NOAA Stations data for stations just in the United States
    
    :param noaa_latlon_data_list: The list containing the lat/long data for NOAA stations
    :returns: List of NOAA stations in the United States
    """
    
    # US NOAA locations list
    us_noaa_locations = []
    
    # check if each location is in the US
    for i in range(len(noaa_latlon_data_list)):
        cur_dict = noaa_latlon_data_list[i]
        lat = cur_dict["lat"]
        lon = cur_dict["lon"]
        if bottom <= lat <= top and left <= lon <= right:
            us_noaa_locations.append(cur_dict)
    return us_noaa_locations

In [24]:
us_noaa_locations = get_us_noaa_locations(noaa_latlon_data_list)

In [25]:
display(len(us_noaa_locations))
us_noaa_locations[:2]

26288

[{'Station ID': 'BF1FP000001',
  'Observation Type': 'PRCP',
  'Observation Value': 0,
  'lat': 26.5481,
  'lon': -78.7028,
  'Elevation': 2.1},
 {'Station ID': 'BF1FP000001',
  'Observation Type': 'SNOW',
  'Observation Value': 0,
  'lat': 26.5481,
  'lon': -78.7028,
  'Elevation': 2.1}]

## Step 4.2: Calculate Haversine Distance to Find the Closest DMA for each Station
## Step 4.3: Convert the NOAA & DMA data from a Dictionary to a DataFrame 

In [26]:
# Code from https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
def calculate_haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculates the Haversine distance between an NOAA station and a DMA
    
    :param lat1: Latitude of the first point
    :param lon1: Longitude of the first point
    :param lat2: Latitude of the second point
    :param lon2: Longitude of the second point
    :returns: Haversine distance between two points
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [27]:
# Code from https://stackoverflow.com/questions/41336756/find-the-closest-latitude-and-longitude
def get_closest_dma(dma_latlon_list, cur_noaa_location):
    """
    Gets the closest DMA for a particular NOAA station
    
    :param dma_latlon_list: List of each DMA and its Lat/Long pair
    :param cur_noaa_location: Dictionary containing info about a particular NOAA station
    :returns: Dictionary containing data on the DMA closest to the NOAA station 
    """
    
    return min(dma_latlon_list, key=lambda p: calculate_haversine_distance(
        cur_noaa_location['lat'],cur_noaa_location['lon'],p['lat'],p['lon']))

In [28]:
def merge_dictionaries(dict1, dict2):
    """
    Merges two dictionaries
    
    :param dict1: Dictionary containing NOAA data
    :param dict2: Dictionary containing DMA data
    :returns: Merged dictionary containing NOAA & DMA data 
    """
    
    merged_dict = dict2.update(dict1)
    return merged_dict

In [29]:
def get_noaa_dma_data(us_noaa_locations, dma_latlon_list):
    """
    Converts the NOAA & DMA data from a Dictionary to a DataFrame
    
    :param us_noaa_locations: NOAA locations located in the US
    :param dma_latlon_list: List containing the Lat/Long pairs for each DMA
    :returns: DataFrame containing NOAA & DMA data
    """
    
    # creates the DataFrame
    noaa_dma_data = pd.DataFrame()
    
    # inputs each entry to the DataFrame 
    for i in range(len(us_noaa_locations)):
        cur_us_location = us_noaa_locations[i]
        closest_dict = get_closest_dma(dma_latlon_list, cur_us_location)
        merge_dictionaries(cur_us_location, closest_dict)
        noaa_dma_data = noaa_dma_data.append(closest_dict, ignore_index=True)
    
    # rename columns and change the order
    col_names = ["Elevation", "Observation Type", "Observation Value", "Station ID",
        "DMA", "Latitude", "Longitude"]
    noaa_dma_data.columns = col_names
    col_order = ["Station ID", "Observation Type", "Observation Value", "Elevation",
        "DMA", "Latitude", "Longitude"]
    noaa_dma_data = noaa_dma_data[col_order]
    return noaa_dma_data

In [30]:
noaa_dma_data = get_noaa_dma_data(us_noaa_locations, dma_latlon_list)

In [31]:
display(noaa_dma_data.shape)
noaa_dma_data.head()

(26288, 7)

Unnamed: 0,Station ID,Observation Type,Observation Value,Elevation,DMA,Latitude,Longitude
0,BF1FP000001,PRCP,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028
1,BF1FP000001,SNOW,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028
2,BF1NP000001,PRCP,0.0,5.8,West Palm Beach-Ft. Pierce,24.9975,-77.4499
3,CA1BC000018,PRCP,30.0,168.2,Seattle-Tacoma,49.2979,-122.8464
4,CA1BC000032,PRCP,5.0,152.4,Seattle-Tacoma,49.259,-122.8591


### Function to Make Columns to Determine if it's Raining or Snowing

In [32]:
def determine_raining_or_snowing(noaa_dma_data, threshold):
    """
    Makes columns to determine if it's raining or snowing
    
    :param noaa_dma_data: Data from the NOAA and DMA data sources
    :param threshold: Number that determines the amount of rain/snow (in mm) to have the weather be considered rainy/snowy
    :returns: DataFrame containing 2 more columns of booleans determining if the weather is rainy/snowy
    """
    
    # determines if it's raining
    noaa_dma_data["Is Raining"] = np.where((
        noaa_dma_data["Observation Type"] == "PRCP") & (
        noaa_dma_data["Observation Value"] > threshold), True, False)
    
    # determines if it's snowing
    noaa_dma_data["Is Snowing"] = np.where((
        noaa_dma_data["Observation Type"] == "SNOW") & (
        noaa_dma_data["Observation Value"] > threshold), True, False)
    return noaa_dma_data

In [33]:
threshold = 3
noaa_dma_weather_data = determine_raining_or_snowing(noaa_dma_data, threshold)

In [34]:
display(noaa_dma_weather_data.shape)
noaa_dma_weather_data.head()

(26288, 9)

Unnamed: 0,Station ID,Observation Type,Observation Value,Elevation,DMA,Latitude,Longitude,Is Raining,Is Snowing
0,BF1FP000001,PRCP,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False
1,BF1FP000001,SNOW,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False
2,BF1NP000001,PRCP,0.0,5.8,West Palm Beach-Ft. Pierce,24.9975,-77.4499,False,False
3,CA1BC000018,PRCP,30.0,168.2,Seattle-Tacoma,49.2979,-122.8464,True,False
4,CA1BC000032,PRCP,5.0,152.4,Seattle-Tacoma,49.259,-122.8591,True,False


### Function to Save the data locally

In [35]:
def save_data(noaa_dma_weather_data, saves_data, save_data_filename, returns_dict):
    """
    Saves the NOAA and DMA weather data locally
    
    :param noaa_dma_weather_data: Data from the NOAA and DMA data sources and whether it's rainy/snowy
    :param saves_data: Boolean to determine whether to save the data locally or not
    :param save_data_filename: File name of the saved data file
    :param returns_dict: Boolean to determine whether to return the data as a dictionary
    :returns: DataFrame (or Dictionary) containing just the DMA, Is Raining, and Is Snowing columns
    """
    
    # check if the user wants to save the data
    if saves_data:
        print("Saving data to", save_data_filename)
        noaa_dma_weather_data.to_csv(save_data_filename, index=False)
        
    # check if the user wants the output type to be a dictionary
    if returns_dict:
        print("Returning data as a dictionary")
        noaa_dma_weather_data = noaa_dma_weather_data.to_dict(orient="records")
        
    return noaa_dma_weather_data

In [36]:
dma_weather_data = save_data(noaa_dma_weather_data, False, "dma_weather_data.csv", False)

In [37]:
display(dma_weather_data.shape)
dma_weather_data.head()

(26288, 9)

Unnamed: 0,Station ID,Observation Type,Observation Value,Elevation,DMA,Latitude,Longitude,Is Raining,Is Snowing
0,BF1FP000001,PRCP,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False
1,BF1FP000001,SNOW,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False
2,BF1NP000001,PRCP,0.0,5.8,West Palm Beach-Ft. Pierce,24.9975,-77.4499,False,False
3,CA1BC000018,PRCP,30.0,168.2,Seattle-Tacoma,49.2979,-122.8464,True,False
4,CA1BC000032,PRCP,5.0,152.4,Seattle-Tacoma,49.259,-122.8591,True,False


### Function to Add the Rain Percentage and Snow Percentage Columns

In [38]:
def add_percentage_columns(dma_weather_data):
    """
    Adds the Rain Percentage and Snow Percentage columns
    
    :param dma_weather_data: Data from the NOAA and DMA data sources and whether it's rainy/snowy
    :returns: Updated DataFrame with the percentage columns
    """
    
    bins = pd.cut(dma_weather_data["Elevation"], 5).unique().dropna()
    
    # add columns with 0 as default value
    dma_weather_data["Snow Percentage"] = 0
    dma_weather_data["Rain Percentage"] = 0

    ### First Range
    # find all rows that fulfill your conditions and set percentage to 20
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[0].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[0].right), # if elevation is less than or equal to right bound
           "Snow Percentage"] = 20 # then set percentage to 20
    # find all rows that fulfill your conditions and set percentage to 20
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[0].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[0].right), # if elevation is less than or equal to right bound
           "Rain Percentage"] = 20 # then set percentage to 20

    ### Second Range
    # find all rows that fulfill your conditions and set percentage to 40
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[1].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[1].right), # if elevation is less than or equal to right bound
           "Snow Percentage"] = 40 # then set percentage to 40
    # find all rows that fulfill your conditions and set percentage to 40
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[1].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[1].right), # if elevation is less than or equal to right bound
           "Rain Percentage"] = 40 # then set percentage to 40

    ### Third Range
    # find all rows that fulfill your conditions and set percentage to 60
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[2].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[2].right), # if elevation is less than or equal to right bound
           "Snow Percentage"] = 60 # then set percentage to 60
    # find all rows that fulfill your conditions and set percentage to 60
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[2].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[2].right), # if elevation is less than or equal to right bound
           "Rain Percentage"] = 60 # then set percentage to 60

    ### Fourth Range
    # find all rows that fulfill your conditions and set percentage to 80
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[3].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[3].right), # if elevation is less than or equal to right bound
           "Snow Percentage"] = 80 # then set percentage to 80
    # find all rows that fulfill your conditions and set percentage to 80
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[3].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[3].right), # if elevation is less than or equal to right bound
           "Rain Percentage"] = 80 # then set percentage to 80

    ### Fifth Range
    # find all rows that fulfill your conditions and set percentage to 100
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[4].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[4].right), # if elevation is less than or equal to right bound
           "Snow Percentage"] = 100 # then set percentage to 100
    # find all rows that fulfill your conditions and set percentage to 100
    dma_weather_data.loc[(dma_weather_data["Elevation"] > bins[4].left) & # if elevation is greater than left bound
            (dma_weather_data["Elevation"] <= bins[4].right), # if elevation is less than or equal to right bound
           "Rain Percentage"] = 100 # then set percentage to 100
    
    return dma_weather_data

In [39]:
dma_weather_data = add_percentage_columns(dma_weather_data)

In [40]:
dma_weather_data.head()

Unnamed: 0,Station ID,Observation Type,Observation Value,Elevation,DMA,Latitude,Longitude,Is Raining,Is Snowing,Snow Percentage,Rain Percentage
0,BF1FP000001,PRCP,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False,20,20
1,BF1FP000001,SNOW,0.0,2.1,West Palm Beach-Ft. Pierce,26.5481,-78.7028,False,False,20,20
2,BF1NP000001,PRCP,0.0,5.8,West Palm Beach-Ft. Pierce,24.9975,-77.4499,False,False,20,20
3,CA1BC000018,PRCP,30.0,168.2,Seattle-Tacoma,49.2979,-122.8464,True,False,20,20
4,CA1BC000032,PRCP,5.0,152.4,Seattle-Tacoma,49.259,-122.8591,True,False,20,20


## Step 5: Get the Data of whether it is rainy/snowy for a particular DMA

In [41]:
def get_weather_data_from_dma(dma_weather_data, dma_name, read_data_locally, local_data_filename):
    """
    Gets the data of whether it is rainy/snowy for a particular DMA
    
    :param dma_weather_data: Data from the DMA data source and whether it's rainy/snowy
    :param dma_name: Name of the DMA
    :param read_data_locally: Boolean to determine whether to read in the data locally
    :param local_data_filename: Name of the local data file containing DMA data and data whether it's rainy/snowy
    :returns: Dictionary containing the DMA, Is Raining, and Is Snowing keys
    """
    
    # checks if the user wants to read in data locally
    if read_data_locally:
        print("Reading in data locally..")
        dma_weather_data = pd.read_csv(local_data_filename)
    
    # tries to find an exact match for the DMA name
    dma_name_data = dma_weather_data[dma_weather_data["DMA"] == (dma_name)]
    
    # if there is no exact match, try to find similar matches
    if len(dma_name_data) == 0:
        print("There is no exact match. Finding similar matches instead..")
        dma_name_data = dma_weather_data[dma_weather_data["DMA"].str.contains(dma_name)]
        
    # gets the real DMA name and weather data
    real_dma_name = dma_name_data["DMA"].value_counts().index[0]
    is_raining = dma_name_data["Is Raining"].value_counts().index[0]
    is_snowing = dma_name_data["Is Snowing"].value_counts().index[0]
    min_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMIN"]["Observation Value"]) / 10
    max_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMAX"]["Observation Value"]) / 10
    min_temp_fahrenheit = round((min_temp * 9/5) + 32, 1)
    max_temp_fahrenheit = round((max_temp * 9/5) + 32, 1)
    
    # gets more weather data
    tobs_temp_obs = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TOBS"]["Observation Value"]) / 10
    dapr_days_prec = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "DAPR"]["Observation Value"]), 1)
    mdpr_prec_tot = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "MDPR"]["Observation Value"]) / 10, 1)
    tobs_fahrenheit = round((tobs_temp_obs * 9/5) + 32, 1)
    
    # makes DMA weather dict
    dma_weather_dict = {"DMA": real_dma_name, "Is Raining": is_raining, "Is Snowing": is_snowing, 
                        "Min Temperature (°F)": min_temp_fahrenheit, "Max Temperature (°F)": max_temp_fahrenheit,
                        "Temperature at Time of Observation (°F)": tobs_fahrenheit,
                        "Number of Days in Multiday Precipitation Total": dapr_days_prec,
                        "Multiday Precipitation Total (mm)": mdpr_prec_tot}
    return dma_weather_dict

In [42]:
weather_data = get_weather_data_from_dma(dma_weather_data, "Miami", False, "dma_weather_data.csv")

There is no exact match. Finding similar matches instead..


In [43]:
weather_data

{'DMA': 'Miami-Ft. Lauderdale',
 'Is Raining': False,
 'Is Snowing': False,
 'Min Temperature (°F)': 78.3,
 'Max Temperature (°F)': 93.1,
 'Temperature at Time of Observation (°F)': 81.7,
 'Number of Days in Multiday Precipitation Total': nan,
 'Multiday Precipitation Total (mm)': nan}

## Step 6: Access API to Get the Data of whether it is rainy/snowy for a particular DMA

In [44]:
def access_dma_api(dma_weather_data, dma_name):
    """
    Access the DMA API to get the data of whether it is rainy/snowy for a particular DMA
    
    :param dma_weather_data: Data from the DMA data source and whether it's rainy/snowy
    :param dma_name: Name of the DMA
    :returns: Dictionary containing the DMA, Is Raining, and Is Snowing keys
    """
    
    # tries to find an exact match for the DMA name
    dma_name_data = dma_weather_data[dma_weather_data["DMA"] == (dma_name)]
    
    # if there is no exact match, try to find similar matches
    if len(dma_name_data) == 0:
        print("There is no exact match. Finding similar matches instead...")
        dma_name_data = dma_weather_data[dma_weather_data["DMA"].str.contains(dma_name)]
        
    # if there are still no matches, return empty dictionary
    if len(dma_name_data) == 0:
        print("There are no matches for the given DMA name. Please enter a different DMA name.")
        return {}
    
    # gets the real DMA name and weather data
    real_dma_name = dma_name_data["DMA"].value_counts().index[0]
    is_raining = dma_name_data["Is Raining"].value_counts().index[0]
    is_snowing = dma_name_data["Is Snowing"].value_counts().index[0]
    min_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMIN"]["Observation Value"]) / 10
    max_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMAX"]["Observation Value"]) / 10
    min_temp_fahrenheit = round((min_temp * 9/5) + 32, 1)
    max_temp_fahrenheit = round((max_temp * 9/5) + 32, 1)
    
    # gets more weather data
    tobs_temp_obs = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TOBS"]["Observation Value"]) / 10
    dapr_days_prec = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "DAPR"]["Observation Value"]), 1)
    mdpr_prec_tot = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "MDPR"]["Observation Value"]) / 10, 1)
    tobs_fahrenheit = round((tobs_temp_obs * 9/5) + 32, 1)
    
    # gets elevation and rain/snow percentages
    elevation = dma_name_data["Elevation"].value_counts().index[0]
    rain_percentage = dma_name_data["Rain Percentage"].value_counts().index[0]
    snow_percentage = dma_name_data["Snow Percentage"].value_counts().index[0]
    
    # makes DMA weather dict
    dma_weather_dict = {"DMA": real_dma_name, "Is Raining": is_raining, "Is Snowing": is_snowing, 
                        "Min Temperature (°F)": min_temp_fahrenheit, "Max Temperature (°F)": max_temp_fahrenheit,
                        "Temperature at Time of Observation (°F)": tobs_fahrenheit,
                        "Number of Days in Multiday Precipitation Total": dapr_days_prec,
                        "Multiday Precipitation Total (mm)": mdpr_prec_tot,
                        "Elevation": elevation, "Rain Percentage": rain_percentage, "Snow Percentage": snow_percentage}
    return dma_weather_dict

In [45]:
dma_name = "San Diego"
api_data = access_dma_api(dma_weather_data, dma_name)

In [46]:
api_data

{'DMA': 'San Diego',
 'Is Raining': False,
 'Is Snowing': False,
 'Min Temperature (°F)': 66.9,
 'Max Temperature (°F)': 98.2,
 'Temperature at Time of Observation (°F)': 88.0,
 'Number of Days in Multiday Precipitation Total': nan,
 'Multiday Precipitation Total (mm)': nan,
 'Elevation': 1079.0,
 'Rain Percentage': 20,
 'Snow Percentage': 20}

# Machine Learning Project

## Step 1: Web scrape Amazon for products used in rainy or snowy weather

In [47]:
def combine_features(row):
    """
    Combine the text features into one string
    
    :param row: The current row in the DataFrame
    :returns: Text features in one string
    """
    
    try:
        return row['Product Code'] +" "+row['Product Title']+" "+str(row["Price"])
    except:
        print("Error:", row)

In [48]:
def get_products_data():
    """
    Gets the Amazon products data
    
    :returns: DataFrame containing Amazon products
    """
    
    # reads in data and adds columns to determine if the product is for snow or rain
    amazon_products = pd.read_excel(
        "search_history/SEARCH_HISTORY_2020-07-29 19h13m.xlsx", encoding="utf-8", errors="ignore")
    amazon_products["is_snow_product"] = amazon_products["code"].apply(
        lambda prod_code: True if "snow" in prod_code else False)
    amazon_products["is_rain_product"] = amazon_products["code"].apply(
        lambda prod_code: True if "rain" in prod_code else False)
    
    # get relevant columns
    relevant_cols = ["code", "url", "title", "price", "review_score", "review_count", "stock", "rank", "category"]
    amazon_products = amazon_products[relevant_cols]
    
    # add columns with 0 as default value
    amazon_products["Snow Percentage"] = 0
    amazon_products["Rain Percentage"] = 0

    ### First Range
    # find all rows that fulfill your conditions and set percentage to 100
    amazon_products.loc[(amazon_products["code"].str.contains("snow")) & # if item is for snow
            (amazon_products["rank"] == 1), # if rank is the specified value
           "Snow Percentage"] = 100 # then set percentage to 100
    amazon_products.loc[(amazon_products["code"].str.contains("rain")) & # if item is for rain
            (amazon_products["rank"] == 1), # if rank is the specified value
           "Rain Percentage"] = 100 # then set percentage to 100

    ### Second Range
    # find all rows that fulfill your conditions and set percentage to 80
    amazon_products.loc[(amazon_products["code"].str.contains("snow")) & # if item is for snow
            (amazon_products["rank"] == 2), # if rank is the specified value
           "Snow Percentage"] = 80 # then set percentage to 80
    amazon_products.loc[(amazon_products["code"].str.contains("rain")) & # if item is for rain
            (amazon_products["rank"] == 2), # if rank is the specified value
           "Rain Percentage"] = 80 # then set percentage to 80

    ### Third Range
    # find all rows that fulfill your conditions and set percentage to 60
    amazon_products.loc[(amazon_products["code"].str.contains("snow")) & # if item is for snow
            (amazon_products["rank"] == 3), # if rank is the specified value
           "Snow Percentage"] = 60 # then set percentage to 60
    amazon_products.loc[(amazon_products["code"].str.contains("rain")) & # if item is for rain
            (amazon_products["rank"] == 3), # if rank is the specified value
           "Rain Percentage"] = 60 # then set percentage to 60

    ### Fourth Range
    # find all rows that fulfill your conditions and set percentage to 40
    amazon_products.loc[(amazon_products["code"].str.contains("snow")) & # if item is for snow
            (amazon_products["rank"] == 4), # if rank is the specified value
           "Snow Percentage"] = 40 # then set percentage to 40
    amazon_products.loc[(amazon_products["code"].str.contains("rain")) & # if item is for rain
            (amazon_products["rank"] == 4), # if rank is the specified value
           "Rain Percentage"] = 40 # then set percentage to 40

    ### Fifth Range
    # find all rows that fulfill your conditions and set percentage to 20
    amazon_products.loc[(amazon_products["code"].str.contains("snow")) & # if item is for snow
            (amazon_products["rank"] == 5), # if rank is the specified value
           "Snow Percentage"] = 20 # then set percentage to 20
    amazon_products.loc[(amazon_products["code"].str.contains("rain")) & # if item is for rain
            (amazon_products["rank"] == 5), # if rank is the specified value
           "Rain Percentage"] = 20 # then set percentage to 20
    
    # rename columns and remove snow products for now
    col_names = ["Product Code", "Product URL", "Product Title", "Price", 
                 "Review Score", "Review Count", "Stock", "Rank", "Category", 
                 "Snow Percentage", "Rain Percentage"]
    amazon_products.columns = col_names
    amazon_products = amazon_products[~(amazon_products["Category"].str.contains("snow"))].reset_index(drop=True)
    
    # makes index column and gets numerical features
    amazon_products['index'] = amazon_products.index
    numerical_features = ["Price","Review Score","Review Count"]

    # fills each NaN value with 0 for each numerical feature
    for feature in numerical_features:
        amazon_products[feature] = amazon_products[feature].fillna(0.0)
        
    # combines each feature into one column
    amazon_products["combined_features"] = amazon_products.apply(combine_features,axis=1)
    
    return amazon_products

In [49]:
amazon_products = get_products_data()

In [50]:
display(amazon_products.shape)
amazon_products.head(2)

(29, 13)

Unnamed: 0,Product Code,Product URL,Product Title,Price,Review Score,Review Count,Stock,Rank,Category,Snow Percentage,Rain Percentage,index,combined_features
0,repel_rain_umbrella,https://www.amazon.com/Repel-Windproof-Travel-...,Repel Umbrella Windproof Double Vented Travel ...,0.0,0.0,0.0,Available,1,rain_umbrella,0,100,0,repel_rain_umbrella Repel Umbrella Windproof D...
1,g4free_rain_umbrella,https://www.amazon.com/G4Free-Automatic-Windpr...,G4Free 54/62/68 Inch Automatic Open Golf Umbre...,0.0,0.0,0.0,Available,2,rain_umbrella,0,80,1,g4free_rain_umbrella G4Free 54/62/68 Inch Auto...


## Step 2: Perform Feature Engineering
- Remove all the NaN values
- Combine all the fields into a large string and put into new column called features
- Remove the stop words, or articles
- Compute count matrix
- Compute cosine similarity

In [51]:
def get_title_from_index(index):
    """
    Gets the product title from the index
    
    :param index: The row index
    :returns: Product titles
    """
    
    return amazon_products[amazon_products.index == index]["Product Title"].values[0]

def get_category_from_index(index):
    """
    Gets the product category from the index
    
    :param index: The row index
    :returns: Product categories
    """
    
    return amazon_products[amazon_products.index == index]["Category"].values[0]

def get_home(index):
    """
    Gets the product URL from the index
    
    :param index: The row index
    :returns: Product URLs
    """
    
    return amazon_products[amazon_products.index == index]["Product URL"].values[0]

def get_index_from_rain_percentage(rain_percentage):
    """
    Gets the rain percentage from the index
    
    :param index: The row index
    :returns: Rain percentages
    """
    
    df = amazon_products[amazon_products["Rain Percentage"] == rain_percentage].index.values[0]
    return df

In [52]:
def get_product_recommendations(amazon_products, api_data):
    """
    Access the DMA API to get the data of whether it is rainy/snowy for a particular DMA
    
    :param amazon_products: Data of Amazon products
    :param api_data: API data
    :returns: Dictionary containing the DMA, weather, and products data
    """
    
    # makes index column and numerical features list
    amazon_products['index'] = amazon_products.index
    numerical_features = ["Price","Review Score","Review Count"]

    # fill NaN values with 0
    for feature in numerical_features:
        amazon_products[feature] = amazon_products[feature].fillna(0.0)
        
    # perform feature engineering to get text features in one column
    amazon_products["combined_features"] = amazon_products.apply(combine_features,axis=1)
    stop_words = stopwords.words('english')
    amazon_products['combined_features'] = amazon_products['combined_features'].str.lower().str.split()
    amazon_products["features"] = amazon_products["combined_features"].apply(
        lambda x: [word for word in x if word not in stop_words])
    amazon_products["features"] = amazon_products["features"].apply(lambda x: " ".join(x))
    
    # compute count matrix and cosine similarity
    cv = CountVectorizer()
    count_matrix = cv.fit_transform(amazon_products["features"])
    cosine_sim = cosine_similarity(count_matrix)
    cur_rain_percentage = api_data["Rain Percentage"]
    
    # get recommended products for the rain
    rain_product_index = get_index_from_rain_percentage(cur_rain_percentage)
    rain_similar_products =  list(enumerate(cosine_sim[rain_product_index]))
    rain_sorted_similar_products = sorted(rain_similar_products,key=lambda x:x[1],reverse=True)
    
    # number of recommended items
    num_recommended_items = 6
    i = 1
    
    # lists for product info
    product_names = []
    product_links = []
    product_categories = []
    cosine_sim_values = []
    
    # get the top recommended products
    for element in rain_sorted_similar_products:
        cur_product_name = get_title_from_index(element[0])
        cur_product_link = get_home(element[0])
        cur_product_category = get_category_from_index(element[0])
        i += 1
        
        if cur_product_category in product_categories:
            i -= 1
        else:
            product_names.append(cur_product_name)
            product_links.append(cur_product_link)
            product_categories.append(cur_product_category)
            cosine_sim_values.append(round(element[1], 2))

        if i > num_recommended_items:
            break
            
    # makes recommended products DataFrame
    recommended_products_df = pd.DataFrame(np.column_stack(
        [product_names, product_links, product_categories]), 
        columns = ["Product Name", "Product Link", "Product Category"])
    return recommended_products_df

In [53]:
recommended_products = get_product_recommendations(amazon_products, api_data)
recommended_products

Unnamed: 0,Product Name,Product Link,Product Category
0,STROMBERGBRAND UMBRELLAS Spectrum Popular Styl...,https://www.amazon.com/StrombergBrand-Spectrum...,rain_umbrella
1,"Sharpty Inverted Umbrella, Umbrella Windproof,...",https://www.amazon.com/Sharpty-Inverted-Windpr...,inverted_rain_umbrella
2,Sloggers Women's Waterproof Rain and Garden Bo...,https://www.amazon.com/Sloggers-impermeables-l...,womens_rain_boots
3,"Servus XTP 15"" PVC Chemical-Resistant Steel To...",https://www.amazon.com/Servus-Chemical-Resista...,mens_rain_boots
4,Columbia Women's Switchback Lined Long Jacket,https://www.amazon.com/Columbia-Womens-Switchb...,womens_rain_jacket
5,The North Face Men's Resolve Waterproof Jacket,https://www.amazon.com/North-Face-Mens-Resolve...,mens_rain_jacket


## Step 3: Make product links as hyperlinks
## Step 4: Get product recommendations

In [54]:
def access_product_recommendations_api(dma_weather_data, dma_name, amazon_products):
    """
    Access the Product Recommendations API to get product recommendations based off DMA weather data
    
    :param dma_weather_data: Data from the DMA data source and whether it's rainy/snowy
    :param dma_name: Name of the DMA
    :param amazon_products: Data of Amazon products
    :returns: DataFrame containing product recommendations
    """
    
    # tries to find an exact match for the DMA name
    dma_name_data = dma_weather_data[dma_weather_data["DMA"] == (dma_name)]
    
    # if there is no exact match, try to find similar matches
    if len(dma_name_data) == 0:
        print("There is no exact match. Finding similar matches instead...")
        dma_name_data = dma_weather_data[dma_weather_data["DMA"].str.contains(dma_name)]
    
    # if there are still no matches, return empty dictionary
    if len(dma_name_data) == 0:
        print("There are no matches for the given DMA name. Please enter a different DMA name.")
        return {}
    
    # gets the real DMA name and weather data
    real_dma_name = dma_name_data["DMA"].value_counts().index[0]
    is_raining = dma_name_data["Is Raining"].value_counts().index[0]
    is_snowing = dma_name_data["Is Snowing"].value_counts().index[0]
    min_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMIN"]["Observation Value"]) / 10
    max_temp = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TMAX"]["Observation Value"]) / 10
    min_temp_fahrenheit = round((min_temp * 9/5) + 32, 1)
    max_temp_fahrenheit = round((max_temp * 9/5) + 32, 1)
       
    # gets more weather data
    tobs_temp_obs = np.mean(dma_name_data[dma_name_data["Observation Type"] == "TOBS"]["Observation Value"]) / 10
    dapr_days_prec = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "DAPR"]["Observation Value"]), 1)
    mdpr_prec_tot = round(np.mean(dma_name_data[dma_name_data["Observation Type"] == "MDPR"]["Observation Value"]) / 10, 1)
    tobs_fahrenheit = round((tobs_temp_obs * 9/5) + 32, 1)
    
    # gets elevation and rain/snow percentages
    elevation = dma_name_data["Elevation"].value_counts().index[0]
    rain_percentage = dma_name_data["Rain Percentage"].value_counts().index[0]
    snow_percentage = dma_name_data["Snow Percentage"].value_counts().index[0]
    
    # makes DMA weather dict
    dma_weather_dict = {"DMA": real_dma_name, "Is Raining": is_raining, "Is Snowing": is_snowing, 
                        "Min Temperature (°F)": min_temp_fahrenheit, "Max Temperature (°F)": max_temp_fahrenheit,
                        "Temperature at Time of Observation (°F)": tobs_fahrenheit,
                        "Number of Days in Multiday Precipitation Total": dapr_days_prec,
                        "Multiday Precipitation Total (mm)": mdpr_prec_tot,
                        "Elevation": elevation, "Rain Percentage": rain_percentage, "Snow Percentage": snow_percentage}
    
    # gets product recommendations and makes links as hyperlinks
    recommended_products = get_product_recommendations(amazon_products, dma_weather_dict)
    recommended_products["Product Link"] = recommended_products["Product Link"].apply(
        '<a href="{0}/">Item</a>'.format)
    
    # save recommended products as csv and excel files
    print("saving recommended products as csv and excel files")
    recommended_products.to_csv("product_recommendations.csv", index=False)
    recommended_products.to_excel("product_recommendations.xlsx", index=False)
    
    # save recommended products as json file
    print("saving recommended products as json file")
    your_json = recommended_products.to_json(orient="records")
    parsed = json.loads(your_json)
    with open('product_recommendations.json', 'w', encoding='utf-8') as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)
    with open('json_to_html/product_recommendations.json', 'w', encoding='utf-8') as f:
        json.dump(parsed, f, ensure_ascii=False, indent=4)
    
    return recommended_products

In [55]:
product_recommendations = access_product_recommendations_api(dma_weather_data, dma_name, amazon_products)
product_recommendations

saving recommended products as csv and excel files
saving recommended products as json file


Unnamed: 0,Product Name,Product Link,Product Category
0,STROMBERGBRAND UMBRELLAS Spectrum Popular Styl...,"<a href=""https://www.amazon.com/StrombergBrand...",rain_umbrella
1,"Sharpty Inverted Umbrella, Umbrella Windproof,...","<a href=""https://www.amazon.com/Sharpty-Invert...",inverted_rain_umbrella
2,Sloggers Women's Waterproof Rain and Garden Bo...,"<a href=""https://www.amazon.com/Sloggers-imper...",womens_rain_boots
3,"Servus XTP 15"" PVC Chemical-Resistant Steel To...","<a href=""https://www.amazon.com/Servus-Chemica...",mens_rain_boots
4,Columbia Women's Switchback Lined Long Jacket,"<a href=""https://www.amazon.com/Columbia-Women...",womens_rain_jacket
5,The North Face Men's Resolve Waterproof Jacket,"<a href=""https://www.amazon.com/North-Face-Men...",mens_rain_jacket


## Final Runthrough

In [62]:
dma_name = "San Diego"
product_recommendations = access_product_recommendations_api(dma_weather_data, dma_name, amazon_products)
product_recommendations

saving recommended products as csv and excel files
saving recommended products as json file


Unnamed: 0,Product Name,Product Link,Product Category
0,STROMBERGBRAND UMBRELLAS Spectrum Popular Styl...,"<a href=""https://www.amazon.com/StrombergBrand...",rain_umbrella
1,"Sharpty Inverted Umbrella, Umbrella Windproof,...","<a href=""https://www.amazon.com/Sharpty-Invert...",inverted_rain_umbrella
2,Sloggers Women's Waterproof Rain and Garden Bo...,"<a href=""https://www.amazon.com/Sloggers-imper...",womens_rain_boots
3,"Servus XTP 15"" PVC Chemical-Resistant Steel To...","<a href=""https://www.amazon.com/Servus-Chemica...",mens_rain_boots
4,Columbia Women's Switchback Lined Long Jacket,"<a href=""https://www.amazon.com/Columbia-Women...",womens_rain_jacket
5,The North Face Men's Resolve Waterproof Jacket,"<a href=""https://www.amazon.com/North-Face-Men...",mens_rain_jacket
