In [1]:
%run helper/setup_notebook.ipynb import display_table

Successfully connected to leetcode50 database.


In [2]:
# Show the Insurance table using the display_table helper brought in by the setup notebook run in the first cell.
display_table('Insurance')

+-----+----------+----------+------+------+
| pid | tiv_2015 | tiv_2016 | lat  | lon  |
+-----+----------+----------+------+------+
|  1  |   10.0   |   5.0    | 10.0 | 10.0 |
|  2  |   20.0   |   20.0   | 20.0 | 20.0 |
|  3  |   10.0   |   30.0   | 20.0 | 20.0 |
|  4  |   10.0   |   40.0   | 40.0 | 40.0 |
+-----+----------+----------+------+------+


### Write an SQL query to report the sum of all total investment values in 2016 (`tiv_2016`) for all policyholders who:

* have the same tiv_2015 value as one or more other policyholders, and
* are not located in the same city as any other policyholder (i.e., the (lat, lon) attribute pairs must be unique).

#### *Round tiv_2016 to two decimal places.*
```
+----------+
| tiv_2016 |
+----------+
| 45.00    |
+----------+
Explanation: 
The first record in the table, like the last record, meets both of the two criteria.
The tiv_2015 value 10 is the same as the third and fourth records, and its location is unique.

The second record meets neither of the two criteria. Its tiv_2015 differs from every other policyholder's, and its location is the same as the third record's, which makes the third record fail, too.
So, the result is the sum of tiv_2016 of the first and last record, which is 45.
```


In [3]:
%%sql 

SELECT i.tiv_2015
FROM Insurance AS i
GROUP BY i.tiv_2015
-- keep only the amounts that appear on two or more policies
HAVING COUNT(i.tiv_2015) >= 2


tiv_2015
10.0


In [4]:
%%sql 

SELECT pid, SUM(tiv_2016)
FROM Insurance
-- in the sample data every pid is a single row, so each sum equals that row's tiv_2016
GROUP BY pid


pid,SUM(tiv_2016)
1,5.0
2,20.0
3,30.0
4,40.0


In [5]:
%%sql 

SELECT 
    lat,
    lon 
FROM Insurance
GROUP BY lat, lon
-- each group is one distinct location, so a single row means no other policy shares it
-- COUNT(*) counts the rows of the group once instead of counting lat and lon separately
HAVING COUNT(*) = 1

lat,lon
10.0,10.0
40.0,40.0


In [6]:
%%sql 

SELECT 
    pid,
    SUM(tiv_2016)
FROM Insurance
-- keep only the policies whose (lat, lon) pair belongs to exactly one row
WHERE (lat, lon) IN (
    SELECT 
        lat,
        lon 
    FROM Insurance
    GROUP BY lat, lon
    -- one row per location group means the location is unique
    HAVING COUNT(*) = 1
)
GROUP BY pid

pid,SUM(tiv_2016)
1,5.0
4,40.0


In [7]:
%%sql 

SELECT 
    ROUND(SUM(tiv_2016), 2) AS tiv_2016
FROM Insurance
-- condition 1, the location is not shared with any other policy
WHERE (lat, lon) IN (
    SELECT 
        lat,
        lon 
    FROM Insurance
    GROUP BY lat, lon
    -- one row per location group means the location is unique
    HAVING COUNT(*) = 1
-- condition 2, the 2015 amount is shared with at least one other policy
) AND tiv_2015 IN (
    SELECT 
        tiv_2015
    FROM Insurance
    GROUP BY tiv_2015
    HAVING COUNT(tiv_2015) > 1
)

tiv_2016
45.0


# Using Pandas

In [8]:
# Fetch every row of the Insurance table through the %sql line magic and convert
# the result set into a DataFrame via its .DataFrame() method.
# NOTE(review): the text after `%sql` on the next line, including the trailing
# `# type: ignore`, is presumably forwarded to the magic as part of the statement;
# confirm the database accepts `#` as a comment marker.
insurance_query = %sql SELECT * FROM Insurance # type: ignore
insurance_df = insurance_query.DataFrame()

display(insurance_df)

Unnamed: 0,pid,tiv_2015,tiv_2016,lat,lon
0,1,10.0,5.0,10.0,10.0
1,2,20.0,20.0,20.0,20.0
2,3,10.0,30.0,20.0,20.0
3,4,10.0,40.0,40.0,40.0


We need to use the `duplicated()` method with the `keep=False` parameter to create a boolean mask.

```python
        mask = insurance_df.duplicated(subset=['lat', 'lon'], keep=False)
        filtered_df = insurance_df[~mask]
```
The `duplicated(subset=['lat', 'lon'], keep=False)` method generates a boolean mask that marks every row whose ('lat', 'lon') pair appears more than once, including the first occurrence. Negating the mask with `~` therefore keeps only the rows whose location is unique.


In [9]:
# True for every row whose (lat, lon) pair appears on more than one row.
insurance_df.duplicated(['lat', 'lon'], keep=False)

0    False
1     True
2     True
3    False
dtype: bool

In [10]:
# Flag rows that share their (lat, lon) pair with another row, then keep the rest.
mask = insurance_df.duplicated(['lat', 'lon'], keep=False)
filtered_df = insurance_df.loc[~mask]
filtered_df

Unnamed: 0,pid,tiv_2015,tiv_2016,lat,lon
0,1,10.0,5.0,10.0,10.0
3,4,10.0,40.0,40.0,40.0


In [11]:
# Rows whose tiv_2015 amount is held by at least two policies.
(
    insurance_df
    .groupby('tiv_2015')
    .filter(lambda group: len(group) >= 2)
)

Unnamed: 0,pid,tiv_2015,tiv_2016,lat,lon
0,1,10.0,5.0,10.0,10.0
2,3,10.0,30.0,20.0,20.0
3,4,10.0,40.0,40.0,40.0


In [12]:
# Same filter as the previous step, narrowed to the tiv_2015 column (a Series).
(
    insurance_df
    .groupby('tiv_2015')
    .filter(lambda group: len(group) >= 2)
    .loc[:, 'tiv_2015']
)

0    10.0
2    10.0
3    10.0
Name: tiv_2015, dtype: float64

In [13]:
# Distinct tiv_2015 amounts that occur on more than one policy, kept as a
# one-column DataFrame.
unique_tiv_2015_df = (
    insurance_df
    .groupby('tiv_2015')
    .filter(lambda group: len(group) > 1)
    .loc[:, ['tiv_2015']]
    .drop_duplicates()
)

unique_tiv_2015_df

Unnamed: 0,tiv_2015
0,10.0


In [14]:
# Read the first shared amount by position. The surviving rows keep their
# original index labels, so label 0 exists only when the first table row happens
# to be one of the duplicates; .iloc[0] does not depend on that.
unique_tiv_2015_df['tiv_2015'].iloc[0]

10.0

In [15]:
# Store the first shared tiv_2015 amount. Positional access is used because the
# row keeps its original index label, which is 0 only when the first table row
# is among the duplicates.
unique_tiv_2015_value = unique_tiv_2015_df['tiv_2015'].iloc[0]
unique_tiv_2015_value

10.0

In [16]:
# Preview the rows whose tiv_2015 is one of the shared amounts. isin() matches
# every shared amount, whereas comparing against one scalar would drop policies
# as soon as more than one amount is duplicated.
filtered_df[filtered_df['tiv_2015'].isin(unique_tiv_2015_df['tiv_2015'])]

Unnamed: 0,pid,tiv_2015,tiv_2016,lat,lon
0,1,10.0,5.0,10.0,10.0
3,4,10.0,40.0,40.0,40.0


In [17]:
# Restrict the unique-location rows to policies whose tiv_2015 is shared with at
# least one other policy. isin() handles any number of shared amounts rather
# than only a single value.
filtered_df = filtered_df[filtered_df['tiv_2015'].isin(unique_tiv_2015_df['tiv_2015'])]
filtered_df

Unnamed: 0,pid,tiv_2015,tiv_2016,lat,lon
0,1,10.0,5.0,10.0,10.0
3,4,10.0,40.0,40.0,40.0


In [18]:
# Intermediate experiment: grouping by tiv_2016 itself sums each distinct value
# separately, so this yields one row per value rather than a single total.
filtered_df.groupby('tiv_2016').agg({'tiv_2016':'sum'})

Unnamed: 0_level_0,tiv_2016
tiv_2016,Unnamed: 1_level_1
5.0,5.0
40.0,40.0


In [20]:
# Final answer: one grand total of tiv_2016 over the qualifying policies, rounded
# to two decimals as the task requires. Summing the whole column (instead of
# grouping by tiv_2015) always yields a single row, even when several different
# tiv_2015 amounts qualify. to_frame().T turns the one-element Series into a
# one-row DataFrame with a tiv_2016 column.
filtered_df[['tiv_2016']].sum().round(2).to_frame().T

Unnamed: 0,tiv_2016
0,45.0
