In [1]:
# Run the shared setup notebook; the output below reports a successful connection to the leetcode50 database.
# NOTE(review): the trailing `import display_table` is handed to %run as arguments — presumably the helper uses it to expose display_table; confirm.
%run helper/setup_notebook.ipynb import display_table

Successfully connected to leetcode50 database.


In [2]:
# Preview the Patients table (display_table is presumably provided by the setup notebook — confirm).
display_table('Patients')

+------------+--------------+--------------+
| patient_id | patient_name |  conditions  |
+------------+--------------+--------------+
|     1      |    Daniel    |  YFEV COUGH  |
|     2      |    Alice     |              |
|     3      |     Bob      | DIAB100 MYOP |
|     4      |    George    | ACNE DIAB100 |
|     5      |    Alain     |   DIAB201    |
+------------+--------------+--------------+


### Find the patient_id, patient_name, and conditions of the patients who have Type I Diabetes. A Type I Diabetes condition code always starts with the `DIAB1` prefix.
```
+------------+--------------+--------------+
| patient_id | patient_name | conditions   |
+------------+--------------+--------------+
| 3          | Bob          | DIAB100 MYOP |
| 4          | George       | ACNE DIAB100 | 
+------------+--------------+--------------+
Explanation: Bob and George both have a condition that starts with DIAB1.
```

In [3]:
%%sql 

-- Naive attempt: '%DIAB1%' matches DIAB1 anywhere in the string, not only at
-- the start of a condition code, so it fails a test case where the
-- condition name is `01DIAB18900`.

SELECT p.patient_id, p.patient_name, p.conditions
FROM Patients AS p
WHERE p.conditions LIKE '%DIAB1%'

patient_id,patient_name,conditions
3,Bob,DIAB100 MYOP
4,George,ACNE DIAB100


In [4]:
%%sql 
-- A matching code either opens the string ('DIAB1%') or follows a space
-- ('% DIAB1%'). The leading space is what lets rows that list several
-- conditions match without accepting codes that merely contain DIAB1.
SELECT p.patient_id, p.patient_name, p.conditions
FROM Patients AS p
WHERE p.conditions LIKE 'DIAB1%'
   OR p.conditions LIKE '% DIAB1%'

patient_id,patient_name,conditions
3,Bob,DIAB100 MYOP
4,George,ACNE DIAB100


## Using `regex`

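#### `\b` represents a word boundary in regular expressions. It matches a position where a word character (alphanumeric or underscore) is adjacent to a non-word character (such as whitespace or punctuation) or to the start/end of the string. In the query below the backslash is doubled (`'\\bDIAB1'`) because a backslash is itself an escape character inside the SQL string literal.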

In [5]:
%%sql 

-- Keep rows in which DIAB1 begins a word (word-boundary anchor before DIAB1).
SELECT p.patient_id, p.patient_name, p.conditions
FROM Patients AS p
WHERE p.conditions REGEXP '\\bDIAB1'

patient_id,patient_name,conditions
3,Bob,DIAB100 MYOP
4,George,ACNE DIAB100


## Using `SUBSTRING_INDEX`

In [6]:
%%sql

-- SUBSTRING_INDEX(str, delim, n): with n = 1 it keeps the text before the
-- first delimiter; with n = -1 it keeps the text after the last delimiter.
SELECT SUBSTRING_INDEX('Sloppy Joe', ' ',  1) AS first_word
     , SUBSTRING_INDEX('Sloppy Joe', ' ', -1) AS last_word;

first_word,last_word
Sloppy,Joe


In [7]:
%%sql 
-- Take the text before the first space and the text after the last space.
-- A row holding a single code (no space) yields that code in both columns.
SELECT SUBSTRING_INDEX(p.conditions, ' ',  1) AS first_half
     , SUBSTRING_INDEX(p.conditions, ' ', -1) AS second_half
FROM Patients AS p

first_half,second_half
YFEV,COUGH
,
DIAB100,MYOP
ACNE,DIAB100
DIAB201,DIAB201


In [8]:
%%sql 

-- First five characters of each conditions string (SUBSTR positions start at 1).
-- The expression is left unaliased, so its text becomes the result column header.
SELECT 
    SUBSTR(conditions, 1, 5)
FROM Patients

"SUBSTR(conditions, 1, 5)"
YFEV
DIAB1
ACNE
DIAB2


In [9]:
%%sql
-- This solution fails the test cases: it inspects only the first and the
-- last space-separated code, so a DIAB1 code sitting in the middle of three
-- or more conditions is never examined.
WITH edge_codes AS (
    SELECT p.patient_id
         , p.patient_name
         , p.conditions
         , SUBSTRING_INDEX(p.conditions, ' ',  1) AS first_code
         , SUBSTRING_INDEX(p.conditions, ' ', -1) AS last_code
    FROM Patients AS p
)
SELECT patient_id, patient_name, conditions
FROM edge_codes
WHERE 'DIAB1' IN (SUBSTR(first_code, 1, 5), SUBSTR(last_code, 1, 5));

patient_id,patient_name,conditions
3,Bob,DIAB100 MYOP
4,George,ACNE DIAB100


## Using Pandas

In [10]:
# Fetch the whole Patients table through the %sql line magic, then convert the result set to a DataFrame.
patients_query = %sql SELECT * FROM Patients # type: ignore
patients_df = patients_query.DataFrame()

# Show the loaded frame so the rows can be compared with the SQL results above.
display(patients_df)

Unnamed: 0,patient_id,patient_name,conditions
0,1,Daniel,YFEV COUGH
1,2,Alice,
2,3,Bob,DIAB100 MYOP
3,4,George,ACNE DIAB100
4,5,Alain,DIAB201


In [11]:
# Inspect the raw conditions column.
patients_df['conditions']

0      YFEV COUGH
1                
2    DIAB100 MYOP
3    ACNE DIAB100
4         DIAB201
Name: conditions, dtype: object

In [12]:
# Plain pattern search: flags every row where DIAB1 appears anywhere in the string,
# not only at the start of a condition code.
patients_df['conditions'].str.contains('DIAB1')

0    False
1    False
2     True
3     True
4    False
Name: conditions, dtype: bool

In [13]:
# Anchor DIAB1 to the start of the string or to a preceding space so that only
# codes that *begin* with DIAB1 match. A bare 'DIAB1' alternative would match
# anywhere in the string (e.g. '01DIAB18900') and make the ' DIAB1' branch redundant.
# The group is non-capturing so pandas does not warn about match groups.
patients_df.conditions.str.contains(r'(?:^| )DIAB1')

0    False
1    False
2     True
3     True
4    False
Name: conditions, dtype: bool

In [14]:
# Keep only rows where some condition code begins with DIAB1: the pattern requires
# DIAB1 at the start of the string or right after a space. A bare 'DIAB1'
# alternative would match anywhere in the string (e.g. '01DIAB18900').
patients_df.loc[patients_df.conditions.str.contains(r'(?:^| )DIAB1')]

Unnamed: 0,patient_id,patient_name,conditions
2,3,Bob,DIAB100 MYOP
3,4,George,ACNE DIAB100


### Using `query()`

In [15]:
# Same filter expressed through query(): DIAB1 must sit at the start of the string
# or right after a space. A bare 'DIAB1' alternative would match anywhere in the
# string (e.g. '01DIAB18900'), so the pattern is anchored instead.
patients_df.query("conditions.str.contains('(?:^| )DIAB1')")

Unnamed: 0,patient_id,patient_name,conditions
2,3,Bob,DIAB100 MYOP
3,4,George,ACNE DIAB100
