# Prediction

In [30]:
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from io import StringIO
from _datetime import date

from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## Step 1: Boolean classifier

In [31]:
# Load the pickled level-0 (boolean) classifier.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load trusted model files.
# filename = '../02MLAlgorithms/models/2021-06-06_Level0_LinearSVC.pkl'
filename = '../../02MLAlgorithms/01hierarchical/models/2021-06-06_Level0_LR.pkl'

# A context manager closes the file handle as soon as the model is unpickled.
with open(filename, 'rb') as model_file:
    nb_model = pickle.load(model_file)

In [32]:
# for param, value in nb_model.get_params(deep=True).items():
#     print(f"{param} -> {value}")

### Using the ML algorithm to predict the tags from sentences

In [33]:
# Run the level-0 model over every chunk of the complete dataset (files 0..10),
# save each chunk's predictions, and collect the per-(condition, tag) row counts.
grouped_counts = []
for i in range(0,11,1):
    print(i)
    dfprediction = pd.read_csv('../../../completeDataset/2021-05-31_dataset'+str(i)+'.csv')
    result = nb_model.predict(dfprediction.sentence)
    dfprediction['predictedTag'] = result
    dfprediction.sort_values(by=['conditionName', 'predictedTag'],inplace=True,ascending=[True,True])
    dfprediction.to_csv('L0results/'+str(date.today())+'_L0_predictedSet'+str(i)+'.csv')
    print('Predictions'+str(i))

    # Non-null counts of every column per (conditionName, predictedTag) for this chunk.
    grouped_counts.append(dfprediction.groupby(['conditionName','predictedTag'],dropna=False).count())
    print('Stats'+str(i))

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every iteration; concatenating the collected chunks once gives the same rows.
s = pd.concat(grouped_counts)
s.to_csv('L0results/'+str(date.today())+'_L0_GroupedPredicted.csv')

0
Predictions0
Stats0
1
Predictions1
Stats1
2
Predictions2
Stats2
3
Predictions3
Stats3
4
Predictions4
Stats4
5
Predictions5
Stats5
6
Predictions6
Stats6
7
Predictions7
Stats7
8
Predictions8
Stats8
9
Predictions9
Stats9
10
Predictions10
Stats10


# Review statistics

In [34]:
# Per-chunk counts of sentences predicted as 0 (no CES) or 1 (describes a CES).
# Same-day alternative path: 'L0results/'+str(date.today())+'_L0_GroupedPredicted.csv'
grouped_stats_path = 'L0results/2021-06-06_L0_GroupedPredicted.csv'
dfstats = pd.read_csv(grouped_stats_path)

# Column summary first, then a preview of the first rows.
dfstats.info()
dfstats.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3299 entries, 0 to 3298
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   conditionName  3299 non-null   object
 1   predictedTag   3299 non-null   int64 
 2   Unnamed: 0     3299 non-null   int64 
 3   Unnamed: 0.1   3299 non-null   int64 
 4   source         3299 non-null   int64 
 5   sentence       3299 non-null   int64 
 6   clearSentence  3299 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 180.5+ KB


Unnamed: 0.2,conditionName,predictedTag,Unnamed: 0,Unnamed: 0.1,source,sentence,clearSentence
0,ACL injury,0,97,97,97,97,95
1,ACL injury,1,7,7,7,7,7
2,ARDS,0,115,115,115,115,115
3,ARDS,1,3,3,3,3,3
4,Abdominal aortic aneurysm,0,231,231,231,231,231


In [35]:
# Treat the count stored under 'Unnamed: 0' as the number of sentences, then add up
# the per-chunk counts so every (condition, tag) pair appears exactly once.
dfstats = dfstats.rename(columns={'Unnamed: 0': 'sentences'})
dfstats = (
    dfstats
    .groupby(['conditionName','predictedTag'])['sentences']
    .sum()
    .reset_index()
    .sort_values(by='conditionName')
)
dfstats.info()
dfstats.head(10)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3290 entries, 0 to 3289
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   conditionName  3290 non-null   object
 1   predictedTag   3290 non-null   int64 
 2   sentences      3290 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 102.8+ KB


Unnamed: 0,conditionName,predictedTag,sentences
0,ACL injury,0,97
1,ACL injury,1,7
2,ARDS,0,115
3,ARDS,1,3
4,Abdominal aortic aneurysm,0,231
5,Abdominal aortic aneurysm,1,4
6,Abdominal aortic aneurysm screening,0,79
7,Abdominal aortic aneurysm screening,1,3
8,Abortion,0,64
9,Abscess,0,95


In [36]:
# Save the per-condition, per-tag sentence counts.
# The statistics get their own file name: '<date>_newDatasetBool.csv' is written
# further down with the sentences predicted as 1 and is read by the level-1 step,
# so writing the counts there would be overwritten (or feed the wrong data onward).
dfstats.to_csv('L0results/'+str(date.today())+'_sentencesPerTagCondition.csv',index=False)

In [37]:
# Total sentences per predicted tag:
# 0 = sentence has no CES, 1 = sentence describes a condition evolution (CES).
tag_totals = dfstats.groupby('predictedTag')['sentences'].sum()
dftotal = tag_totals.reset_index()
print(dftotal)

   predictedTag  sentences
0             0     208838
1             1       5174


In [38]:
# Number of tag rows recorded in dfstats for each condition.
dftbc = (
    dfstats
    .groupby('conditionName')['predictedTag']
    .count()
    .reset_index(name='tags')
)
print(dftbc)

                            conditionName  tags
0                              ACL injury     2
1                                    ARDS     2
2               Abdominal aortic aneurysm     2
3     Abdominal aortic aneurysm screening     2
4                                Abortion     1
...                                   ...   ...
1846            Yeast infection (vaginal)     1
1847                         Yellow fever     2
1848                                 Yips     1
1849                           Zika virus     2
1850           Zollinger-Ellison syndrome     2

[1851 rows x 2 columns]


In [39]:
# Number of dfstats rows (one per condition/tag pair) for each predicted tag,
# saved to disk and printed.
tag_counts = dfstats['predictedTag'].value_counts()
dfcpt = tag_counts.rename_axis('tagName').reset_index(name='numberOfConditions')

dfcpt.to_csv('L0results/'+str(date.today())+'_numberOfConditionsPerTag.csv',index=False)
print(dfcpt)

   tagName  numberOfConditions
0        0                1851
1        1                1439


### Prepare data for next level

In [49]:
# Collect, across all prediction chunks, only the sentences predicted as 1 (YES).
positive_chunks = []
for i in range(0,11,1):
    print(i)
    # Earlier run: pd.read_csv('L0results/2021-01-11_L0_predictedSet'+str(i)+'.csv')
    dfpre = pd.read_csv('L0results/'+str(date.today())+'_L0_predictedSet'+str(i)+'.csv')
    dfpre = dfpre.query('predictedTag==1')
    positive_chunks.append(dfpre)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every iteration; concatenate the filtered chunks once instead.
df = pd.concat(positive_chunks)
df.to_csv('L0results/'+str(date.today())+'_newDatasetBool.csv',index=False)

0
1
2
3
4
5
6
7
8
9
10


In [50]:
# Collect, across all prediction chunks, only the sentences predicted as 0 (NO).
negative_chunks = []
for i in range(0,11,1):
    print(i)
    # Earlier run: pd.read_csv('L0results/2021-01-16_L0_predictedSet'+str(i)+'.csv')
    dfpre = pd.read_csv('L0results/'+str(date.today())+'_L0_predictedSet'+str(i)+'.csv')
    dfpre = dfpre.query('predictedTag==0')
    negative_chunks.append(dfpre)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every iteration; concatenate the filtered chunks once instead.
df = pd.concat(negative_chunks)
df.to_csv('L0results/'+str(date.today())+'_NOdatasetBool.csv',index=False)

0
1
2
3
4
5
6
7
8
9
10


## Step 2: First level

In [54]:
# Load the pickled level-1 model.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load trusted model files.
filename = '../../02MLAlgorithms/01hierarchical/models/2021-06-06_Level1_LR.pkl'
# A context manager closes the file handle as soon as the model is unpickled.
with open(filename, 'rb') as model_file:
    svc_model = pickle.load(model_file)

### Using the ML algorithm to predict the tags from sentences

In [55]:
# Predict the level-1 tag for the sentences stored in today's '_newDatasetBool.csv'.
# Earlier run: pd.read_csv('2021-01-16_newDatasetBool.csv')
dfpredfl = pd.read_csv('L0results/'+str(date.today())+'_newDatasetBool.csv')
result = svc_model.predict(dfpredfl.sentence)

dfpredfl['newTagL1'] = result
dfpredfl.sort_values(by=['conditionName', 'predictedTag'],inplace=True,ascending=[True,True])

# reset_index() returns a new frame, so a bare call has no effect. Assign the result
# (dropping the old labels) so the sorted frame is renumbered from 0.
dfpredfl = dfpredfl.reset_index(drop=True)
dfpredfl.to_csv('L1results/'+str(date.today())+'_L1_predicted.csv',index=False)

In [56]:
# Inspect the level-1 predictions: column summary, then the first rows.
dfpredfl.info()
dfpredfl.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5174 entries, 0 to 5173
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      5174 non-null   int64 
 1   Unnamed: 0.1    5174 non-null   int64 
 2   Unnamed: 0.1.1  5174 non-null   int64 
 3   source          5174 non-null   object
 4   conditionName   5174 non-null   object
 5   sentence        5174 non-null   object
 6   clearSentence   5174 non-null   object
 7   predictedTag    5174 non-null   int64 
 8   newTagL1        5174 non-null   object
dtypes: int64(4), object(5)
memory usage: 404.2+ KB


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,source,conditionName,sentence,clearSentence,predictedTag,newTagL1
0,19,19,19,MAYO,ACL injury,ACL injuries often happen during sports and fi...,acl injuries often happen sports fitness activ...,1,IMPROVE
1,24,24,24,MAYO,ACL injury,"When the ligament is damaged, there is usually...",ligament damaged usually partial complete tear...,1,DECLINE
2,25,25,25,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,mild injury may stretch ligament leave intact,1,IMPROVE
3,75,75,82,MAYO,ACL injury,General rest is necessary for healing and limi...,general rest necessary healing limits weight b...,1,IMPROVE
4,96,96,103,MAYO,ACL injury,There's no set time frame for athletes to retu...,theres set time frame athletes return play,1,PERMANENT


In [57]:
# Number of rows carrying each predicted level-1 tag.
# NOTE(review): the 'numberOfConditions' column counts rows of dfpredfl, not distinct conditions.
level1_counts = dfpredfl['newTagL1'].value_counts()
dfcpt = level1_counts.rename_axis('newTagL1').reset_index(name='numberOfConditions')
print(dfcpt)

    newTagL1  numberOfConditions
0    IMPROVE                3689
1  PERMANENT                 868
2    DECLINE                 617


In [58]:
# Number of tagged rows per condition, most frequent condition first.
condition_counts = dfpredfl['conditionName'].value_counts()
dftbc = condition_counts.rename_axis('conditionName').reset_index(name='numberOfTags')
print(dftbc)

                     conditionName  numberOfTags
0              Atrial fibrillation            29
1               Multiple sclerosis            25
2     Coronary artery bypass graft            23
3                     Heart attack            20
4            Knee ligament surgery            20
...                            ...           ...
1434                        Scurvy             1
1435                  Eye injuries             1
1436                     Dry mouth             1
1437     Hyperglycemia in diabetes             1
1438           Swollen lymph nodes             1

[1439 rows x 2 columns]


In [59]:
# Rows whose level-1 tag is anything other than PERMANENT.
is_permanent = dfpredfl['newTagL1'] == 'PERMANENT'
newDataset = dfpredfl[~is_permanent]
newDataset.to_csv('L1results/'+str(date.today())+'_newDatasetL1.csv',index=False)

In [60]:
# Rows whose level-1 tag is PERMANENT, saved to their own file.
permanent_rows = dfpredfl['newTagL1'] == 'PERMANENT'
newDataset = dfpredfl.loc[permanent_rows]
newDataset.to_csv('L1results/'+str(date.today())+'_newDatasetPERMANENTL1.csv',index=False)

## Step 3: Second level

In [62]:
# Load the pickled level-2 model.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load trusted model files.
filename = '../../02MLAlgorithms/01hierarchical/models/2021-06-06_Level2_DecisionTree.pkl'
# A context manager closes the file handle as soon as the model is unpickled.
with open(filename, 'rb') as model_file:
    dt2_model = pickle.load(model_file)

### Using the ML algorithm to predict the tags from sentences

In [63]:
# Predict the level-2 tag for the sentences in the saved level-1 dataset.
# NOTE(review): the input date is hard-coded while the level-1 files are written with
# date.today(); same-day alternative:
# dfL2 = pd.read_csv('L1results/'+str(date.today())+'_newDatasetL1.csv')
dfL2 = pd.read_csv('L1results/2021-06-06_newDatasetL1.csv')
resultL2 = dt2_model.predict(dfL2.sentence)

dfL2['newTagL2'] = resultL2
dfL2.sort_values(by=['conditionName', 'predictedTag'],inplace=True,ascending=[True,True])

# reset_index() returns a new frame, so a bare call has no effect. Assign the result
# (dropping the old labels) so the sorted frame is renumbered from 0.
dfL2 = dfL2.reset_index(drop=True)
dfL2.to_csv('L2results/'+str(date.today())+'_L2_predicted.csv',index=False)

In [64]:
# Inspect the level-2 predictions: column summary, then the first rows.
dfL2.info()
dfL2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4306 entries, 0 to 4305
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      4306 non-null   int64 
 1   Unnamed: 0.1    4306 non-null   int64 
 2   Unnamed: 0.1.1  4306 non-null   int64 
 3   source          4306 non-null   object
 4   conditionName   4306 non-null   object
 5   sentence        4306 non-null   object
 6   clearSentence   4306 non-null   object
 7   predictedTag    4306 non-null   int64 
 8   newTagL1        4306 non-null   object
 9   newTagL2        4306 non-null   object
dtypes: int64(4), object(6)
memory usage: 370.0+ KB


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,source,conditionName,sentence,clearSentence,predictedTag,newTagL1,newTagL2
0,19,19,19,MAYO,ACL injury,ACL injuries often happen during sports and fi...,acl injuries often happen sports fitness activ...,1,IMPROVE,FAST
1,24,24,24,MAYO,ACL injury,"When the ligament is damaged, there is usually...",ligament damaged usually partial complete tear...,1,DECLINE,FAST
2,25,25,25,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,mild injury may stretch ligament leave intact,1,IMPROVE,FAST
3,75,75,82,MAYO,ACL injury,General rest is necessary for healing and limi...,general rest necessary healing limits weight b...,1,IMPROVE,FAST
4,97,97,104,MAYO,ACL injury,Recent research indicates that up to one-third...,recent research indicates onethird athletes su...,1,IMPROVE,SLOWLY


In [24]:
#dfL2.to_csv(str(date.today())+'_newDatasetL2.csv',index=False)
#dfL2.to_csv(str(date.today())+'_newDatasetL2.csv',index=False)

## Step 3b: Third level

In [65]:
# Load the pickled level-3 model.
# NOTE(review): pickle.load can execute arbitrary code stored in the file; only load trusted model files.
filename = '../../02MLAlgorithms/01hierarchical/models/2021-06-06_Level3_DecisionTree.pkl'
# A context manager closes the file handle as soon as the model is unpickled.
with open(filename, 'rb') as model_file:
    dt3_model = pickle.load(model_file)

### Using the ML algorithm to predict the tags from sentences

In [68]:
# Predict the level-3 tag for the sentences in the saved level-2 predictions.
# NOTE(review): the input date is hard-coded while the level-2 file is written with
# date.today(); same-day alternative:
# dfL3 = pd.read_csv('L2results/'+str(date.today())+'_L2_predicted.csv')
dfL3 = pd.read_csv('L2results/2021-06-06_L2_predicted.csv')
resultL3 = dt3_model.predict(dfL3.sentence)

dfL3['newTagL3'] = resultL3
dfL3.sort_values(by=['conditionName', 'predictedTag'],inplace=True,ascending=[True,True])

# reset_index() returns a new frame, so a bare call has no effect. Assign the result
# (dropping the old labels) so the sorted frame is renumbered from 0.
dfL3 = dfL3.reset_index(drop=True)
dfL3.to_csv('L3results/'+str(date.today())+'_L3_predicted.csv',index=False)

In [69]:
# Inspect the level-3 predictions: column summary, then the first rows.
dfL3.info()
dfL3.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4306 entries, 0 to 4305
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      4306 non-null   int64 
 1   Unnamed: 0.1    4306 non-null   int64 
 2   Unnamed: 0.1.1  4306 non-null   int64 
 3   source          4306 non-null   object
 4   conditionName   4306 non-null   object
 5   sentence        4306 non-null   object
 6   clearSentence   4306 non-null   object
 7   predictedTag    4306 non-null   int64 
 8   newTagL1        4306 non-null   object
 9   newTagL2        4306 non-null   object
 10  newTagL3        4306 non-null   object
dtypes: int64(4), object(7)
memory usage: 403.7+ KB


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,source,conditionName,sentence,clearSentence,predictedTag,newTagL1,newTagL2,newTagL3
0,19,19,19,MAYO,ACL injury,ACL injuries often happen during sports and fi...,acl injuries often happen sports fitness activ...,1,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR
1,24,24,24,MAYO,ACL injury,"When the ligament is damaged, there is usually...",ligament damaged usually partial complete tear...,1,DECLINE,FAST,FROM 1 YEAR TO MORE YEAR
2,25,25,25,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,mild injury may stretch ligament leave intact,1,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR
3,75,75,82,MAYO,ACL injury,General rest is necessary for healing and limi...,general rest necessary healing limits weight b...,1,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY
4,97,97,104,MAYO,ACL injury,Recent research indicates that up to one-third...,recent research indicates onethird athletes su...,1,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR


## Step 4: Compile results

In [70]:
# Reload the saved level-3 predictions from disk.
# Same-day alternative path: 'L3results/'+str(date.today())+'_L3_predicted.csv'
l3_predictions_path = 'L3results/2021-06-06_L3_predicted.csv'
dfL3 = pd.read_csv(l3_predictions_path)

In [71]:
# Build the combined tag "<L1> <L2> <L3>". A missing level-2/level-3 tag contributes
# an empty string instead of turning the whole concatenation into NaN.
level2_part = (' ' + dfL3['newTagL2']).fillna('')
level3_part = (' ' + dfL3['newTagL3']).fillna('')
dfL3['reviewedTag'] = (dfL3['newTagL1'] + level2_part + level3_part).str.strip()

# Drop the leftover index columns and the fields not wanted in the compiled result.
unwanted_columns = ['Unnamed: 0', 'Unnamed: 0.1','Unnamed: 0.1.1','clearSentence','predictedTag']
dfL3 = dfL3.drop(columns=unwanted_columns)
dfL3.head()

Unnamed: 0,source,conditionName,sentence,newTagL1,newTagL2,newTagL3,reviewedTag
0,MAYO,ACL injury,ACL injuries often happen during sports and fi...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
1,MAYO,ACL injury,"When the ligament is damaged, there is usually...",DECLINE,FAST,FROM 1 YEAR TO MORE YEAR,DECLINE FAST FROM 1 YEAR TO MORE YEAR
2,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
3,MAYO,ACL injury,General rest is necessary for healing and limi...,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY,IMPROVE FAST FROM 5 MINUTE TO 1 DAY
4,MAYO,ACL injury,Recent research indicates that up to one-third...,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR


In [72]:
# col = ['conditionName','sentence','newTagL1','newTagL2','newTagL3','reviewedTag']
# dfL3 = dfL3[col]

# dfL3.columns
# dfL3.head()

In [73]:
# Read the sentences saved as PERMANENT at level 1; their final (reviewed) tag is
# simply the level-1 tag.
# Other inputs used before:
#   'L1results/2021-06-01_L1_predicted.csv'
#   'L1results/'+str(date.today())+'_newDatasetPERMANENTL1.csv'
dfper = pd.read_csv('L1results/2021-06-06_newDatasetPERMANENTL1.csv')
dfper = dfper.assign(reviewedTag=dfper['newTagL1'])

In [74]:
# Keep only the listed columns, in this order, then preview the frame.
col = ['source','conditionName','sentence','newTagL1','reviewedTag']
dfper = dfper.loc[:, col]

dfper.head()

Unnamed: 0,source,conditionName,sentence,newTagL1,reviewedTag
0,MAYO,ACL injury,There's no set time frame for athletes to retu...,PERMANENT,PERMANENT
1,NHS,Abdominal aortic aneurysm screening,There's no risk from the screening test itself...,PERMANENT,PERMANENT
2,MAYO,Absence seizure,"Then, there is a quick return to a normal leve...",PERMANENT,PERMANENT
3,MAYO,Achalasia,There's no cure for achalasia.,PERMANENT,PERMANENT
4,NHS,Achalasia,Treatments for achalasia,PERMANENT,PERMANENT


In [75]:
dfL3.info()
dfper.info()
# Merge the level-3 predictions with the level-1 PERMANENT predictions; together
# they represent the whole dataset. dfper has no newTagL2/newTagL3 columns, so
# those are NaN for its rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
result = pd.concat([dfL3, dfper], ignore_index=True, sort=False)
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4306 entries, 0 to 4305
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   source         4306 non-null   object
 1   conditionName  4306 non-null   object
 2   sentence       4306 non-null   object
 3   newTagL1       4306 non-null   object
 4   newTagL2       4306 non-null   object
 5   newTagL3       4306 non-null   object
 6   reviewedTag    4306 non-null   object
dtypes: object(7)
memory usage: 235.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 868 entries, 0 to 867
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   source         868 non-null    object
 1   conditionName  868 non-null    object
 2   sentence       868 non-null    object
 3   newTagL1       868 non-null    object
 4   reviewedTag    868 non-null    object
dtypes: object(5)
memory usage: 34.0+ KB
<class 'pandas.core

In [76]:
# Preview the first rows of the merged frame.
result.head()

Unnamed: 0,source,conditionName,sentence,newTagL1,newTagL2,newTagL3,reviewedTag
0,MAYO,ACL injury,ACL injuries often happen during sports and fi...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
1,MAYO,ACL injury,"When the ligament is damaged, there is usually...",DECLINE,FAST,FROM 1 YEAR TO MORE YEAR,DECLINE FAST FROM 1 YEAR TO MORE YEAR
2,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
3,MAYO,ACL injury,General rest is necessary for healing and limi...,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY,IMPROVE FAST FROM 5 MINUTE TO 1 DAY
4,MAYO,ACL injury,Recent research indicates that up to one-third...,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR


In [77]:
# Remove rows that are exact duplicates, keeping the first occurrence of each.
result = result.drop_duplicates(keep="first")
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5174 entries, 0 to 5173
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   source         5174 non-null   object
 1   conditionName  5174 non-null   object
 2   sentence       5174 non-null   object
 3   newTagL1       5174 non-null   object
 4   newTagL2       4306 non-null   object
 5   newTagL3       4306 non-null   object
 6   reviewedTag    5174 non-null   object
dtypes: object(7)
memory usage: 323.4+ KB


In [78]:
# Order rows by condition, then by level-1, level-2 and level-3 tag (all ascending).
tag_sort_keys = ['conditionName','newTagL1','newTagL2','newTagL3']
result = result.sort_values(by=tag_sort_keys)

In [79]:
# Write the merged, de-duplicated and sorted predictions to a dated CSV (no index column).
result.to_csv('2021-06-06_resultsPredicted.csv', index=False)
# result.to_csv(str(date.today())+'_resultsPredicted.csv', index=False)

In [80]:
# Reload the saved predictions and count how many different conditions they contain.
# Earlier run: pd.read_csv('2021-01-22_resultsPredicted.csv')
result = pd.read_csv('2021-06-06_resultsPredicted.csv')

# One row per condition (the first row encountered for each).
temp = result.drop_duplicates(subset='conditionName')
temp.shape[0]

1439

In [81]:
# Order the rows alphabetically by condition and show the first 30.
result = result.sort_values(by='conditionName')
result.head(30)

Unnamed: 0,source,conditionName,sentence,newTagL1,newTagL2,newTagL3,reviewedTag
0,MAYO,ACL injury,"When the ligament is damaged, there is usually...",DECLINE,FAST,FROM 1 YEAR TO MORE YEAR,DECLINE FAST FROM 1 YEAR TO MORE YEAR
1,MAYO,ACL injury,ACL injuries often happen during sports and fi...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
2,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
3,MAYO,ACL injury,General rest is necessary for healing and limi...,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY,IMPROVE FAST FROM 5 MINUTE TO 1 DAY
4,MAYO,ACL injury,Recent research indicates that up to one-third...,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR
5,MAYO,ACL injury,"In general, it takes as long as a year or more...",IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR
6,MAYO,ACL injury,There's no set time frame for athletes to retu...,PERMANENT,,,PERMANENT
8,MAYO,ARDS,This painless test tracks the electrical activ...,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY,IMPROVE FAST FROM 5 MINUTE TO 1 DAY
9,MAYO,ARDS,Many people with ARDS recover most of their lu...,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR
7,MAYO,ARDS,Severe shortness of breath — the main symptom ...,IMPROVE,FAST,FROM 1 DAY TO 1 WEEK,IMPROVE FAST FROM 1 DAY TO 1 WEEK


In [82]:
# Number of rows carrying each combined (reviewed) tag, most frequent first.
result['reviewedTag'].value_counts()

IMPROVE FAST FROM 5 MINUTE TO 1 DAY            965
PERMANENT                                      868
IMPROVE FAST FROM 1 YEAR TO MORE YEAR          684
IMPROVE MODERATELY FROM 8 DAY TO 2 MONTH       618
IMPROVE MODERATELY FROM 2 MONTH TO 6 MONTH     389
IMPROVE FAST FROM 1 DAY TO 1 WEEK              276
DECLINE SLOWLY FROM 1 YEAR TO MORE YEAR        271
IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR        266
DECLINE FAST FROM 1 YEAR TO MORE YEAR          206
IMPROVE SLOWLY FROM 6 MONTH TO 1 YEAR          152
IMPROVE SLOWLY FROM 2 MONTH TO 6 MONTH         121
IMPROVE MODERATELY FROM 6 MONTH TO 1 YEAR       46
IMPROVE SLOWLY FROM 5 MINUTE TO 1 DAY           46
IMPROVE FAST FROM 6 MONTH TO 1 YEAR             33
DECLINE MODERATELY FROM 8 DAY TO 2 MONTH        31
DECLINE FAST FROM 5 MINUTE TO 1 DAY             29
IMPROVE MODERATELY FROM 1 DAY TO 1 WEEK         23
IMPROVE FAST FROM 8 DAY TO 2 MONTH              20
DECLINE FAST FROM 1 DAY TO 1 WEEK               18
IMPROVE MODERATELY FROM 1 YEAR 

In [83]:
# Group the rows by combined tag (NaN tags kept as their own group); head() on the
# groupby object shows the first five rows of every group.
groupclass = result.groupby(['reviewedTag'],dropna=False)
groupclass.head()

Unnamed: 0,source,conditionName,sentence,newTagL1,newTagL2,newTagL3,reviewedTag
0,MAYO,ACL injury,"When the ligament is damaged, there is usually...",DECLINE,FAST,FROM 1 YEAR TO MORE YEAR,DECLINE FAST FROM 1 YEAR TO MORE YEAR
1,MAYO,ACL injury,ACL injuries often happen during sports and fi...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
2,MAYO,ACL injury,A mild injury may stretch the ligament but lea...,IMPROVE,FAST,FROM 1 YEAR TO MORE YEAR,IMPROVE FAST FROM 1 YEAR TO MORE YEAR
3,MAYO,ACL injury,General rest is necessary for healing and limi...,IMPROVE,FAST,FROM 5 MINUTE TO 1 DAY,IMPROVE FAST FROM 5 MINUTE TO 1 DAY
4,MAYO,ACL injury,Recent research indicates that up to one-third...,IMPROVE,SLOWLY,FROM 1 YEAR TO MORE YEAR,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR
...,...,...,...,...,...,...,...
3491,NHS,Osteoporosis,"In osteonecrosis, the cells in the jaw bone di...",DECLINE,FAST,FROM 6 MONTH TO 1 YEAR,DECLINE FAST FROM 6 MONTH TO 1 YEAR
3749,NHS,Pneumonia,The symptoms of pneumonia can develop suddenly...,DECLINE,SLOWLY,FROM 1 DAY TO 1 WEEK,DECLINE SLOWLY FROM 1 DAY TO 1 WEEK
4045,NHS,Q fever,"Q fever is usually harmless, but in rare cases...",DECLINE,FAST,FROM 6 MONTH TO 1 YEAR,DECLINE FAST FROM 6 MONTH TO 1 YEAR
4670,NHS,Tetanus,"If it's not treated, the symptoms can get wors...",DECLINE,SLOWLY,FROM 1 DAY TO 1 WEEK,DECLINE SLOWLY FROM 1 DAY TO 1 WEEK


In [84]:
# Number of rows in `result` for each condition, most frequent first.
result['conditionName'].value_counts()

Atrial fibrillation             29
Multiple sclerosis              25
Coronary artery bypass graft    23
Heart attack                    20
Knee ligament surgery           20
                                ..
Scurvy                           1
Eye injuries                     1
Dry mouth                        1
Hyperglycemia in diabetes        1
Swollen lymph nodes              1
Name: conditionName, Length: 1439, dtype: int64

In [85]:
# Non-null counts of every remaining column for each (condition, combined tag) pair.
group = result.groupby(['conditionName','reviewedTag'],dropna=False).count()
group.info()
group.head()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3635 entries, ('ACL injury', 'DECLINE FAST FROM 1 YEAR TO MORE YEAR') to ('Zollinger-Ellison syndrome', 'DECLINE FAST FROM 1 YEAR TO MORE YEAR')
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   source    3635 non-null   int64
 1   sentence  3635 non-null   int64
 2   newTagL1  3635 non-null   int64
 3   newTagL2  3635 non-null   int64
 4   newTagL3  3635 non-null   int64
dtypes: int64(5)
memory usage: 164.3+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,source,sentence,newTagL1,newTagL2,newTagL3
conditionName,reviewedTag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACL injury,DECLINE FAST FROM 1 YEAR TO MORE YEAR,1,1,1,1,1
ACL injury,IMPROVE FAST FROM 1 YEAR TO MORE YEAR,2,2,2,2,2
ACL injury,IMPROVE FAST FROM 5 MINUTE TO 1 DAY,1,1,1,1,1
ACL injury,IMPROVE SLOWLY FROM 1 YEAR TO MORE YEAR,2,2,2,2,2
ACL injury,PERMANENT,1,1,1,0,0


In [86]:
# Save the grouped counts under today's date; the index is written too, so the
# (conditionName, reviewedTag) group keys end up as columns in the file.
group.to_csv(str(date.today())+'_GroupedPredicted.csv')
# group.to_csv('2021-06-06_GroupedPredicted.csv')

In [87]:
# Load the grouped counts and order them by the 'source' column, then by condition.
# NOTE(review): presumably 'source' holds a per-group count here rather than a source
# name, since the file comes from a groupby().count() — confirm against the file read.
result = (
    pd.read_csv('2021-06-06_GroupedPredicted.csv')
    .sort_values(by=['source', 'conditionName'])
)
result.info()
result.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3635 entries, 0 to 278
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   conditionName  3635 non-null   object
 1   reviewedTag    3635 non-null   object
 2   source         3635 non-null   int64 
 3   sentence       3635 non-null   int64 
 4   newTagL1       3635 non-null   int64 
 5   newTagL2       3635 non-null   int64 
 6   newTagL3       3635 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 227.2+ KB


Unnamed: 0,conditionName,reviewedTag,source,sentence,newTagL1,newTagL2,newTagL3
1925,Knee ligament surgery,IMPROVE MODERATELY FROM 8 DAY TO 2 MONTH,9,9,9,9,9
2270,Multiple sclerosis,PERMANENT,9,9,9,0,0
274,Atrial fibrillation,IMPROVE FAST FROM 5 MINUTE TO 1 DAY,10,10,10,10,10
1575,Heart failure,PERMANENT,11,11,11,0,0
278,Atrial fibrillation,PERMANENT,13,13,13,0,0
