# 3.3.1. TF-IDF

In [11]:
# load data 
import pandas as pd
df = pd.read_csv('clinic.csv')
df.head()

Unnamed: 0,Sent1,Sent2,Score
0,Insulin NPH Human [NOVOLIN N] 100 unit / mL su...,Insulin NPH Human [NOVOLIN N] 100 unit / mL su...,3.5
1,"Patient arrives ambulatory, Gait steady, Hist...","Complex assessment performed, Patient arrives...",2.5
2,"Peripheral IV site, established in the right f...","Peripheral IV site, present prior to arrival, ...",3.45
3,No : new confusion or inability to stay aler...,No : new confusion or inability to stay aler...,4.0
4,Spent 15 minutes with the patient and greater ...,"Nurse visit ten minutes, over half of which w...",3.0


In [12]:
# check the size of the data 
sent1 = df['Sent1'].tolist()
sent2 = df['Sent2'].tolist()
lables = df['Score'].tolist()


In [13]:
#retrieve stored values
%store -r cleaned1
%store -r cleaned2
%store -r dtree1
%store -r dtree2

# data split.
train_text1 = dtree1[:750]
train_text2 = dtree2[:750]
train_labels =lables[:750]

test_text1 = dtree1[750:]
test_text2 = dtree2[750:]
test_labels =lables[750:]


In [14]:
train_text1[0]

'insulin nph human novolin unit suspension subcutaneous directed prescriber'

In [15]:
# training: tf-idf + Random Forest

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer1 = TfidfVectorizer()

sent1_vec = vectorizer1.fit_transform(dtree1)
sent2_vec = vectorizer1.fit_transform(dtree2)


train_vecs1 = vectorizer1.transform(train_text1)
train_vecs2 = vectorizer1.transform(train_text2)

test_vecs1 = TfidfVectorizer(vocabulary=vectorizer1.vocabulary_).fit_transform(test_text1)
test_vecs2 = TfidfVectorizer(vocabulary=vectorizer1.vocabulary_).fit_transform(test_text2)
feat = vectorizer1.get_feature_names()
print(feat)
%store feat


['abdominal', 'ability', 'able', 'abnormal', 'abrupt', 'abstinent', 'abuse', 'acceptance', 'accompanied', 'according', 'ace', 'acellular', 'acetonide', 'ache', 'acog', 'acquired', 'act', 'active', 'actively', 'activity', 'actuation', 'acupuncture', 'addicition', 'addiction', 'addition', 'additional', 'address', 'adequate', 'adhd', 'adhesive', 'adjunct', 'administer', 'administered', 'administration', 'admission', 'adsorbed', 'adult', 'advance', 'advanced', 'advancement', 'advised', 'aerolizer', 'aerosol', 'affected', 'african', 'age', 'agitation', 'ago', 'agree', 'agreed', 'agreement', 'agrees', 'ahead', 'aid', 'aircast', 'airway', 'albuterol', 'alcohol', 'alert', 'alfa', 'allergic', 'allergy', 'altering', 'alternative', 'ambulate', 'ambulating', 'ambulatory', 'american', 'amount', 'analysis', 'androgel', 'anemia', 'anesthesia', 'aneurysm', 'anger', 'ankle', 'anoxic', 'answer', 'answered', 'antecubital', 'anterior', 'anthrax', 'antibiotic', 'anticoagulation', 'antigen', 'antiplatelet',

In [16]:
# train model
from sklearn.ensemble import RandomForestRegressor
reg1 = RandomForestRegressor(max_depth=6).fit(train_vecs1, train_labels)
reg2 = RandomForestRegressor(max_depth=6).fit(train_vecs2, train_labels)
reg1

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=6, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [17]:
# test model

from sklearn.metrics import mean_squared_error
test_pred1 = reg1.predict(test_vecs1)
test_pred2 = reg2.predict(test_vecs2)

a_mse1 = mean_squared_error(test_labels, test_pred1)
a_mse2 = mean_squared_error(test_labels, test_pred2)

print('MSE for Sentence  1: ', a_mse1)

print("--------------------------------------------")

print('MSE for Sentence  2: ', a_mse2)


MSE for Sentence  1:  1.1771395289616289
--------------------------------------------
MSE for Sentence  2:  1.1620100266864666


#### Similarity Measures - Cosine

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

a_cc=[]
for i in range(0,750):
    c= cosine_similarity(train_vecs1[i].reshape(1, -1), train_vecs2[i].reshape(1, -1))[0][0]
    a_cc.append(c)
print(a_cc)
len(a_cc)

a_cc1=[]
for i in range(0,318):
    c1= cosine_similarity(test_vecs1[i].reshape(1, -1), test_vecs2[i].reshape(1, -1))[0][0]
    a_cc1.append(c1)
print(a_cc1)
len(a_cc1)

[0.9672555552521551, 0.5769009002492103, 0.6167820213944601, 0.8007495847105559, 0.42952220136666713, 0.27317960965515303, 0.47275742478210925, 0.5743667326801359, 0.0, 0.3952299614713378, 0.8324866360967071, 0.021191442396272235, 0.39405312322474423, 0.6982533219471078, 0.6832819738784746, 0.8547363500068171, 0.45638352457484566, 0.8955286170671726, 0.8350557755423611, 0.7375021721377085, 0.879228294508573, 0.8374276311820041, 0.7270055417036316, 0.6467201610585418, 0.9135973035145287, 0.7154891716315852, 0.09289784311358679, 0.7902365498340317, 0.9286673183556805, 0.9458813553272893, 0.9533106598192781, 1.0000000000000002, 0.5309715943365698, 0.5704611795196461, 0.16301876030693688, 0.7195566218004503, 0.0, 0.8123188320310115, 0.49152314595602076, 0.4704617101604808, 0.8103841838967383, 0.0, 0.5840097574450649, 0.3105789803979271, 0.008850523339721271, 0.4092827281441523, 0.39645531912871756, 0.6416523151843416, 0.6101942848539215, 0.8921458868645213, 0.4457398473967184, 0.0053775617

[0.41955002807978437, 0.42864695991955504, 0.6593588903909994, 0.5757684957110678, 0.4469676306596063, 0.9443478272356218, 0.6059305621954767, 0.6270716381262907, 0.5336417302822222, 0.9774703219599673, 0.016654212887699915, 0.30283655132917053, 0.903700271873311, 0.8117255708783296, 0.08122960001332108, 0.8660491305012777, 0.845936626885618, 0.719545871841021, 0.5401374898393766, 0.22668790099884462, 0.5689229792463723, 0.19427417438499445, 0.8364249907104412, 0.5238918802976268, 0.47902339901719426, 0.49354257266557094, 0.6016746716219257, 0.5143992075453849, 0.2612487983273119, 0.5372498379469135, 0.933185577551074, 0.8196442395431953, 0.9045669421549887, 0.7732551281231378, 0.6614122712432612, 0.8544309822172619, 0.1306307537987535, 0.9335902300556921, 0.7873103296232846, 0.827084036249666, 0.6752339514122113, 0.7851252251801072, 0.46483268809451045, 0.8722268577590564, 0.961137240258262, 0.8194606127351742, 0.07865467405608227, 0.25622595213214877, 0.958731964245483, 0.75059803195

318

##### Pearson Coefficient

In [22]:
# pearson Correlation 

import numpy as np
from scipy.stats import pearsonr
a_pc, _ = pearsonr(a_cc, train_labels)
print('Pearsons correlation for Train: %.5f' % a_pc)

Pearsons correlation for Train: 0.72501


In [23]:
import numpy as np
from scipy.stats import pearsonr
a_pc_t, _ = pearsonr(a_cc1, test_labels)
print('Pearsons correlation for Test: %.5f' % a_pc_t)

Pearsons correlation for Test: 0.70963


In [24]:
#storing variables
%store a_mse1
%store a_mse2
%store a_cc
%store a_cc1
%store a_pc
%store a_pc_t

Stored 'a_mse1' (float64)
Stored 'a_mse2' (float64)
Stored 'a_cc' (list)
Stored 'a_cc1' (list)
Stored 'a_pc' (float64)
Stored 'a_pc_t' (float64)
