In [78]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
###### helper functions. Use them when needed #######
def get_title_from_index(index):
    """Look up the ``Medicine`` value of the row whose DataFrame index label is ``index``.

    Reads the module-level ``df``. When several rows share the label, the
    first one is returned; when none does, the ``[0]`` lookup raises
    IndexError.
    """
    row_mask = df.index == index
    medicines = df.loc[row_mask, "Medicine"]
    return medicines.values[0]

def get_index_from_title(Medicine):
    """Return the ``index`` column value of the first row whose medicine is ``Medicine``.

    Reads the module-level ``df``. The comparison is an exact (case-sensitive)
    equality test against the ``Medicine`` column.

    Args:
        Medicine: medicine name to search for.

    Returns:
        The value stored in the ``index`` column of the first matching row.

    Raises:
        IndexError: if no row has this medicine. This is the same exception
            type the unguarded ``.values[0]`` lookup raises, but the message
            names the medicine that was not found.
    """
    matches = df.loc[df["Medicine"] == Medicine, "index"]
    if matches.empty:
        raise IndexError("no row in df has Medicine == {!r}".format(Medicine))
    return matches.values[0]
##################################################

## Step 1: Load the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="pet.csv")

# Show which column labels the file provides
print(df.columns)

Index(['index', 'Name', 'DateTime', 'MonthYear', 'Date of Birth',
       'Outcome Type', 'Outcome Subtype', 'animal_Type', 'Sex_upon_Outcome',
       'Age_upon_Outcome', 'Breed', 'Color', 'Medicine'],
      dtype='object')


In [79]:
## Step 2: Choose the columns that serve as features
features = [
    'animal_Type',
    'Sex_upon_Outcome',
    'Age_upon_Outcome',
    'Breed',
    'Medicine',
]

In [80]:
## Step 3: Build a DataFrame column that merges all selected features
# First replace missing values in every selected column with an empty string.
df[features] = df[features].fillna('')


def combine_features(row):
    """Join the feature fields of one row into a single space-separated string.

    The fields are taken in the order ``animal_Type``, ``Sex_upon_Outcome``,
    ``Age_upon_Outcome``, ``Breed``, ``Medicine``. Each value is converted
    with ``str`` so a non-string cell (for example a number) no longer aborts
    the concatenation; a missing value (``None``/NaN) contributes an empty
    string, the same value the preceding ``fillna('')`` step writes.

    Args:
        row: mapping-like object indexable by column name (a DataFrame row
            when called through ``df.apply(..., axis=1)``).

    Returns:
        str: the five fields joined by single spaces.

    Raises:
        KeyError: if ``row`` lacks one of the columns. Failing here replaces
            the old behaviour of printing the row and returning ``None``,
            which left ``None`` in the combined column.
    """
    columns = ('animal_Type', 'Sex_upon_Outcome', 'Age_upon_Outcome', 'Breed', 'Medicine')
    parts = []
    for column in columns:
        value = row[column]
        # Treat missing cells as empty text instead of the literal "nan"/"None".
        parts.append("" if pd.isna(value) else str(value))
    return " ".join(parts)

# Call combine_features on each row (axis=1) and store the result as a new column
df["combine_features"] = df.apply(combine_features, axis=1)

# Print the first rows of the new column as a sanity check
preview = df["combine_features"].head()
print("Combined features:", preview)

Combined features: 0         Dog Neutered Male 1 year Pit Bull Mix Oxygen
1    Cat Neutered Male 5 months Domestic Shorthair ...
2    Cat Spayed Female 10 months Domestic Shorthair...
3    Dog Neutered Male 5 years Border Terrier Mix y...
4         Other Unknown 2 years Raccoon Mix tolazoline
Name: combine_features, dtype: object


In [81]:
## Step 4: Build the count matrix from the combined column
cv = CountVectorizer()
combined_text = df["combine_features"]
count_matrix = cv.fit_transform(combined_text)

In [88]:
## Step 5: Compute the cosine similarity from count_matrix
# Medicine name used as the query in the following steps
most_used_medicine = "Oxygen"

# Similarity of every row of count_matrix against every other row
cosine_sim = cosine_similarity(count_matrix)


In [89]:
## Step 6: Find the index of a pet from its medicine
pet_index = int(get_index_from_title(most_used_medicine))

# Pair each position in that pet's similarity row with its score
similar_med = [
    (position, score) for position, score in enumerate(cosine_sim[pet_index])
]



In [90]:
## Step 7: Order the (position, score) pairs from highest to lowest score
sorted_similar_meds = list(similar_med)
sorted_similar_meds.sort(key=lambda pair: pair[1], reverse=True)

In [92]:
## Step 8: Print titles of the first 50 medicines
# The earlier counter loop tested `i > 50` after incrementing, so it printed
# 51 titles; slicing prints exactly TOP_N and also copes with shorter lists.
TOP_N = 50
for row_position, _score in sorted_similar_meds[:TOP_N]:
    print(get_title_from_index(row_position))

Oxygen
Lidocaine
Ampicillin
Heparin
inflammatory
Oxygen
mannitol
anaesthetic
Dextrose
Oxygen
inflammatory
alfaxalone
Adrenaline
hydroxyzine
Oxygen
inflammatory
Heparin
isoflurane
Amoxicillin
Ampicillin
Cefazolin
sulfonamide
Oxygen
Clindamycin
inflammatory
mannitol
Adrenaline
Enrofloxacin
mannitol
Cefazolin
sulfonamide
hydroxyzine
isoflurane
Vitamin K14
Potassium chloride
Potassium chloride
Oxygen
Opioids
Oxygen
hydroxyzine
isoflurane
Adrenaline
xylazine
Anticholinergic
Opioids
Crystalloid
yohimbine
Florfenicol
anaesthetic
Amoxicillin
sulfonamide
