# COSINE SIMILARITY

In [3]:
#import libraries

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [4]:
#load data

UK_UNI = pd.read_csv('Populated UK UNI.csv')

In [5]:
UK_UNI

Unnamed: 0,University_name,Course_Name,Region,Founded_year,UK_rank,World_rank,Minimum_IELTS_score,PG_average_fees_(in_pounds),International_students,Student_satisfaction,Student_enrollment,Academic_staff,Control_type,Academic_Calender,Campus_setting,Estimated_cost_of_living_per_year_(in_pounds)
0,University of Cambridge,Business & Management Studies,East of England,1209,1,4,6.5,34920,20.20%,85.50%,"20,000-24,999","over-5,000",Public,Trimesters,Urban,12000
1,University of Cambridge,Engineering and Technology,East of England,1209,1,4,6.5,35494,20.20%,85.50%,"20,000-24,999","over-5,000",Public,Trimesters,Urban,12000
2,University of Cambridge,Computing,East of England,1209,1,4,6.5,27000,20.20%,85.50%,"20,000-24,999","over-5,000",Public,Trimesters,Urban,12000
3,University of Cambridge,Social Sciences,East of England,1209,1,4,6.5,35525,20.20%,85.50%,"20,000-24,999","over-5,000",Public,Trimesters,Urban,12000
4,University of Oxford,Business & Management Studies,South East England,1096,2,2,6.5,43600,16.80%,86.50%,"25,000-29,999","over-5,000",Public,Trimesters,Urban,11500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,Wrexham Glyndwr University,Social Sciences,Wales,2008,130,2397,4.5,12000,23.00%,74.30%,"5,000-5,999",200-299,Public,Trimesters,Urban,7771
520,Ravensbourne University London,Business & Management Studies,London,1962,131,2759,5.5,0,15.59%,66.10%,"2,000-2,999",100-199,Public,Semesters,Urban,10229
521,Ravensbourne University London,Engineering and Technology,London,1962,131,2759,5.5,17000,15.59%,66.10%,"2,000-2,999",100-199,Public,Semesters,Urban,10229
522,Ravensbourne University London,Computing,London,1962,131,2759,5.5,0,15.59%,66.10%,"2,000-2,999",100-199,Public,Semesters,Urban,10229


In [15]:
#select features to use for similarity

features = ['University_name', 'Course_Name', 'Region', 'PG_average_fees_(in_pounds)']

In [22]:
# Normalize fees
scaler = MinMaxScaler()
UK_UNI['PG_average_fees_(in_pounds)'] = scaler.fit_transform(UK_UNI['PG_average_fees_(in_pounds)'].values.reshape(-1,1)) 

# Convert normalized fees back to string
UK_UNI['PG_average_fees_(in_pounds)'] = UK_UNI['PG_average_fees_(in_pounds)'].astype(str)


In [None]:
# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

In [23]:
#Generate vectors

School_vectors = vectorizer.fit_transform(UK_UNI[features].apply(lambda x: ''.join(x), axis=1))

In [24]:
#Calculate cosine similarity

similarity = cosine_similarity(School_vectors)

In [28]:
#define a function to get recommendations

def get_recommendations(name, topN):
 
 #Get index of school
    idx = UK_UNI[UK_UNI['University_name'] == name].index[0]  
    
 #Initialize constraints
    same_uni_count = 0
    uni_list = []
    
  #sort by similarity
    scores = enumerate(similarity[idx]) 
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse = True)  
   
  # Create empty list for top matches
    top_matches = []
    
  # Iterate through scores

    for i, score in sorted_scores:
    # Apply diversity constraints
        if UK_UNI.loc[i, 'University_name'] not in uni_list:
            top_matches.append((i,score))
            uni_list.append(UK_UNI.loc[i, 'University_name'])
            
    # Break loop once we have top N     
            
        if len(top_matches) >= topN:
            break
        
        
        
    #top_schools = sorted_scores[1:topN+1]   #Get topN schools
    
   #return top N recommendations
    return UK_UNI.iloc[ [i[0] for i in top_matches] ]  
    
#Get 3 recommendations for Sheffield Hallam University
print(get_recommendations('Sheffield Hallam University',))
    

                 University_name                    Course_Name  \
260  Sheffield Hallam University  Business & Management Studies   
488     Leeds Beckett University  Business & Management Studies   
416       University of Bradford  Business & Management Studies   
320      York St John University  Business & Management Studies   
240   University of Huddersfield  Business & Management Studies   

                       Region  Founded_year  UK_rank  World_rank  \
260  Yorkshire and the Humber          1992       65         576   
488  Yorkshire and the Humber          1992      123         657   
416  Yorkshire and the Humber          1966      105        1392   
320  Yorkshire and the Humber          1841       81        2643   
240  Yorkshire and the Humber          1992       61        1111   

     Minimum_IELTS_score PG_average_fees_(in_pounds) International_students  \
260                  4.5         0.09345794392523366                  5.90%   
488                  4.5      