In [1]:
import pandas as pd
import nltk
import string

# Download the NLTK resources used by the preprocessing functions below.
# Each one is cached locally, so later runs only verify that it is present.
#   stopwords - the English stop-word list
#   punkt     - tokenizer models read by nltk.word_tokenize on older NLTK releases
#   punkt_tab - replacement for punkt that nltk.word_tokenize requires on newer
#               NLTK releases; without it tokenizing raises LookupError there
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [105]:
# Load the tab-separated course table and preview the first two rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column holding 0, 1, ... -
# presumably a row index that was written to the file; confirm before relying on it.
df = pd.read_csv('course_info.tsv', sep='\t')
df.head(2)

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"UK: £18,000 (Total)International: £34,750 (Total)",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...


In [86]:
# Show the raw 'fees' text of the second-to-last row as an example of the free-text format.
df.iloc[-2]['fees']

'Band 2UK fees:Please see the university website for further information on fees for this course.International fees:Please see the university website for further information on fees for this course.'

## 2.0 Preprocessing

In [104]:
def preprocess_text(description):
    """Normalise a course description into a space-separated string of stems.

    Steps, in order: lowercase, delete every character listed in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, drop English
    stop words, Porter-stem what is left, and join the stems with single spaces.

    Punctuation is deleted rather than replaced by a space, so the two halves
    of a hyphenated word end up fused into one token. Only the ASCII characters
    in ``string.punctuation`` are removed.

    Args:
        description: The raw description text (must provide ``str.lower``).

    Returns:
        The processed description as a single string.
    """
    # One translate pass deletes all punctuation characters at once.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = description.lower().translate(punctuation_remover)

    words = nltk.word_tokenize(cleaned)

    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.PorterStemmer()

    stems = []
    for word in words:
        if word in english_stop_words:
            continue
        stems.append(porter.stem(word))

    return ' '.join(stems)

# Replace each raw description with its preprocessed (stemmed, stop-word-free) string.
# NOTE(review): this overwrites the column in place, so running the cell a second
# time preprocesses text that has already been preprocessed.
df['description'] = df.description.apply(preprocess_text)
df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3d visualis anim play role mani area popular m...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,busi govern reli sound financi knowledg underp...,September,"UK: £18,000 (Total)International: £34,750 (Total)",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,account account financi manag msc cours provid...,September,Please see the university website for further ...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
3,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,embark profession account career academ ground...,September,Please see the university website for further ...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,join us onlin session prospect student find ms...,September,Please see the university website for further ...,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...


### 2.0.1 Preprocessing the fees column

In [90]:
import re
from forex_python.converter import CurrencyRates

# Re-read the table from disk so this cell works on the columns as stored in the file.
# NOTE(review): this discards any changes made to df by earlier cells - confirm that is intended.
df = pd.read_csv('course_info.tsv', sep='\t')
def preprocess_fees(fees, target_currency='USD', source_currency='USD'):
    """Extract the highest monetary amount from a free-text fees field.

    Numbers are pulled out of the text with a regex. Commas are treated as
    thousands separators, and so is a dot followed by exactly three digits
    ("18.000"). Any other dot is kept as a decimal point, so "9,250.50" is
    read as 9250.5 rather than 925050.

    Amounts equal to 2023 or not greater than 100 are ignored. This presumably
    filters out the academic year and small non-fee numbers such as band
    numbers or durations - TODO confirm these thresholds.

    Args:
        fees: Raw fees text. Anything that is not a ``str`` (for example NaN
            for a missing value) yields ``None``.
        target_currency: ISO code of the currency to return the amount in.
        source_currency: ISO code of the currency the amounts are assumed to
            be quoted in. The default 'USD' keeps the original behaviour.
            NOTE(review): the sample rows in this notebook quote fees in
            pounds; pass 'GBP' for those to get a real conversion.

    Returns:
        The largest amount found, as a float in ``target_currency``, or
        ``None`` when the text contains no usable amount.
    """
    if not isinstance(fees, str):
        return None

    text = fees.replace(',', '')

    amounts = []
    # Group 1: integer digits, optionally with dot-separated groups of three ("18.000").
    # Group 2: optional decimal part (".50").
    for match in re.finditer(r'(\d+(?:\.\d{3}(?!\d))*)(\.\d+)?', text):
        whole = match.group(1).replace('.', '')
        fraction = match.group(2) or ''
        value = float(whole + fraction)
        if value != 2023 and value > 100:
            amounts.append(value)

    if not amounts:
        return None

    # Same currency on both sides: nothing to convert, so skip the rates client.
    if source_currency == target_currency:
        return max(amounts)

    # One client for the whole field instead of one per call site.
    converter = CurrencyRates()
    return max(
        converter.convert(source_currency, target_currency, value)
        for value in amounts
    )

# Apply the function to the 'fees' column: each free-text entry becomes a single
# number, or a missing value when preprocess_fees returns None.
df['fees'] = df['fees'].apply(preprocess_fees)
df.head()

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,34750.0,MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,"Our Accounting, Accountability & Financial Man...",September,,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
3,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,Embark on a professional accounting career wit...,September,,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,Join us for an online session for prospective ...,September,,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...


In [116]:
def get_tokens(description):
    """Turn a course description into a list of stemmed tokens.

    Steps, in order: lowercase, replace hyphens with spaces (so hyphenated
    compounds split into separate words), delete the remaining characters
    listed in ``string.punctuation``, tokenize with ``nltk.word_tokenize``,
    drop English stop words, and Porter-stem what is left.

    Args:
        description: The raw description text (must provide ``str.lower``).

    Returns:
        A list of stems in their original order; duplicates are kept.
    """
    text = description.lower().replace('-', ' ')
    # One translate pass deletes all punctuation characters at once.
    text = text.translate(str.maketrans('', '', string.punctuation))

    words = nltk.word_tokenize(text)

    english_stop_words = set(nltk.corpus.stopwords.words('english'))
    kept_words = [w for w in words if w not in english_stop_words]

    porter = nltk.PorterStemmer()
    return [porter.stem(w) for w in kept_words]

def create_vocabulary(tokens):
    """Assign a consecutive integer id to every distinct term.

    Args:
        tokens: An iterable of token sequences, one sequence per document.

    Returns:
        A dict mapping each term to its id. Ids start at 0 and are handed out
        in order of first appearance.
    """
    term_ids = {}
    for doc_tokens in tokens:
        for term in doc_tokens:
            # The current size of the mapping is always the next unused id.
            term_ids.setdefault(term, len(term_ids))
    return term_ids

# One list of stemmed tokens per course description (a Series of lists).
tk = df['description'].apply(get_tokens)
# Map every distinct token to an integer id, numbered in order of first appearance.
vocabulary = create_vocabulary(tk)

In [117]:
import pickle
# Persist the term -> id mapping to vocabulary.pkl.
with open('vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)
    
# Disabled round-trip check that reloads and prints the saved mapping.
# Only unpickle files you created yourself: pickle.load can run arbitrary code.
#with open('vocabulary.pkl', 'rb') as f:
#    d = pickle.load(f)
#    
#print(d)

In [150]:
# Display the term -> id mapping.
# NOTE(review): this prints the whole dict; consider showing only a small slice.
vocabulary

{'3d': 0,
 'visualis': 1,
 'anim': 2,
 'play': 3,
 'role': 4,
 'mani': 5,
 'area': 6,
 'popular': 7,
 'media': 8,
 'keep': 9,
 'grow': 10,
 'digit': 11,
 'provid': 12,
 'eye': 13,
 'catch': 14,
 'special': 15,
 'effect': 16,
 '21st': 17,
 'centuri': 18,
 'favourit': 19,
 'film': 20,
 'televis': 21,
 'show': 22,
 'design': 23,
 'also': 24,
 'essenti': 25,
 'everyday': 26,
 'work': 27,
 'everyth': 28,
 'comput': 29,
 'game': 30,
 'develop': 31,
 'onlin': 32,
 'virtual': 33,
 'world': 34,
 'industri': 35,
 'market': 36,
 'product': 37,
 'architecturegcu': 38,
 'programm': 39,
 'environ': 40,
 'help': 41,
 'skill': 42,
 'thrive': 43,
 'success': 44,
 'career': 45,
 'visual': 46,
 'practic': 47,
 'focus': 48,
 'orient': 49,
 'toward': 50,
 'current': 51,
 'need': 52,
 'technolog': 53,
 'prior': 54,
 'knowledg': 55,
 'requir': 56,
 'busi': 57,
 'govern': 58,
 'reli': 59,
 'sound': 60,
 'financi': 61,
 'underpin': 62,
 'strategi': 63,
 'successthi': 64,
 'cours': 65,
 'kick': 66,
 'start': 67

In [146]:
# Display the per-description token lists (pandas truncates the Series automatically).
tk

0       [3d, visualis, anim, play, role, mani, area, p...
1       [busi, govern, reli, sound, financi, knowledg,...
2       [account, account, financi, manag, msc, cours,...
3       [embark, profession, account, career, academ, ...
4       [join, us, onlin, session, prospect, student, ...
                              ...                        
5970    [master, biolog, two, year, msc, programm, spe...
5971    [regist, interest, graduat, studi, uclthi, pro...
5972    [biomed, molecular, scienc, research, msc, mre...
5973    [biomed, analyt, scienc, msc, cours, enabl, de...
5974    [msc, programm, design, produc, highli, qualif...
Name: description, Length: 5975, dtype: object

In [155]:
def create_inverted_indx(vocabulary, tokens):
    """Build an inverted index from term ids to the documents containing them.

    Args:
        vocabulary: A dict mapping each term to its integer id.
        tokens: An iterable of token sequences, one per document. A document
            is identified by its position in this iterable, starting at 0.

    Returns:
        A dict with one entry per id in ``vocabulary``, each holding the set
        of document positions in which that term occurs. Ids whose term never
        occurs map to an empty set; tokens absent from ``vocabulary`` are
        ignored.
    """
    postings = {}
    for term_id in vocabulary.values():
        postings[term_id] = set()

    for doc_position, doc_tokens in enumerate(tokens):
        for term in doc_tokens:
            term_id = vocabulary.get(term)
            if term_id is None:
                # Term is not in the vocabulary: nothing to record.
                continue
            postings[term_id].add(doc_position)

    return postings
    
# Build the index: term id -> set of row positions in tk whose tokens include that term.
inverted_indx = create_inverted_indx(vocabulary, tk)

In [159]:
# Show the documents indexed under term id 10 (the stem 'grow' in the vocabulary shown above).
# NOTE(review): this prints the whole set; consider showing only a small slice.
inverted_indx[10]


{0,
 36,
 39,
 40,
 47,
 49,
 50,
 51,
 53,
 110,
 111,
 117,
 125,
 136,
 142,
 173,
 175,
 176,
 181,
 193,
 208,
 222,
 225,
 230,
 233,
 236,
 240,
 272,
 291,
 295,
 445,
 528,
 689,
 740,
 761,
 789,
 832,
 838,
 847,
 851,
 877,
 881,
 884,
 935,
 960,
 965,
 966,
 974,
 985,
 989,
 993,
 1019,
 1020,
 1109,
 1117,
 1134,
 1138,
 1142,
 1143,
 1144,
 1145,
 1146,
 1147,
 1148,
 1155,
 1163,
 1165,
 1166,
 1167,
 1169,
 1172,
 1192,
 1193,
 1194,
 1200,
 1203,
 1207,
 1209,
 1217,
 1218,
 1246,
 1250,
 1257,
 1272,
 1286,
 1309,
 1314,
 1315,
 1318,
 1319,
 1325,
 1328,
 1332,
 1336,
 1338,
 1339,
 1355,
 1392,
 1399,
 1489,
 1490,
 1494,
 1498,
 1505,
 1507,
 1513,
 1527,
 1533,
 1537,
 1539,
 1549,
 1559,
 1563,
 1571,
 1573,
 1575,
 1577,
 1584,
 1652,
 1654,
 1755,
 1843,
 1844,
 1890,
 1936,
 1953,
 1998,
 2006,
 2041,
 2071,
 2082,
 2085,
 2092,
 2096,
 2106,
 2139,
 2142,
 2179,
 2188,
 2205,
 2206,
 2245,
 2248,
 2249,
 2252,
 2254,
 2271,
 2279,
 2326,
 2339,
 2405,
 248