In [None]:
import re
from typing import List

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Path to the regulations dataset, relative to the notebook's working directory.
csvfile = '../Semantic-Analysis-Regulations/regulations_data_analysis.csv'

# Load the dataset into a DataFrame.
# NOTE(review): open() without an explicit encoding uses the platform default;
# confirm the CSV's encoding if this notebook is run on different systems.
with open(csvfile, 'r') as f:
    df = pd.read_csv(f)

# Only the 'long_title' column is used by the cells below; it does not need
# the file handle, so it is taken after the file has been closed.
long_titles = df['long_title']

# Fetch the NLTK resources used below: the Punkt tokenizer models for
# nltk.word_tokenize and the stopword lists for nltk.corpus.stopwords.
# NLTK >= 3.9 loads the tokenizer models from 'punkt_tab' rather than 'punkt',
# so both are requested to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [10]:
# Matches any token that contains at least one ASCII letter.
_HAS_ASCII_LETTER = re.compile(r'[A-Za-z]')

# Lazily filled cache so the stopword corpus is read once rather than once per title.
_STOPWORDS_CACHE: dict = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def tokenize(title: str) -> List[str]:
    """Split a title into stemmed tokens, dropping stopwords and letter-free tokens.

    Steps:
      1. Tokenize the title with ``nltk.word_tokenize``.
      2. Keep only tokens that contain at least one ASCII letter (this removes
         pure numbers and punctuation) and that are not English stopwords.
         The stopword check is case-insensitive, so a capitalized stopword
         such as "The" is removed just like "the".
      3. Stem every remaining token with NLTK's Porter stemmer.

    Args:
        title: The raw title text.

    Returns:
        The list of stemmed tokens, in their original order.
    """
    # Tokenize
    tokenized_title: List[str] = nltk.word_tokenize(title)

    # Remove unwanted words (numbers and stopwords)
    stops = _english_stopwords()
    # TODO: Do we need to filter out numbers?
    tokenized_title = [token for token in tokenized_title
                       if _HAS_ASCII_LETTER.search(token)
                       and token.lower() not in stops]

    # Stemming
    # https://www.nltk.org/api/nltk.stem.html#module-nltk.stem.porter
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokenized_title]

In [8]:
# Build a TF-IDF matrix over the long titles, using the custom `tokenize`
# function in place of the vectorizer's default tokenization.
vectorizer = TfidfVectorizer(tokenizer=tokenize) # TODO: min_df and max_df could be set to filter out some words
tf_idf = vectorizer.fit_transform(long_titles)

# NOTE: toarray() materializes the whole matrix densely just for display.
print(tf_idf.toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # Convert the returned array to a plain list so the printed form matches
    # what the old API produced.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)