In [46]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup

In [47]:
issues = pd.read_json('cleaned_issues.json')

In [48]:
issues.head()

Unnamed: 0,repo_name,title,body,languages
0,hyper,Allow overriding the default shell on Windows,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]"
1,hyper,Support forward slashes on Windows,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]"
2,hyper,Support home-relative paths by expanding `~`,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]"
3,vercel,`vercel link --repo` very slow,I used `0x` to trace it and its a glob issue.\...,"[TypeScript, JavaScript, HTML]"
4,vercel,[cli] Add telemetry for `vercel teams invite e...,Additional telemetry (and minimal test setup t...,"[TypeScript, JavaScript, HTML]"


In [49]:
issues['tags'] = issues['title'] + ' ' + issues['body'] + ' ' + str(issues['languages'])

In [50]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\W', ' ', text)
    text = BeautifulSoup(text, "html.parser").get_text() 
    return text

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

import spacy
nlp = spacy.load("en_core_web_sm")

def lemmatize_text(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if not token.is_stop])

[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
issues['tags'] = issues['tags'].apply(clean_text)
issues['tags'] = issues['tags'].apply(remove_stopwords)
issues['tags'] = issues['tags'].apply(lemmatize_text)

In [52]:
issues.head()

Unnamed: 0,repo_name,title,body,languages,tags
0,hyper,Allow overriding the default shell on Windows,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]",allow overriding default shell window feature ...
1,hyper,Support forward slashes on Windows,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]",support forward slash window feature request r...
2,hyper,Support home-relative paths by expanding `~`,**Is your feature request related to a problem...,"[TypeScript, Shell, NSIS]",support homerelative path expand feature reque...
3,vercel,`vercel link --repo` very slow,I used `0x` to trace it and its a glob issue.\...,"[TypeScript, JavaScript, HTML]",vercel link repo slow x trace glob issue issue...
4,vercel,[cli] Add telemetry for `vercel teams invite e...,Additional telemetry (and minimal test setup t...,"[TypeScript, JavaScript, HTML]",cli add telemetry vercel team invite email ema...


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(issues['tags'])

In [54]:
tfidf_matrix.shape

(21, 927)

In [55]:
issues.to_json('preprocessed.json')