# NLP Tricks

## Cleaning Text

In [None]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Word-normalisation helpers: a Porter stemmer and a WordNet lemmatizer.
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Fetch the NLTK corpora; 'stopwords' must be available before
# stopwords.words() is called below.
nltk.download('wordnet')
nltk.download('stopwords')

# English stop words, kept in a set for fast membership tests.
stop = {*stopwords.words('english')}

# Individual punctuation characters to strip from documents.
exclude = {*string.punctuation}

# Single preprocessing function: lower-case, drop stop words, strip punctuation, lemmatize
def clean(doc):
    """Return a normalised copy of ``doc`` for downstream text processing.

    The document is lower-cased and split on whitespace, tokens found in the
    module-level ``stop`` set are dropped, every character found in the
    module-level ``exclude`` set is removed, and each remaining word is
    passed through ``wordnet_lemmatizer.lemmatize``.

    Stop words are filtered *before* punctuation is stripped, so a token is
    compared against ``stop`` with any attached punctuation still in place.

    Args:
        doc: The raw document text (a string).

    Returns:
        The cleaned words joined by single spaces.
    """
    kept_tokens = (token for token in doc.lower().split() if token not in stop)
    without_stops = " ".join(kept_tokens)

    # Characters are deleted, not replaced by spaces, so words joined by
    # punctuation are fused together.
    without_punct = "".join(filter(lambda ch: ch not in exclude, without_stops))

    lemmas = [wordnet_lemmatizer.lemmatize(token) for token in without_punct.split()]
    return " ".join(lemmas)

## Simple vectorizer to find top words for topics

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Path of the CSV file to load; the code below reads its 'document' column.
fname = 'some_file'
docs = pd.read_csv(fname)

print(docs.head())
print(docs.shape)

# max_df=0.95 ignores terms appearing in more than 95% of the documents,
# min_df=2 ignores terms appearing in fewer than two documents.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

# Document-term matrix of TF-IDF weights.
dtm = tfidf.fit_transform(docs['document'])

# NOTE(review): n_components=len(docs) requests one topic per document, which
# is unusually large for topic modelling -- confirm the intended topic count.
nmf_model = NMF(n_components=len(docs), random_state=42)

nmf_model.fit(dtm)

# Resolve the vocabulary once instead of once per topic.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on versions that predate get_feature_names_out().
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten indices are the ten
    # highest-weighted terms, printed from weakest to strongest.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')