In [1]:
import string
import pandas as pd

# Data Cleaning
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize 

# Model
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Loading data

data = pd.read_csv('data', sep=",", header=None)
data.columns = ['text']
data

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...
...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...


In [3]:
def clean (text):
    
    # Remove punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')

    # Lower case
    lowercased = text.lower()
    
    # Tokenize
    tokenized = word_tokenize(lowercased)
    
    # Remove numbers
    words_only = [word for word in tokenized if word.isalpha()]
    
    # Stop words
    stop_words = set(stopwords.words('english'))
    without_stopwords = [word for word in words_only if not word in stop_words]
    
    # Lemmatize
    lemma=WordNetLemmatizer()
    lemmatized = [lemma.lemmatize(word) for word in without_stopwords]
    
    return lemmatized

In [4]:
data['clean_text'] = data.text.apply(clean)
data['clean_text'] = data['clean_text'].astype('str')

data

Unnamed: 0,text,clean_text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...,"['gld', 'cunixb', 'cc', 'columbia', 'edu', 'ga..."
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...,"['atterlep', 'vela', 'ac', 'oakland', 'edu', '..."
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...,"['miner', 'kuhub', 'cc', 'ukans', 'edu', 'subj..."
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...,"['atterlep', 'vela', 'ac', 'oakland', 'edu', '..."
4,From: vzhivov@superior.carleton.ca (Vladimir Z...,"['vzhivov', 'superior', 'carleton', 'ca', 'vla..."
...,...,...
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,"['jerryb', 'eskimo', 'com', 'jerry', 'kaufman'..."
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,"['golchowy', 'alchemy', 'chem', 'utoronto', 'c..."
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,"['jayne', 'mmalt', 'guild', 'org', 'jayne', 'k..."
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,"['sclark', 'epa', 'utoronto', 'ca', 'susan', '..."


In [5]:
# Unsupervised Learning in NLP: Latent Dirichlet Allocation model
vectorizer = CountVectorizer()

# Fit and transform in the model
data_vectorized = vectorizer.fit_transform(data['clean_text'])
lda_model = LatentDirichletAllocation(n_components=2)
lda_vectors = lda_model.fit_transform(data_vectorized)

In [None]:
def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])

In [None]:
print_topics(lda_model, vectorizer)