<a href="https://colab.research.google.com/github/carlakim/APS-4-DPCD/blob/master/IDS2014Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups # https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load 20 Newsgroups with headers, footers and quotes left in the text.
# (Alternative: fetch_20newsgroups(shuffle=True, random_state=1,
#  remove=('headers', 'footers', 'quotes')) strips that metadata.)
dataset = fetch_20newsgroups(random_state=1)
documents, targets = dataset.data, dataset.target

# Report the size of both collections; they should match one-to-one.
for collection in (documents, targets):
    print('#samples :', len(collection))

#samples : 11314
#samples : 11314


In [None]:
# Show the category names; the integers in `targets` are presumably indices
# into this list (verify against the dataset documentation).
print(dataset.target_names)

# One-column frame holding the numeric label of every document.
target_df = pd.Series(targets, name='target').to_frame()
target_df.head()

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Unnamed: 0,target
0,17
1,0
2,17
3,11
4,10


In [None]:
# Keep the raw text in 'document' and derive a normalised 'clean_doc' column.
news_df = pd.DataFrame({'document': documents})
news_df['clean_doc'] = (
    news_df['document']
    # Special character removal: every non-letter becomes a space.
    # regex=True must be explicit -- without it newer pandas treats the
    # pattern as a literal string and nothing is replaced.
    .str.replace("[^a-zA-Z]", " ", regex=True)
    # Short word removal: keep only words longer than three characters.
    .apply(lambda text: ' '.join(word for word in text.split() if len(word) > 3))
    # Lowercase.
    .str.lower()
)

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Preview the first rows: raw 'document' text next to its 'clean_doc' version.
news_df.head()

Unnamed: 0,document,clean_doc
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec...",from virginia andi beyer subject israeli terro...
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...,from timmbake ucsb bake timmons subject amusin...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...,from cleveland freenet mark kaufman subject re...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...,from cdac berry subject clipper business usual...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...,from kkeller mail upenn keith keller subject p...


In [None]:
# Fetch the NLTK stop-word corpus so that stopwords.words('english') can be read.
# nltk is already imported in the notebook's import cell, so it is not
# re-imported here.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words as returned by NLTK (kept as-is for any later use).
stop_words = stopwords.words('english')
# Hashed copy for membership tests: each lookup is O(1) instead of a scan
# of the whole list for every token of every document.
stop_word_set = frozenset(stop_words)

# Tokenise on whitespace and drop stop words in a single pass per document.
tokenized_doc = news_df['clean_doc'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

In [None]:
# Spot-check the stop-word-filtered token list of the document labelled 1.
print(tokenized_doc[1])


['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [None]:
# Glue each document's surviving tokens back into one space-separated string.
# Iteration follows row order; news_df was built without an explicit index,
# so positions and labels coincide.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

# Overwrite the cleaned-text column with the stop-word-free version.
news_df['clean_doc'] = detokenized_doc

In [None]:
# TF-IDF settings gathered in one place so they are easy to vary later.
tfidf_params = {
    'stop_words': 'english',
    'max_features': 1000,  # keep top 1000 words
    'max_df': 0.5,
    'smooth_idf': True,
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Learn the vocabulary from the cleaned text and build the document-term matrix.
X = vectorizer.fit_transform(news_df['clean_doc'])

print('TF-IDF size :', X.shape)

TF-IDF size : (11314, 1000)


In [None]:
# Fit a truncated SVD on the TF-IDF matrix; n_components sets how many
# latent components are extracted, random_state fixes the randomized solver.
svd_model = TruncatedSVD(
    n_components=20,  # set the # components
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)

# Number of fitted components (rows of components_).
svd_model.components_.shape[0]

20

In [None]:
import numpy as np

In [None]:
# Dimensions of the fitted component matrix.
svd_model.components_.shape

(20, 1000)

In [None]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() no longer exists in current scikit-learn; prefer
# get_feature_names_out() and fall back only on old installations.
# list(...) keeps `terms` a plain list, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = list(vectorizer.get_feature_names())

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every component.

    components    -- iterable of 1-D arrays, one weight vector per topic.
    feature_names -- sequence mapping a column index to its term.
    n             -- how many top terms to list per topic (default 5).

    Each output line reads "Topic <k>: [(term, weight), ...]" with k counted
    from 1 and weights rounded to five decimals. Returns None.
    """
    for topic_no, weights in enumerate(components, start=1):
        # Indices sorted by descending weight, truncated to the first n.
        strongest = weights.argsort()[::-1][:n]
        top_terms = []
        for col in strongest:
            top_terms.append((feature_names[col], weights[col].round(5)))
        print("Topic %d:" % topic_no, top_terms)
# Print the top terms (default n=5) for each fitted SVD component.
get_topics(svd_model.components_,terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10



For each of the following cases:


*   Identify important features (5)
*   Draw AUC-ROC curve (5)
*   Interpret results (5)


Cases:
1. Use logistic regression to regress news categories on (latent) components
2. Use logistic regression to regress senders' affiliation on (latent) components
* academia vs. industry (you can identify it by email address domain name)
3. Use logistic regression to regress receivers' affiliation on (latent) components
4. Use a decision tree to do the same job as cases 1, 2, 3 (draw the decision trees) (extra 10)
5. Change the number of top words to include and the number of components; repeat 1, 2, 3, 4. (extra 10)