### Importing libraries

In [34]:
import pandas as pd
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

from bs4 import BeautifulSoup
from multiprocessing import Pool
import re
import nltk
%matplotlib inline

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

### Importing data

In [2]:
data = pd.read_csv("../raw_data/local-dev/Questions.csv", encoding='latin1')

In [3]:
data['Title'].fillna("None", inplace=True)
data['Score'].fillna(0, inplace=True)

In [58]:
stopwords = nltk.corpus.stopwords.words('english')

In [59]:
print(stopwords[:5])

['i', 'me', 'my', 'myself', 'we']


In [5]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html5lib')
    return soup.getText()

In [10]:
data['combined_title_body'] = data['Title'].map(str) + data['Body']

In [7]:
corpus = data.loc[:, 'combined_title_body'].apply(remove_html_tags)

In [26]:
corpus[:1]

0    Migrating to Twitter API version 1.1 (?) [duplicated]I am quite new to Twitter API. I have updated Tweepy. I don't know what is wrong with this code and how to fix it to make it work for new version of Twitter API:\n\nimport oauth, tweepy \nfrom time import sleep\n\n#stars is confident information\nusername = "*******"\npassword = "***********"\nauth = tweepy.BasicAuthHandler(username, password)\napi = tweepy.API(auth)\n\napi.update_status('hello from tweepy!')\n\n\nTerminal is showing me this:\n\n$ python py/twi.py\nTraceback (most recent call last):\n  File "py/twi.py", line 11, in <module>\n    api.update_status('hello from tweepy!')\n  File "/usr/lib/python2.7/dist-packages/tweepy/binder.py", line 179, in _call\n    return method.execute()\n  File "/usr/lib/python2.7/dist-packages/tweepy/binder.py", line 162, in execute\n    raise TweepError(error_msg, resp)\n tweepy.error.TweepError: [{'message': 'The Twitter REST API v1 is no longer active. Please migrate to     API v1.1. ht

In [57]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adantonison/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adantonison/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adantonison/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

* Using a WordNetLemmatizer to improve the quality of the clustering
* See here for more inofrmation http://textminingonline.com/dive-into-nltk-part-iv-stemming-and-lemmatization

In [35]:
lem = WordNetLemmatizer()
def cond_tokenize(t):
    if t is None:
        return []
    else:
        return [lem.lemmatize(w.lower()) for w in word_tokenize(t)]

p = Pool(8)
tokens = list(p.imap(cond_tokenize, corpus))
p.close()

* Once tokenized, I now want to join all of the text back together into a single string

In [42]:
pure_tokens = [" ".join(sent) for sent in tokens]

* Here I am using the sklearn vectorizer to convert the raw documents to a matrix of TF-IDF features
* See http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [53]:
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(pure_tokens)

In [52]:
idfs = pd.DataFrame([[v, k] for k, v in vectorizer.vocabulary_.items()], columns=['id', 'word']).sort_values('id')
idfs['idf'] = vectorizer.idf_
idfs.sort_values('idf').head(10)

Unnamed: 0,id,word,idf
30,59334,python,1.549666
11,23364,code,1.964269
163,76044,using,1.992192
184,45175,like,2.06055
15,40576,import,2.180031
35,33583,file,2.187871
112,27653,def,2.247328
289,77437,want,2.249768
215,57690,print,2.275569
47,62542,return,2.340274


* Going to compress using a SVD

In [None]:
tsvd = TruncatedSCD(n_components=500)