In [1]:
import os
import re
import string
import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [11]:
os.chdir('/Users/biancaorozco/Desktop/Metis/project4/medium/data/')
df = pd.read_csv('Medium_Clean.csv').sample(50000)

In [12]:
# Only want 4 columns for my new dataset
newdf = df.loc[:, ['Title', 'Subtitle', 'Claps']]

# Claps from floats to integers
newdf['Claps'] = newdf['Claps'].astype(int)

data = newdf.dropna()

In [13]:
# Remove Blog posts in between popular (>1000 claps) and unpopular (<200)
index_drop = data[(data['Claps']>200) & (data['Claps']<1000)].index
data.drop(index_drop, inplace=True)

# Set certain number of claps to popular, the rest unpopular
data['Sentiment'] = np.where(data['Claps'] >= 1000, 'popular', 'unpopular')

# Include only the Sentiment and Title columns
df = data[['Sentiment', 'Title']]
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Sentiment,Title
1237392,unpopular,Just Another Day at the Lunch Counter
284539,unpopular,Bubble Shooter Source Code
295371,unpopular,How to Earn Big-Time Cash with Bounty!
376744,unpopular,http://www.healthbeautyfacts.com/max-test-ultr...
1119349,unpopular,Android?


In [14]:
# Preprocessing Titles
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

data['Title'] = data['Title'].map(alphanumeric).map(punc_lower)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Title,Subtitle,Claps,Sentiment
1237392,just another day at the lunch counter,what it means to drop on one knee,199,unpopular
284539,bubble shooter source code,Balloon pop is Associate in Nursing classic an...,0,unpopular
295371,how to earn big time cash with bounty,As we enter the final stretch of our Costco Gr...,71,unpopular
376744,http www healthbeautyfacts com max test ultr...,Max Test Ultra in spite of the program of the ...,0,unpopular
1119349,android,2007AndroidAndroidAndroid Sundar PichaiAndroid,70,unpopular


In [15]:
# Split the data into X and y data sets
X = data['Title']
y = data['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [16]:
# Create TF-IDF 
tfidf1 = TfidfVectorizer(stop_words='english', min_df = 3, max_df = 0.9)
tfdoc1 = tfidf1.fit_transform(X_train)

pd.DataFrame(tfdoc1.toarray(), columns=tfidf1.get_feature_names()).head()

Unnamed: 0,abandoned,abc,ability,able,abroad,absolute,absolutely,abstract,abu,abuse,...,youth,youtube,youve,zealand,zen,zero,zip,zippyshare,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Create TF-IDF
tfidf2 = TfidfVectorizer(ngram_range=(1,2), binary=True, stop_words='english', min_df = 3, max_df = 0.9)
tfdoc2 = tfidf2.fit_transform(X_train)

pd.DataFrame(tfdoc2.toarray(), columns=tfidf2.get_feature_names()).head()

Unnamed: 0,aaa,aaa reserve,aadhaar,aadhaar card,aae,aae exam,aantrekkelijk,aantrekkelijk te,aaron,aaron mobile,...,zullen geven,zulmi,zulmi fan,zum,zum ironman,zum sonntag,zvi,zvi heckers,zyplex,zzzs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(tfdoc1)
print('TF-IDF (unigrams)\nLSA Variance Ratio:', lsa.explained_variance_ratio_)

TF-IDF (unigrams)
LSA Variance Ratio: [0.00504626 0.00386289 0.00323795 0.00328163 0.00294868]


In [19]:
topic_word = pd.DataFrame(lsa.components_.round(3), 
                          index = ['c1', 'c2', 'c3', 'c4', 'c5'], 
                          columns = tfidf1.get_feature_names())
topic_word

Unnamed: 0,abandoned,abc,ability,able,abroad,absolute,absolutely,abstract,abu,abuse,...,youth,youtube,youve,zealand,zen,zero,zip,zippyshare,zone,zoom
c1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002,0.0,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0
c2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001,0.001,0.002,0.0,0.0,0.004,0.0,0.0,0.0
c3,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.001,0.001,...,0.0,0.003,0.003,0.013,0.001,0.0,-0.001,0.0,0.002,0.0
c4,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,...,-0.0,0.005,0.006,-0.005,0.0,-0.0,-0.0,-0.0,-0.001,0.0
c5,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,...,0.002,0.001,0.001,-0.005,0.004,0.0,-0.0,0.0,-0.0,0.0


In [20]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [21]:
display_topics(lsa, tfidf1.get_feature_names(), 10)


Topic  0
day, life, new, time, days, photo, week, padres, challenge, series

Topic  1
love, new, life, story, design, time, self, world, dont, learning

Topic  2
new, life, design, time, year, things, best, social, week, media

Topic  3
life, social, media, lessons, change, data, work, story, dont, make

Topic  4
design, web, best, development, social, media, things, good, ux, make


In [22]:
Vt = pd.DataFrame(doc_topic.round(5),
             index = list(X_train),
             columns = ['c1', 'c2', 'c3', 'c4', 'c5'])
Vt

Unnamed: 0,c1,c2,c3,c4,c5
quick checkout and rebase in git,0.00040,0.00088,0.00295,0.00216,0.00146
lining up your structure and your strategy,0.00076,0.00218,0.00876,-0.00169,0.01325
to gamble or to stop losing money,0.00796,0.01710,0.01795,0.00890,0.02410
calling all millennials how do you protect public lands,0.00040,0.00084,0.00247,0.00021,0.00228
the nba cap n crunch part i,0.00134,0.00110,0.00594,-0.00166,-0.00061
...,...,...,...,...,...
cloudsight aithe official redesign,0.00136,0.00058,0.00292,-0.00087,0.00156
saudi arabia is more interested in pr spin than womens rights,0.00222,0.00053,0.00235,0.00005,0.00091
cryptocurrency can now be given to children,0.00077,0.00311,0.00767,-0.00271,0.00493
http evaherbalist com vitrixa ageless,0.00158,0.00287,0.00463,-0.00877,0.00634


In [23]:
cosine_similarity(tfdoc1).round(3)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [None]:
cosine_similarity(lsa.transform(tfdoc1)).round(3)