Name : Charles Patel <br>
Email: charlespatel007@yahoo.com

## Features extracted from article text

``article text`` = "This is sample text of article"

Features to be extracted from above ``article text``
1. Numbers of tokens
2. Numbers of unique tokens
3. Average token length
4. N non-stop unique tokens
5. Global subjectivity
6. Avg positive polarity
7. Global sentiment polarity

In [2]:
import nltk
import httplib2
import pandas as pd
import numpy as np

from textblob import TextBlob
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from bs4 import BeautifulSoup, SoupStrainer

In [3]:
stopwords = set(stopwords.words('english'))

In [3]:
def tokenizetext(text):
    return word_tokenize(text)


def words(text):
    return [word for word in word_tokenize(text) if word.isalpha()]


def unique_words(text):
    return list(set(words(text)))


def rate_uni_words(text):
    return round(len(unique_words(text))/len(words(text)), 2)


def avg_length_token(text):
    w = words(text)
    count = 0
    for item in w:
        count += len(item) 
    return count/len(w)


def n_non_stop_unique_tokens(text):
    uw = unique_words(text)
    n_uw = [item for item in uw if item not in stopwords]
    w = words(text)
    n_w = [item for item in w if item not in stopwords]
    rate_nsut = len(n_uw)/len(n_w)
    return rate_nsut


def num_links(link):
    try:
        http = httplib2.Http()
        status, response = http.request(link)

        count = sum([1 for link in BeautifulSoup(response, parse_only=SoupStrainer('a'))
                 if link.has_attr('href')])
        return count
    except:
        return 0


def get_subjectivity(text):
    return text.sentiment.subjectivity


def get_polarity(text):
    return text.sentiment.polarity


def word_polarity(words):
    pos_words, pos_words_polarity = [], []
    neg_words, pneg_words_polarity = [], []
    neu_words, pneu_words_polarity = [], []
    
    for word in words:
        an_word = TextBlob(word)
        val = an_word.sentiment.polarity
        if val > 0:
            pos_words.append(word)
            pos_words_polarity.append(val)
        if val < 0:
            neg_words.append(word)
            pneg_words_polarity.append(val)
        if val == 0 :
            neu_words.append(word)
            pneu_words_polarity.append(val)
            
    return (pos_words, pos_words_polarity, 
            neg_words, pneg_words_polarity, 
            neu_words, pneu_words_polarity)


def avg_pol_pw(text):    
    res = word_polarity(words(text))
    return np.sum(res[1])/len(res[0])


def avg_pol_nw(text):    
    res = word_polarity(words(text))
    return np.sum(res[3])/len(res[2])

In [5]:
articles = pd.read_csv('data/shared_articles.csv')
articles

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3117,1487946604,CONTENT SHARED,9213260650272029784,3609194402293569455,7144190892417579456,Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebK...,SP,BR,HTML,https://startupi.com.br/2017/02/liga-ventures-...,"Conheça a Liga IoT, plataforma de inovação abe...","A Liga Ventures, aceleradora de startups espec...",pt
3118,1487947067,CONTENT SHARED,-3295913657316686039,6960073744377754728,-8193630595542572738,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3...,GA,US,HTML,https://thenextweb.com/apps/2017/02/14/amazon-...,Amazon takes on Skype and GoToMeeting with its...,"Amazon has launched Chime, a video conferencin...",en
3119,1488223224,CONTENT SHARED,3618271604906293310,1908339160857512799,-183341653743161643,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0...,SP,BR,HTML,https://code.org/about/2016,Code.org 2016 Annual Report,"February 9, 2017 - We begin each year with a l...",en
3120,1488300719,CONTENT SHARED,6607431762270322325,-1393866732742189886,2367029511384577082,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,MG,BR,HTML,https://www.bloomberg.com/news/articles/2017-0...,JPMorgan Software Does in Seconds What Took La...,"At JPMorgan Chase & Co., a learning machine is...",en


In [6]:
def get_text_features(content_id, article, url):
    row = {}
    analysed_text = TextBlob(article)
    
    row['contentId'] = content_id
    row['tokens'] = len(words(article))
    row['unique_tokens'] = len(unique_words(article))
    row['average_token_length'] = avg_length_token(article)
    row['n_non_stop_unique_tokens'] = n_non_stop_unique_tokens(article)
    row['global_subjectivity'] = get_subjectivity(analysed_text)
    row['avg_positive_polarity'] = avg_pol_pw(article)
    row['global_sentiment_polarity'] = get_polarity(analysed_text)
    
    return row

In [7]:
data = pd.DataFrame(columns = ['contentId', 'tokens', 'unique_tokens', 'average_token_length',
                               'n_non_stop_unique_tokens', 'global_subjectivity', 
                               'avg_positive_polarity', 'global_sentiment_polarity'])

for i in range(0, len(articles)):
    content_id = articles.iloc[i]['contentId']
    article = articles.iloc[i]['text']
    url = articles.iloc[i]['url']

    features = get_text_features(content_id, article, url)
    data = data.append(features, ignore_index = True)

  return np.sum(res[1])/len(res[0])


In [8]:
data.dtypes

contentId                    float64
tokens                       float64
unique_tokens                float64
average_token_length         float64
n_non_stop_unique_tokens     float64
global_subjectivity          float64
avg_positive_polarity        float64
global_sentiment_polarity    float64
dtype: object

In [9]:
data.contentId = articles.contentId

In [13]:
data = data.fillna(0)

In [14]:
data.dtypes

contentId                      int64
tokens                       float64
unique_tokens                float64
average_token_length         float64
n_non_stop_unique_tokens     float64
global_subjectivity          float64
avg_positive_polarity        float64
global_sentiment_polarity    float64
dtype: object

In [15]:
data.to_csv('data/article_info.csv', index=False)