# Text Complexity

In this notebook:
* Clean data
* Create features
* Preprocess tweets for applying readability scores
* Applying scores and add them to dataframe
* Save dataframe with results

In [2]:
import pandas as pd
import json
import re
import syntok.segmenter as segmenter
import readability

**Example of using the readability library**

In [2]:
text = ('This is an example sentence .\n'
        'Note that tokens are separated by spaces and sentences by newlines .\n')

In [3]:
results = readability.getmeasures(text, lang='en')

In [4]:
results

OrderedDict([('readability grades',
              OrderedDict([('Kincaid', 7.442500000000003),
                           ('ARI', 5.825624999999999),
                           ('Coleman-Liau', 9.532550312500003),
                           ('FleschReadingEase', 55.95250000000002),
                           ('GunningFogIndex', 10.700000000000001),
                           ('LIX', 39.25),
                           ('SMOGIndex', 9.70820393249937),
                           ('RIX', 2.5),
                           ('DaleChallIndex', 9.954550000000001)])),
             ('sentence info',
              OrderedDict([('characters_per_word', 4.9375),
                           ('syll_per_word', 1.6875),
                           ('words_per_sentence', 8.0),
                           ('sentences_per_paragraph', 2.0),
                           ('type_token_ratio', 0.9375),
                           ('characters', 79),
                           ('syllables', 27),
                        

## Open tweets

In [3]:
path = '../Data/tweets_trump_20151109_20161109.json'

with open(path, encoding="utf8") as f:
    tweets = pd.read_json(f, orient='records', convert_axes=True, lines=False) # Json reader

In [4]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4969 entries, 0 to 4968
Data columns (total 7 columns):
source            4969 non-null object
text              4969 non-null object
created_at        4969 non-null datetime64[ns, UTC]
retweet_count     4969 non-null int64
favorite_count    4969 non-null int64
is_retweet        4969 non-null bool
id_str            4969 non-null int64
dtypes: bool(1), datetime64[ns, UTC](1), int64(3), object(2)
memory usage: 237.9+ KB


## Cleaning

And creating some features.

In [8]:
# Save screen name

def save_screen_name(user):
    return user['screen_name'].lower()

tweets['screen_name'] = tweets.apply(lambda x: save_screen_name(x.user), axis = 1)

In [5]:
tweets['user'] = 'Trump'

In [9]:
# Tweet is a reply: True/False

def is_reply(in_reply_to_screen_name):
    if in_reply_to_screen_name:
        return True
    else:
        return False

tweets['is_reply'] = tweets.apply(lambda x: is_reply(x.in_reply_to_screen_name), axis = 1)

In [10]:
# Number of hashtags, mentions and urls

def entities_count(entities, thing):
    return len(entities[thing])

tweets['n_urls'] = tweets.apply(lambda x: entities_count(x.entities, 'urls'), axis = 1)
tweets['n_hashtags'] = tweets.apply(lambda x: entities_count(x.entities, 'hashtags'), axis = 1)
tweets['n_mentions'] = tweets.apply(lambda x: entities_count(x.entities, 'user_mentions'), axis = 1)

In [11]:
# Source

def source(source):
    if source == '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>':
        return 'iphone'
    elif source == '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>':
        return 'android'
    elif source == '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>':
        return 'web'
    elif source == '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>':
        return 'ipad'
    else:
        return ''

tweets['tweet_source'] = tweets.apply(lambda x: source(x.source), axis = 1)

In [12]:
# Remove uninteresting columns
delete_columns = [
    'id_str', 'source', 'timestamp_ms', 'truncated', 'place',
    'contributors', 'is_quote_status', 'entities',
    'in_reply_to_status_id', 'in_reply_to_status_id_str',
    'in_reply_to_user_id', 'in_reply_to_user_id_str',
    'in_reply_to_screen_name', 'extended_entities',
    'geo', 'coordinates', 'place', 'quoted_status_id',
    'quoted_status_id_str', 'quoted_status']
tweets = tweets.drop(columns=delete_columns)

In [21]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6002 entries, 0 to 6001
Data columns (total 8 columns):
source            6002 non-null object
text              6002 non-null object
created_at        6002 non-null datetime64[ns, UTC]
retweet_count     6002 non-null int64
favorite_count    6002 non-null int64
is_retweet        6002 non-null bool
id_str            6002 non-null int64
user              6002 non-null object
dtypes: bool(1), datetime64[ns, UTC](1), int64(3), object(3)
memory usage: 334.2+ KB


## Preprocessing for using readability scores

In [6]:
# Regex patterns
mention_hashtag = r'([@#][\w_-]+)'
url = r'http.?://[^\s]+[\s]?'
line_break = r'\n'

def remove(pattern, text):
    return re.sub(pattern, '', text)

def tokenize(text):
    return '\n\n'.join(
     '\n'.join(' '.join(token.value for token in sentence)
        for sentence in paragraph)
     for paragraph in segmenter.analyze(text))

def clean_tweet(tweet):
    tweet = tokenize(tweet)
    tweet = remove(url, tweet)
    return str(tweet)

**Example of preprocessed tweet**

In [15]:
# Source: https://twitter.com/realDonaldTrump/status/1249081971604631554
example_tweet = """Wishing all a safe and blessed #Easter Sunday. I will be tuning into Pastor @robertjeffress
 at http://firstdallas.org/liveworship Church in Dallas, Texas tomorrow morning at 10:20 AM Eastern."""

example_tweet_clean = clean_tweet(example_tweet)

example_tweet_clean

'Wishing all a safe and blessed # Easter Sunday .\nI will be tuning into Pastor @ robertjeffress at Church in Dallas , Texas tomorrow morning at 10:20 AM Eastern .'

**Preprocess all tweets**

In [7]:
tweets['text_clean'] = tweets.apply(lambda x: clean_tweet(x.text), axis = 1)

## Apply readability scores

In [8]:
def tweet_readability(tweet):
    try:
        results = readability.getmeasures(tweet, lang='en')
    
        # Go through all results and add them to a new dict
        results_flattened = {}
        for result in results:
            for metric in results[result]:
                results_flattened[result + ' ' + metric] = results[result][metric]

        return pd.Series(results_flattened)
    except ValueError:
        return None

**Calculate readability scores**

In [9]:
tweets_readability = tweets.apply(lambda x: tweet_readability(x['text_clean']), axis = 1)

In [10]:
tweets_readability.sample(10)

Unnamed: 0,readability grades Kincaid,readability grades ARI,readability grades Coleman-Liau,readability grades FleschReadingEase,readability grades GunningFogIndex,readability grades LIX,readability grades SMOGIndex,readability grades RIX,readability grades DaleChallIndex,sentence info characters_per_word,...,word usage conjunction,word usage pronoun,word usage preposition,word usage nominalization,sentence beginnings pronoun,sentence beginnings interrogative,sentence beginnings article,sentence beginnings subordination,sentence beginnings conjunction,sentence beginnings preposition
4566,7.566667,11.504,13.977906,67.53,6.0,28.333333,3.0,2.0,15.959833,5.4,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2743,1.865,3.198276,3.853317,107.5175,8.558621,31.741379,8.477226,2.5,14.15639,3.689655,...,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
343,4.800238,4.902857,7.700693,79.348929,6.104762,43.833333,6.872983,3.5,10.172538,4.47619,...,0.0,4.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2696,-2.465185,1.212222,3.560035,128.766667,3.6,27.518519,3.0,1.666667,8.176604,3.851852,...,0.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3554,1.145,4.11125,-0.129036,129.6,9.6,32.333333,3.0,2.0,20.6169,2.875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3306,5.884286,7.437857,9.385127,77.810714,5.6,35.428571,3.0,3.0,6.586614,4.642857,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2519,5.896154,8.598462,4.319147,99.098846,10.4,26.0,3.0,0.0,9.784562,3.615385,...,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
809,-2.145,4.9425,8.430078,124.69,3.2,33.0,3.0,2.0,15.8758,4.75,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4403,6.803846,10.228846,6.35448,92.591154,15.015385,37.538462,12.486833,3.0,13.428408,3.961538,...,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
46,2.88,1.794,4.153084,83.32,6.0,35.0,6.872983,1.5,11.7795,4.4,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Join dataframes**

In [11]:
tweets = tweets.merge(tweets_readability, how = 'left', left_index = True, right_index = True)

In [12]:
tweets

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str,user,text_clean,readability grades Kincaid,...,word usage conjunction,word usage pronoun,word usage preposition,word usage nominalization,sentence beginnings pronoun,sentence beginnings interrogative,sentence beginnings article,sentence beginnings subordination,sentence beginnings conjunction,sentence beginnings preposition
0,Twitter for Android,"Don't let up, keep getting out to vote - this ...",2016-11-08 21:31:20+00:00,35805,116929,False,796102830352465920,Trump,"Do n't let up , keep getting out to vote - thi...",-2.257143,...,1.0,2.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Twitter for Android,"Just out according to @CNN: ""Utah officials re...",2016-11-08 21:28:24+00:00,21504,47353,False,796102093727793152,Trump,"Just out according to @ CNN : "" Utah officials...",10.098571,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Twitter for Android,I will be watching the election results from T...,2016-11-08 21:18:04+00:00,24669,125385,False,796099494442057728,Trump,I will be watching the election results from T...,5.504474,...,1.0,2.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Twitter for iPhone,#ElectionDay https://t.co/MXrAxYnTjY https://t...,2016-11-08 18:23:39+00:00,25164,61186,False,796055597594578944,Trump,# Election Day Ax Yn Tj Y Oncih21,-6.117143,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Twitter Web Client,We need your vote. Go to the POLLS! Let's cont...,2016-11-08 18:03:49+00:00,23878,62105,False,796050609254395904,Trump,We need your vote .\nGo to the POLLS !\nLet 's...,-3.346111,...,0.0,4.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4964,Twitter Web Client,.@Betsy_McCaughey Thanks so much. Really appre...,2015-11-09 18:04:08+00:00,1109,2166,False,663779118761312256,Trump,. @ Betsy Mc Caughey Thanks so much .\nReally ...,1.215455,...,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4965,Twitter for Android,"""@dlustv: Trump SNL Episode Generates Highest ...",2015-11-09 00:55:18+00:00,1072,2616,False,663520201187590144,Trump,""" @ dlustv : Trump SNL Episode Generates Highe...",6.370000,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4966,Twitter for Android,Thank you to all of those who gave me such won...,2015-11-09 00:47:41+00:00,1744,6034,False,663518286915702784,Trump,Thank you to all of those who gave me such won...,3.445000,...,0.0,4.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4967,Twitter for Android,"""@TradingStreetCo:Donald Trump Is Ratings ‘Gol...",2015-11-08 23:31:27+00:00,783,2121,False,663499098780213248,Trump,""" @ Trading Street Co:Donald Trump Is Ratings ...",4.253636,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
tweets.describe()

Unnamed: 0,retweet_count,favorite_count,id_str,readability grades Kincaid,readability grades ARI,readability grades Coleman-Liau,readability grades FleschReadingEase,readability grades GunningFogIndex,readability grades LIX,readability grades SMOGIndex,...,word usage conjunction,word usage pronoun,word usage preposition,word usage nominalization,sentence beginnings pronoun,sentence beginnings interrogative,sentence beginnings article,sentence beginnings subordination,sentence beginnings conjunction,sentence beginnings preposition
count,4969.0,4969.0,4969.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,...,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0,4968.0
mean,5820.947676,15073.893741,7.247643e+17,3.320991,5.631364,7.533526,91.207359,7.03356,32.239664,6.632969,...,0.338366,1.583333,1.718196,0.166264,0.312802,0.036836,0.080717,0.026167,0.007045,0.039251
std,7871.263537,16688.653234,4.03898e+16,4.502402,3.857677,3.835917,27.528073,3.818479,11.687679,3.175351,...,0.589863,1.368304,1.380461,0.420596,0.542887,0.192605,0.284005,0.160905,0.083647,0.197297
min,0.0,0.0,6.634978e+17,-15.2,-8.94,-21.86868,-12.785,0.4,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1949.0,4969.0,6.902727e+17,0.802198,2.881594,5.084441,74.015,4.066667,24.333333,3.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4101.0,11248.0,7.186363e+17,3.435238,5.091,7.350606,89.291154,6.533333,31.222222,6.872983,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7653.0,20602.0,7.60125e+17,6.272308,8.05,9.849929,105.186894,9.705263,40.0,8.477226,...,1.0,2.0,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,344806.0,573283.0,7.961028e+17,17.03,21.96,24.683945,205.82,22.0,104.0,17.491377,...,5.0,9.0,7.0,3.0,4.0,2.0,2.0,2.0,1.0,2.0


### Replace 'source' values

In [14]:
tweets['source'].value_counts()

Twitter for Android    2310
Twitter for iPhone     2138
Twitter Web Client      423
Twitter Ads              63
Twitter for iPad         22
Instagram                 7
TweetDeck                 2
Periscope                 2
Media Studio              1
Mobile Web (M5)           1
Name: source, dtype: int64

In [15]:
tweets = tweets.replace('Twitter for Android', 'Android')
tweets = tweets.replace('Twitter for iPhone', 'iPhone')
tweets = tweets.replace('Twitter Web Client', 'Web')
tweets = tweets.replace('Twitter for iPad', 'iPad')

In [16]:
tweets['source'].value_counts()

Android            2310
iPhone             2138
Web                 423
Twitter Ads          63
iPad                 22
Instagram             7
TweetDeck             2
Periscope             2
Media Studio          1
Mobile Web (M5)       1
Name: source, dtype: int64

## Save Dataframe

In [17]:
json_path = '../Data/tweets_trump_20151109_20161109_readability.json'

# Save full dataframe as csv
tweets.to_json(json_path, orient = 'columns')