In [1]:
import pandas as pd
import re
from textblob import TextBlob
import datascience as ds

Task 1 - Read the xls file

In [2]:
# Location of the sample workbook, relative to the working directory.
file_name = r'data/samples/Sample1.xls'

# Labels assigned to the sheet's columns, in order.
mycols = [
    'GUID', 'Date', 'URL', 'Contents', 'Author', 'Name', 'Country',
    'State/Region', 'City/UrbanArea', 'Category', 'Source', 'Gender',
    'Posts', 'Followers', 'Following', 'PostTitle', 'PostType',
    'ImageURL', 'Brand',
]

# The sheet is read without treating any row as a header (the labels above
# are used instead), and the first two rows are then sliced off.
# NOTE(review): presumably those two rows are title/header rows - confirm.
data = pd.read_excel(
    file_name,
    index_col=None,
    header=None,
    names=mycols,
).iloc[2:]

In [3]:
# List the distinct values found in the Source column.
print(data['Source'].unique())

['Twitter' 'Tumblr' 'News' 'Reddit' 'Forums' 'YouTube' 'Blogs' 'Comments']


Task 2 - Removing Tumblr records and keeping only the Twitter ones

In [4]:
# Keep every row whose Source is not 'Tumblr'. Filtering with a boolean mask
# and rebinding `data` selects the same rows as collecting the Tumblr index
# labels and dropping them, without mutating the frame in place.
data = data[data['Source'] != 'Tumblr']

# Confirm that 'Tumblr' is no longer among the distinct sources.
print(data['Source'].unique())

['Twitter' 'News' 'Reddit' 'Forums' 'YouTube' 'Blogs' 'Comments']


In [5]:
# Restrict the frame to rows whose Source is exactly 'Twitter'.
rslt_df = data.loc[data['Source'].eq('Twitter')]

In [6]:
# Check which distinct sources remain in the filtered frame.
print(rslt_df['Source'].unique())

['Twitter']


In [7]:
# Print the frame that holds only the rows whose Source is 'Twitter'.
print(rslt_df)

                     GUID                 Date  \
2      921898005548036096  2017-10-22 00:36:27   
3      924449998972088320  2017-10-29 01:37:09   
4      920046800492466178  2017-10-16 22:00:25   
5      923767728128888833  2017-10-27 04:26:03   
6      921388952388554752  2017-10-20 14:53:39   
...                   ...                  ...   
9995   924647308507131904  2017-10-29 14:41:12   
9996   923560765008867329  2017-10-26 14:43:39   
9997   922265988724338688  2017-10-23 00:58:41   
10000  923905913362239489  2017-10-27 13:35:09   
10001  921730014311337984  2017-10-21 13:28:54   

                                                     URL  \
2      http://twitter.com/cuzwkxbmug1830/status/92189...   
3      http://twitter.com/PoetFedericoDJ/status/92444...   
4      http://twitter.com/niwandajones/status/9200468...   
5      http://twitter.com/PuddnHaid/status/9237677281...   
6      http://twitter.com/littletuans/status/92138895...   
...                                    

Task 3 - Sentiment polarity of each tweet

In [8]:
# Pull out the Contents column (the tweet text) as a Series and show it.
tweets = rslt_df.loc[:, 'Contents']
print(tweets)

2        RT @Ththeforce After a party @parisreview , I ...
3        RT @AshleyJudd Yet when we get together in gro...
4        RT @apbenven Reminder that if a woman didn't p...
5        @PuestoLoco Entitlement. They Feel They're Ent...
6        RT @soompi f(x)’s Amber Adds Her Own Thoughts ...
                               ...                        
9995     RT @AryesOfficial As the #MeToo campaign is go...
9996     Important steps you can take when faced w/ #se...
9997     RT @TheRoot While most people think #MeToo is ...
10000    RT @anne_theriault French equivalent of #metoo...
10001    RT @disspat Pretty sure you oversexualize your...
Name: Contents, Length: 8893, dtype: object


In [9]:
def preprocess(tweet):
    """Strip Twitter noise from a tweet and normalise its whitespace.

    The following spans are each replaced by a single space:

    * @-mentions: ``@`` followed by ASCII letters, digits or underscores;
    * any single character that is not an ASCII letter, digit, space or tab;
    * URL-like tokens of the form ``scheme://...`` up to the next whitespace.

    Afterwards runs of whitespace are collapsed to one space and leading and
    trailing whitespace is removed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing is left.
    """
    # Raw string: \w, \/ and \S must reach the regex engine unchanged rather
    # than being treated as (invalid) Python string escapes.
    # The second alternative must stand alone: a literal " 1" appended to it
    # would limit symbol removal to symbols that are followed by " 1".
    # The handle class includes "_" so that names such as "@some_user" are
    # removed whole instead of leaving the part after the underscore behind.
    noise_pattern = r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(noise_pattern, " ", tweet).split())

In [10]:
def polarity(tweet_after_preprocess):
    """Label a cleaned tweet by the sign of its TextBlob polarity score.

    Parameters
    ----------
    tweet_after_preprocess : str
        Tweet text that has already been cleaned.

    Returns
    -------
    str
        'positive' when the score is above zero, 'neutral' when it is exactly
        zero, and 'negative' otherwise.
    """
    score = TextBlob(tweet_after_preprocess).sentiment.polarity
    if score > 0:
        return 'positive'
    if score == 0:
        return 'neutral'
    return 'negative'

In [11]:
# List the distinct post types present among the Twitter rows.
print(rslt_df['PostType'].unique())

['Retweet' 'Reply' 'Tweet']


In [12]:
# Keep the first occurrence of each distinct tweet, in its original order.
# dict.fromkeys deduplicates in one pass; testing `tweet not in <list>` for
# every tweet rescans the whole list each time and grows quadratically.
selected_tweets = list(dict.fromkeys(tweets))

# Clean and classify every distinct tweet exactly once.
polarity_of_tweets = [polarity(preprocess(tweet)) for tweet in selected_tweets]

# One row per distinct tweet, with its sentiment label alongside.
final_polarity_table = ds.Table().with_columns('Tweet', selected_tweets, 'Polarity', polarity_of_tweets)
print(final_polarity_table)

Tweet                                                        | Polarity
RT @Ththeforce After a party @parisreview , I couldn't s ... | neutral
RT @AshleyJudd Yet when we get together in groups, we di ... | neutral
RT @apbenven Reminder that if a woman didn't post #MeToo ... | positive
@PuestoLoco Entitlement. They Feel They're Entitled. The ... | positive
RT @soompi f(x)’s Amber Adds Her Own Thoughts To “#MeToo ... | positive
Whaat? Weinstein tells pals scandal happened so he could ... | neutral
Twitter is not a platform for abuse. 31 Oct, 17 05:44 pm ... | neutral
RT @NicoleCorrado16 To ALL complicit corrupt govt MONSTE ... | negative
RT @Alyssa_Milano Weinstein, Empowerment And #MeToo As T ... | neutral
The latest The Hiroshi Suzuki Daily! https://t.co/PXTmqJ ... | positive
... (5752 rows omitted)
