# Import packages

In [None]:
import tweepy
from wordcloud import WordCloud
import nltk

Download two of the nltk corpora (see the full list at http://nltk.org/nltk_data/).

In [None]:
# Fetch the two NLTK data sets this notebook relies on: the stopword lists
# (used for stopword removal) and WordNet (used by the lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Import other packages.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Obtain key codes to access Twitter's API

Sign in with your existing Twitter account (or make a new one) at https://twitter.com/.

Go to https://developer.twitter.com and register to be a developer. Select personal use when asked. Answer four questions indicating use on a student project. 

After completing the developer registration process, select "Create an app" to make a new application.
Fill out the form and feel free to use very generic information (e.g., App name: Your own name, Website URL: "https://www.darden.virginia.edu/", Description: Collect and analyze text). You can ignore everything after Website URL, except for how the app will be used. Tell us how the app will be used: "This app will collect and analyze text for learning purposes. The app will be written in Python using the tweepy package." Click Create.

Go to "Keys and tokens" tab within the app you create, and create "Access token & access token secret". ("Consumer API keys" should already be generated.)

Go to the Permissions tab and change the access permission to "Read-only", because you are not using the code to write tweets back to the site.

Save your "Consumer API keys" and "Access token and access token secret" in a safe place. Put these four codes in place of "xxx" in cell below. Be sure to keep the quotation marks.

In [None]:
# This is commented out because we load the tweets from a file
# consumer_key = "xxx"
# consumer_secret = "xxx"
# access_token = "xxx"
# access_token_secret = "xxx"

The cell below will provide access to Twitter's API.

In [None]:
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)
# api = tweepy.API(auth)

# Download some recent tweets

Download the 1000 latest tweets that use a key word. The cell below runs a for-loop in Python and appends each new tweet to the object called results, which is of class "list". This class has the method "append", which is a function that appends the item inside the parentheses to the list.

In [None]:
# key_words = 'star wars'
# results = []
# for tweet in tweepy.Cursor(api.search, q = key_words, lang = 'en').items(1000): 
#     results.append(tweet)

In [None]:
# The tweets are restored from a local pickle file instead of being fetched
# from the Twitter API.
# NOTE(review): unpickling can execute code stored in the file, so only load a
# 'saved_tweets.pkl' that comes from a trusted source.
import pickle

key_words = 'star wars'

with open('saved_tweets.pkl', 'rb') as pickle_file:
    results = pickle.load(pickle_file)

Print out the first five items in the list. These items in the resulting list are difficult to interpret. They are pieces of a raw json file (a standard file format for storing web-based data). Below we parse our list of tweets so that they are easier to interpret. 

In [None]:
# Peek at the first five tweet objects in the list.
results[:5]

In [None]:
# Inspect the `_json` attribute of the first tweet
# (presumably the raw API payload for that tweet -- confirm).
results[0]._json

Extract some information on each of the first five tweets.

In [None]:
# For each of the first five tweets, print its text and creation time, the
# author's time zone, screen name and follower count, and the retweet count.
for tweet in results[:5]:
    author = tweet.user
    print(
        tweet.text,
        tweet.created_at,
        author.time_zone,
        author.screen_name,
        author.followers_count,
        tweet.retweet_count,
    )

Put the extracted tweet information into a data frame. First create an empty data frame. Then add columns for some key variables. Each column is a list (anything inside []), which is created with a for-loop. For instance, for each tweet in the big results list, go to its text (via tweet.text) and put it in the list.

In [None]:
# Build the data frame in one step: each key becomes a column and each list
# holds one value per tweet, in the same order as `results`.
df = pd.DataFrame({
    'tweetText': [tweet.text for tweet in results],
    'tweetCreated': [tweet.created_at for tweet in results],
    'userTimeZone': [tweet.user.time_zone for tweet in results],
    'userScreenName': [tweet.user.screen_name for tweet in results],
})

To add followers count and retweet count, we need to know if the tweet is a retweet or not. The best way to check (using try/except) is to see if the attribute 'retweeted_status' is a part of the tweet's information. If it is, then we take the followers count of the person who tweeted the original tweet. In this case, we also take the retweet count of the original tweet. Otherwise, we grab the followers and retweet counts of the current tweet.

In [None]:
# Collect both counts in a single pass over the tweets. A retweet carries a
# `retweeted_status` attribute; when it is present the counts come from the
# original tweet, otherwise (AttributeError) from the tweet itself.
followers_count_list = []
retweet_count_list = []
for tweet in results:
    try:
        followers = tweet.retweeted_status.user.followers_count
    except AttributeError:
        followers = tweet.user.followers_count
    followers_count_list.append(followers)

    try:
        retweets = tweet.retweeted_status.retweet_count
    except AttributeError:
        retweets = tweet.retweet_count
    retweet_count_list.append(retweets)

df['followersCount'] = followers_count_list
df['retweetCount'] = retweet_count_list

In [None]:
# Show the first ten rows of the data frame.
df.head(n=10)

In [None]:
# (number of rows, number of columns) -- one row per tweet.
df.shape

# Clean the tweets

Convert the text in the tweets to all lower case. Remove the stopwords (such as 'a', 'the', and 'to') from the tweets. Keep each word's lemma, e.g., the verbs gone, going, and went have the lemma go. The lemmatizer needs to know the word's part of speech (pos). Use the regular expressions package re to remove http, RT, numbers and punctuations (except # and @), and whitespace.

In [None]:
# Keep the stopwords in a set: the cleaning loop tests every word of every
# tweet with `word not in stopset`, which is a constant-time lookup on a set
# but a full scan on a list.
stopset = set(stopwords.words('english'))
print(sorted(stopset))  # Sorted so the printed list is the same on every run.

In [None]:
# Create the lemmatizer and show the verb lemma of three forms of "go".
wordnet_lemmatizer = WordNetLemmatizer()
[wordnet_lemmatizer.lemmatize(verb_form, pos="v") for verb_form in ("gone", "going", "went")]

In [None]:
# Build `clean_tweets_text`: one cleaned string per entry of df['tweetText'].
#
# The patterns are compiled once, outside the loop. Each one is anchored so it
# removes only what it is meant to remove:
#   * "RT" only as a stand-alone token (so "ARTIST" is left alone);
#   * the HTML entity "&amp;" itself, not every "amp" inside a word ("camp");
#   * the key phrase only when it is not glued to other letters.
retweet_marker = re.compile(r"\bRT\b")
html_ampersand = re.compile(r"&amp;")
web_link = re.compile(r"http\S+")
unwanted_chars = re.compile(r"[^a-zA-Z#@]")  # Everything except letters, '#' and '@'.
stopword_lookup = set(stopset)  # Constant-time membership tests.

key_phrase = ' '.join(key_words.lower().split())
# An empty key phrase means "remove nothing".
key_phrase_pattern = (
    re.compile(r"(?<![a-z])" + re.escape(key_phrase) + r"(?![a-z])") if key_phrase else None
)

clean_tweets_text = []
for tweet in df['tweetText']:  # Loop through the tweets, cleaning one at a time.
    cleaned_tweet = retweet_marker.sub(" ", tweet)  # Remove RT (before lower-casing, so "rt" inside text is untouched).
    cleaned_tweet = cleaned_tweet.lower()  # Convert the text to lower case.
    cleaned_tweet = html_ampersand.sub(" ", cleaned_tweet)  # Remove the HTML-escaped ampersand.
    cleaned_tweet = web_link.sub(" ", cleaned_tweet)  # Remove http links.
    cleaned_tweet = unwanted_chars.sub(" ", cleaned_tweet)  # Remove numbers and punctuation except # and @.
    cleaned_tweet = ' '.join(cleaned_tweet.split())  # Collapse runs of white space.
    if key_phrase_pattern is not None:
        # Remove the key words before lemmatizing: afterwards "wars" has become
        # "war" and the phrase would no longer match.
        cleaned_tweet = key_phrase_pattern.sub(" ", cleaned_tweet)
    # Stopwords are filtered after punctuation is stripped, so "the," is caught too.
    kept_words = [word for word in cleaned_tweet.split() if word not in stopword_lookup]
    # Keep each word's noun lemma, then its verb lemma.
    lemmas = [
        wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(word, pos='n'), pos='v')
        for word in kept_words
    ]
    clean_tweets_text.append(' '.join(lemmas))

Add a column for the clean tweets to the existing data frame and print the new data frame.

In [None]:
# Store the cleaned text next to the raw text and compare the first ten rows.
df['cleanTweetText'] = clean_tweets_text
columns_to_compare = ['tweetText', 'cleanTweetText']
df[columns_to_compare].head(n=10)

Some retweets will appear several times so we drop the duplicates. The remaining tweets will be less than 1000.

In [None]:
# Rows with the same cleaned text count as duplicates; keep the first of each.
df = df.drop_duplicates(subset=['cleanTweetText'], keep='first')
df.shape

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the data frame.
df.describe()

Look at the correlation between followers count and retweet count. Why might we expect them to be correlated?

In [None]:
# 2x2 correlation matrix; the off-diagonal entry is the correlation between
# followers count and retweet count.
np.corrcoef(df['followersCount'], df['retweetCount'])

In [None]:
# Scatter plot of retweet count (vertical axis) against followers count (horizontal axis).
plt.scatter(x=df['followersCount'], y=df['retweetCount'])
plt.ylabel('Retweet Count')
plt.xlabel('Followers Count')
plt.show()

Should we apply a log transform to followers count and retweet count?

In [None]:
# Same correlation after a log transform; adding 1 first keeps zero counts
# finite (log(1) = 0).
np.corrcoef(np.log(1+df['followersCount']), np.log(1+df['retweetCount']))

In [None]:
# Scatter plot of the log-transformed counts; 1 is added before taking logs so
# that zero counts stay finite.
log_followers = np.log(1 + df['followersCount'])
log_retweets = np.log(1 + df['retweetCount'])
plt.scatter(x=log_followers, y=log_retweets)
plt.ylabel('log of Retweet Count')
plt.xlabel('log of Followers Count')
plt.show()

Look at the counts for each of the retweet count levels. Are most tweets retweeted or not?

In [None]:
# How many tweets have each distinct retweet count.
df['retweetCount'].value_counts()

Look at this same information in a histogram.

In [None]:
# Histogram of retweet counts restricted to the range 0-20, using 100 bins.
df['retweetCount'].hist(bins=100, range=(0,20))

Create two subsetted data frames -- one for tweets without any retweets and another for tweets that were retweeted.

In [None]:
# Split the tweets by whether they were retweeted at least once.
retweet_counts = df['retweetCount']
df_no_retweet = df[retweet_counts.eq(0)]
df_some_retweet = df[retweet_counts.gt(0)]

# Create some wordclouds

Convert all the cleaned tweet texts into one long sentence. Then make a wordcloud.

In [None]:
# Concatenate every cleaned tweet into a single space-separated string.
all_clean_tweets = df['cleanTweetText'].tolist()
one_long_string = ' '.join(all_clean_tweets)

Replace variants of your key words.

In [None]:
# Remove the key words and their variants, but only as whole words. A plain
# substring replace of 'star' / 'war' would also eat the middle of unrelated
# words (e.g. "start" -> "t", "award" -> "ad", "warm" -> "m") and leave
# fragments in the word cloud.
key_word_variants = re.compile(r"\b(?:starwars|stars?|wars?)\b")
one_long_string = key_word_variants.sub('', one_long_string)

In [None]:
# Build a word cloud from all cleaned tweets and draw it without axes.
wordcloud = WordCloud().generate(one_long_string)
figure_title = 'Tweets with the key words: ' + key_words
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.title(figure_title)
plt.axis("off")
plt.show()

Create two more wordclouds -- one for tweets without any retweets and another for tweets that were retweeted.

In [None]:
# One space-separated string per subset: never-retweeted tweets and retweeted tweets.
one_long_string_no_retweet = ' '.join(df_no_retweet['cleanTweetText'].tolist())
one_long_string_some_retweet = ' '.join(df_some_retweet['cleanTweetText'].tolist())

In [None]:
# Remove the key words and their variants from both strings, matching whole
# words only. A plain substring replace of 'star' / 'war' would also cut
# pieces out of unrelated words (e.g. "start" -> "t", "award" -> "ad").
key_word_variants = re.compile(r"\b(?:starwars|stars?|wars?)\b")
one_long_string_no_retweet = key_word_variants.sub('', one_long_string_no_retweet)
one_long_string_some_retweet = key_word_variants.sub('', one_long_string_some_retweet)

Are there any differences in the frequently used words in the non-retweeted and retweeted tweets?

In [None]:
# Build both clouds with the same random_state, then draw them side by side.
wordcloud_no_retweet = WordCloud(random_state=201).generate(one_long_string_no_retweet)
wordcloud_some_retweet = WordCloud(random_state=201).generate(one_long_string_some_retweet)

panels = [
    (wordcloud_no_retweet, 'Non-retweeted tweets with the key words: ' + key_words),
    (wordcloud_some_retweet, 'Retweeted tweets with the key words: ' + key_words),
]

plt.figure(figsize=(15, 15))
for position, (cloud, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)  # 1 row, 2 columns; `position` selects the subplot.
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(panel_title)

plt.show()

# Create a bag of words

Create a corpus (a list of all your documents). Find all the one-word phrases (unigrams) and two-word phrases (bigrams). We could keep going higher, to find all n-word phrases (ngrams). Below we create a "bag of words" for the top 30 phrases (unigrams or bigrams in this case). A bag of words (or document-term matrix) is a data frame of phrase counts. Each row is a document (or tweet in this case). The columns correspond to a phrase in any of the documents. An entry in the data frame is a count of the times the phrase appears in the document.

In [None]:
# Document-term matrix for all tweets: one row per tweet, one column for each
# of the 30 retained unigrams/bigrams.
corpus = list(df['cleanTweetText'])
corpus_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=30)
bag_of_words = corpus_vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and later removed in
# favour of get_feature_names_out(); fall back to the old name on older installs.
try:
    corpus_feature_names = corpus_vectorizer.get_feature_names_out()
except AttributeError:
    corpus_feature_names = corpus_vectorizer.get_feature_names()
bag_of_words_df = pd.DataFrame(bag_of_words.toarray(), columns=corpus_feature_names)
bag_of_words_df

In [None]:
# Average number of occurrences of each phrase per tweet, as a one-column table.
bag_of_words_df.mean(axis=0).to_frame('Avg count (all tweets)')

Create two bags of words -- one for tweets with no retweets and another for tweets with some retweets. Are the frequently used words different?

In [None]:
# Document-term matrix for the tweets that were never retweeted
# (20 retained unigrams/bigrams).
corpus_no_retweet = list(df_no_retweet['cleanTweetText'])
corpus_no_retweet_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20)
bag_of_words_no_retweet = corpus_no_retweet_vectorizer.fit_transform(corpus_no_retweet)
# get_feature_names() was deprecated in scikit-learn 1.0 and later removed in
# favour of get_feature_names_out(); fall back to the old name on older installs.
try:
    no_retweet_feature_names = corpus_no_retweet_vectorizer.get_feature_names_out()
except AttributeError:
    no_retweet_feature_names = corpus_no_retweet_vectorizer.get_feature_names()
bag_of_words_no_retweet_df = pd.DataFrame(bag_of_words_no_retweet.toarray(), columns=no_retweet_feature_names)
bag_of_words_no_retweet_df

In [None]:
# Document-term matrix for the tweets that were retweeted at least once
# (20 retained unigrams/bigrams).
corpus_some_retweet = list(df_some_retweet['cleanTweetText'])
corpus_some_retweet_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20)
bag_of_words_some_retweet = corpus_some_retweet_vectorizer.fit_transform(corpus_some_retweet)
# get_feature_names() was deprecated in scikit-learn 1.0 and later removed in
# favour of get_feature_names_out(); fall back to the old name on older installs.
try:
    some_retweet_feature_names = corpus_some_retweet_vectorizer.get_feature_names_out()
except AttributeError:
    some_retweet_feature_names = corpus_some_retweet_vectorizer.get_feature_names()
bag_of_words_some_retweet_df = pd.DataFrame(bag_of_words_some_retweet.toarray(), columns=some_retweet_feature_names)
bag_of_words_some_retweet_df

In [None]:
# Per-phrase average counts for each subset, each as a one-column table
# indexed by phrase.
freq_words_no_retweet = bag_of_words_no_retweet_df.mean(axis=0).to_frame('Avg count (no retweet)')
freq_words_some_retweet = bag_of_words_some_retweet_df.mean(axis=0).to_frame('Avg count (some retweet)')

In [None]:
# Outer join on the phrase index: a phrase that appears in only one of the two
# tables is kept, with NaN in the other table's column.
freq_words_no_retweet.join(freq_words_some_retweet, how='outer')

# Sentiment dictionary

Download hedonometer's sentiment dictionary from http://hedonometer.org/index.html.

In [None]:
import json
import urllib.request

# Download the hedonometer word list and parse the JSON response.
url = 'http://hedonometer.org/api/v1/words/?format=json'
# The response is used as a context manager so the connection is closed even
# if reading fails; the timeout keeps the cell from hanging indefinitely when
# the server does not answer.
with urllib.request.urlopen(url, timeout=30) as response:
    data = response.read().decode('utf-8')
loaded_json = json.loads(data)
loaded_json

In [None]:
# Keep only the value stored under the 'objects' key of the response, then display it.
happ_dict = loaded_json['objects']
happ_dict

In [None]:
# json_normalize has lived in the top-level pandas namespace since pandas 1.0,
# and importing it from pandas.io.json is deprecated. Prefer the new location
# and fall back to the old one on older installs.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Flatten the word records into a data frame and show the first rows.
happ_df = json_normalize(happ_dict)
happ_df.head()

In [None]:
# Show the last rows of the sentiment table.
happ_df.tail()

In [None]:
# Keep only the two columns used later: the word and its happiness score.
word_happs_df = happ_df[['word', 'happs']]

In [None]:
# (number of words in the sentiment dictionary, number of columns).
word_happs_df.shape

# Feature engineering

Create a dummy variable indicating whether a tweet contains a frequently used word, such as 'resistance'. Also, create a count of the number of handles in each tweet. In addition, calculate each tweet's happiness score (a sum of the words' happiness scores).

In [None]:
# Dropping duplicate tweets left gaps in the index labels; renumber the rows
# 0..n-1 (discarding the old labels) so later label-based lookups and
# list-to-column assignments line up.
df = df.reset_index(drop=True)  # Reset the index of the data frame.

In [None]:
# Build three per-tweet features:
#   contains_resistance: 1 if the word 'resistance' occurs in the tweet, else 0
#   handle_count:        number of '@' characters in the cleaned tweet
#   happ_score:          sum of the happiness scores of the tweet's words that
#                        appear in word_happs_df
# NOTE: despite its name, force_dummy flags the word 'resistance'.
force_dummy = []
handle_count = []
happs_list = []
# Iterate over the column itself rather than df.loc[i] for i in range(len(df)):
# the result no longer depends on the index labels being exactly 0..n-1, and no
# full row is materialised just to read one cell.
for tweet in df['cleanTweetText']:
    words = tweet.split()
    force_dummy.append(int('resistance' in words))
    handle_count.append(tweet.count('@'))
    # Single-column frame of the tweet's words. Building it from a dict gives
    # an object (string) column even when the tweet has no words.
    tweet_df = pd.DataFrame({'word': words})
    # Inner merge keeps only the words that have a happiness score.
    tweet_happs_df = pd.merge(tweet_df, word_happs_df, on='word')
    happs_list.append(tweet_happs_df['happs'].sum())
df['contains_resistance'] = force_dummy
df['handle_count'] = handle_count
df['happ_score'] = happs_list

In [None]:
# Show the first ten rows, now including the engineered features.
df.head(n=10)

# Fit a regression tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Predictors (independent variables) and target (retweet count).
ind_variables_selected = ['followersCount', 'contains_resistance', 'handle_count', 'happ_score']
X_train, y_train = df[ind_variables_selected], df['retweetCount']

# Fit the tree, then predict on the same rows it was trained on (in-sample).
rt = DecisionTreeRegressor(min_samples_split=2, max_depth=20, random_state=201)
rt_model = rt.fit(X_train, y_train)
rt_pred = rt_model.predict(X_train)

Look at the variable importances.

In [None]:
# Tabulate the fitted tree's feature importances, one row per predictor.
pd.DataFrame(rt_model.feature_importances_, index=ind_variables_selected)

In [None]:
# In-sample predictions (horizontal axis) against actual retweet counts (vertical axis).
plt.scatter(rt_pred, y_train)

Visualize your regression tree.

In [None]:
from io import StringIO
from sklearn.tree import export_graphviz
import pydotplus
from IPython.display import Image

In [None]:
# Create a string buffer to receive the tree description.
dot_data = StringIO()
# Export the decision tree to the string buffer "dot_data" in Graphviz's Dot format.
export_graphviz(rt_model, out_file = dot_data, feature_names = ind_variables_selected, rounded = True,
                proportion = True, rotate = 1, filled = True, node_ids=True)
# Create a Python interface to Graphviz's Dot language.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Save your regression tree. Open the PDF file from the folder location of this code.
graph.write_pdf("regressionTree.pdf")
# Display the tree inline. This must be the last expression in the cell:
# a notebook only renders the value of a cell's final expression.
Image(graph.create_png())