# Import packages

In [None]:
from wordcloud import WordCloud
import nltk

In [None]:
# Fetch the two NLTK resources this notebook relies on: the 'stopwords' lists
# (read via nltk.corpus.stopwords) and 'wordnet' (presumably the data behind
# WordNetLemmatizer -- confirm against the NLTK docs for your version).
nltk.download('stopwords')
nltk.download('wordnet')

Import the stopword corpus and the WordNet lemmatizer. We will use them to clean our tweets.

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

Import other packages.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import time

# Load Twitter data on 'entrepreneurship'

In [None]:
# Read the training tweets from the CSV in the working directory.
# The file is decoded as latin-1 rather than pandas' default encoding.
df = pd.read_csv('twitter_train.csv', encoding='latin-1')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# Clean the tweets

Convert the text in the tweets to lower case. Remove the stopwords (such as 'a', 'the', and 'to') from the tweets. Keep each word's lemma, e.g., the verbs gone, going, and went have the lemma go. The lemmatizer needs to know the word's part of speech (pos). Use the regular expressions package re to remove http links, RT, numbers and punctuation (except # and @), and extra whitespace.

Below are three examples of lemmatized words.

In [None]:
# Lemmatizer instance used by the example cells that follow.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
# Example 1: a plural noun, lemmatized with pos='n' (noun).
wordnet_lemmatizer.lemmatize('entrepreneurs', pos='n')

In [None]:
# Example 2: an irregular past-tense verb, lemmatized with pos='v' (verb).
wordnet_lemmatizer.lemmatize('bought', pos='v')

In [None]:
# Example 3: an -ing verb form, lemmatized with pos='v' (verb).
wordnet_lemmatizer.lemmatize('starting', pos='v')

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
stopset = list(set(stopwords.words('english')))
# Frozen copy for O(1) membership tests; `stopset` itself stays a list as before.
stopword_lookup = frozenset(stopset)
# Search keywords to strip from every tweet, longest first: removing
# 'entrepreneur' before 'entrepreneurial' would leave the stray fragment 'ial'.
search_keywords = ('entrepreneurship', 'entrepreneurial', 'entrepreneur')


def _clean_tweet(tweet):
    """Return the cleaned text of a single tweet.

    Steps, in order: drop the retweet marker RT, lower-case, drop English
    stopwords, lemmatize every word as a noun and then as a verb, drop the
    word "amp", drop http links, replace everything except letters, # and @
    with spaces, remove the search keywords, and collapse whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example a missing value)
        is treated as an empty tweet so row alignment with `df` is preserved.

    Returns
    -------
    str
        The cleaned tweet, words separated by single spaces.
    """
    if not isinstance(tweet, str):
        return ''
    # \b keeps the match to the standalone marker; a bare "RT" pattern would
    # also cut the letters out of upper-case words such as "START".
    cleaned_tweet = re.sub(r"\bRT\b", " ", tweet)
    cleaned_tweet = cleaned_tweet.lower()
    words = [word for word in cleaned_tweet.split() if word not in stopword_lookup]
    words = [wordnet_lemmatizer.lemmatize(word, pos='n') for word in words]  # Noun lemmas.
    words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]  # Verb lemmas.
    cleaned_tweet = ' '.join(words)
    # Only the standalone word "amp" is removed (as left by "&amp;");
    # words that merely contain it, such as "campaign", stay intact.
    cleaned_tweet = re.sub(r"\bamp\b", " ", cleaned_tweet)
    cleaned_tweet = re.sub(r"(http\S+)", " ", cleaned_tweet)  # Remove http links.
    cleaned_tweet = re.sub("[^a-zA-Z#@]", " ", cleaned_tweet)  # Keep letters, # and @ only.
    for keyword in search_keywords:
        cleaned_tweet = cleaned_tweet.replace(keyword, '')
    # Collapse whitespace last so the removals above cannot leave double spaces.
    return ' '.join(cleaned_tweet.split())


# One cleaned string per row of df, in row order.
clean_tweets_text = [_clean_tweet(tweet) for tweet in df['Text']]

In [None]:
# Store the cleaned text next to the raw tweets (the list is in row order),
# then preview the frame with the new column.
df['cleanTweetText'] = clean_tweets_text
df.head()

Look at the counts for each retweet count level (the histogram below is limited to tweets with fewer than 10 retweets). Are most tweets retweeted or not?

In [None]:
# Keep only tweets with fewer than 10 retweets, then plot how many tweets
# fall at each retweet count.
fewer_than_ten = df['RT_Count_in_TimeWindow'] < 10
new_df = df[fewer_than_ten]
new_df['RT_Count_in_TimeWindow'].hist(bins=50)

Create two subsetted data frames -- one for tweets without any retweets and another for tweets that were retweeted.

In [None]:
# Partition the tweets on whether the retweet count is zero or positive.
retweet_counts = df['RT_Count_in_TimeWindow']
df_no_retweet = df[retweet_counts == 0]
df_some_retweet = df[retweet_counts > 0]

In [None]:
# (rows, columns) of the retweeted subset.
df_some_retweet.shape

# Create some wordclouds

In [None]:
# Concatenate every cleaned tweet into one space-separated string for WordCloud.
one_long_string =  ' '.join(df['cleanTweetText'])

In [None]:
# Seed the layout so the figure is identical on every run; 201 is the same
# seed the retweet comparison clouds use.
wordcloud = WordCloud(random_state=201).generate(one_long_string)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis('off')  # A word cloud has no meaningful axes.
plt.title('All tweets')
plt.show()

In [None]:
# Same concatenation as for all tweets, done separately for each subset.
one_long_string_no_retweet =  ' '.join(df_no_retweet['cleanTweetText'])
one_long_string_some_retweet =  ' '.join(df_some_retweet['cleanTweetText'])

In [None]:
# Number of characters of cleaned text in the non-retweeted subset.
len(one_long_string_no_retweet)

In [None]:
# Number of characters of cleaned text in the retweeted subset.
len(one_long_string_some_retweet)

In [None]:
# Drop the standalone word "amp" only. A plain str.replace('amp', '') would
# also cut it out of longer words (e.g. "campaign" -> "caign").
one_long_string_no_retweet = re.sub(r'\bamp\b', '', one_long_string_no_retweet)
one_long_string_some_retweet = re.sub(r'\bamp\b', '', one_long_string_some_retweet)

In [None]:
# Build both clouds with the same seed so their layouts are reproducible.
wordcloud_no_retweet = WordCloud(random_state=201).generate(one_long_string_no_retweet)
wordcloud_some_retweet = WordCloud(random_state=201).generate(one_long_string_some_retweet)

plt.figure(figsize=(15, 15))

# (subplot code, cloud, title): 121 and 122 are the first and second cell of
# a grid with 1 row and 2 columns.
cloud_panels = [
    (121, wordcloud_no_retweet, 'Non-retweeted tweets'),
    (122, wordcloud_some_retweet, 'Retweeted tweets'),
]
for subplot_code, cloud_image, panel_title in cloud_panels:
    plt.subplot(subplot_code)
    plt.imshow(cloud_image)
    plt.axis('off')
    plt.title(panel_title)

plt.show()

# Create a bag of words

Create a corpus (a list of all your documents). Find all the one-word phrases (unigrams) and two-word phrases (bigrams). We could keep going higher, to find all n-word phrases (ngrams). Below we create a "bag of words" for the top 30 phrases (unigrams or bigrams in this case). A bag of words (or document-term matrix) is a data frame of phrase counts. Each row is a document (or tweet in this case). Each column corresponds to a phrase that appears in at least one of the documents. An entry in the data frame is a count of the times the phrase appears in the document.

In [None]:
# Document-term matrix over all tweets: unigrams and bigrams, restricted to
# the 30 most frequent phrases.
corpus = list(df['cleanTweetText'])
corpus_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=30)
bag_of_words = corpus_vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists from 1.0, so fall back only on older versions.
if hasattr(corpus_vectorizer, 'get_feature_names_out'):
    corpus_feature_names = corpus_vectorizer.get_feature_names_out()
else:
    corpus_feature_names = corpus_vectorizer.get_feature_names()
bag_of_words_df = pd.DataFrame(bag_of_words.toarray(), columns=corpus_feature_names)
bag_of_words_df

In [None]:
# Average number of occurrences per tweet for each phrase, as a one-column table.
bag_of_words_df.mean(axis=0).to_frame(name='Avg count (all tweets)')

Create two bags of words -- one for tweets with no retweets and another for tweets with some retweets. Are the frequently used words different?

In [None]:
# Document-term matrix for tweets that were never retweeted (top 20 phrases).
corpus_no_retweet = list(df_no_retweet['cleanTweetText'])
corpus_no_retweet_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20)
bag_of_words_no_retweet = corpus_no_retweet_vectorizer.fit_transform(corpus_no_retweet)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists from 1.0, so fall back only on older versions.
if hasattr(corpus_no_retweet_vectorizer, 'get_feature_names_out'):
    no_retweet_feature_names = corpus_no_retweet_vectorizer.get_feature_names_out()
else:
    no_retweet_feature_names = corpus_no_retweet_vectorizer.get_feature_names()
bag_of_words_no_retweet_df = pd.DataFrame(bag_of_words_no_retweet.toarray(), columns=no_retweet_feature_names)

In [None]:
# Document-term matrix for tweets that were retweeted at least once (top 20 phrases).
corpus_some_retweet = list(df_some_retweet['cleanTweetText'])
corpus_some_retweet_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=20)
bag_of_words_some_retweet = corpus_some_retweet_vectorizer.fit_transform(corpus_some_retweet)
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() exists from 1.0, so fall back only on older versions.
if hasattr(corpus_some_retweet_vectorizer, 'get_feature_names_out'):
    some_retweet_feature_names = corpus_some_retweet_vectorizer.get_feature_names_out()
else:
    some_retweet_feature_names = corpus_some_retweet_vectorizer.get_feature_names()
bag_of_words_some_retweet_df = pd.DataFrame(bag_of_words_some_retweet.toarray(), columns=some_retweet_feature_names)

In [None]:
# Per-phrase average count within each subset, each as a one-column table
# indexed by phrase.
freq_words_no_retweet = (
    bag_of_words_no_retweet_df
    .mean(axis=0)
    .to_frame(name='Avg count (no retweet)')
)
freq_words_some_retweet = (
    bag_of_words_some_retweet_df
    .mean(axis=0)
    .to_frame(name='Avg count (some retweet)')
)

In [None]:
# Outer join on the phrase index: a phrase that is in only one subset's
# vocabulary still gets a row, with NaN in the other column.
freq_words_no_retweet.join(freq_words_some_retweet, how='outer')

# Load a sentiment dictionary

Download hedonometer's sentiment dictionary from http://hedonometer.org/index.html.

In [None]:
import json
import urllib.request

url='http://hedonometer.org/api/v1/words/?format=json'
# The context manager closes the HTTP response even if reading fails, and the
# timeout stops the cell from hanging forever if the server does not answer.
with urllib.request.urlopen(url, timeout=30) as response:
    data = response.read().decode('utf-8')
loaded_json = json.loads(data)
happ_dict = loaded_json['objects']  # The word records are under the 'objects' key.
# json_normalize has been a top-level pandas function since pandas 1.0; the
# pandas.io.json import path is the deprecated spelling, kept as a fallback.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize
happ_df = json_normalize(happ_dict)

In [None]:
# First rows of the downloaded sentiment dictionary.
happ_df.head()

In [None]:
# Last rows of the downloaded sentiment dictionary.
happ_df.tail()

In [None]:
# Keep only the two columns used for scoring: the word and its 'happs' value
# (presumably the happiness score -- confirm against the hedonometer docs).
word_happs_df = happ_df[['word', 'happs']]

In [None]:
# (rows, columns) of the word/score table.
word_happs_df.shape

# Do some feature engineering

Create a dummy variable for when the tweet contains a frequently used word, such as 'smallbiz'. Also, create a count of the number of handles in each tweet. In addition, calculate each tweet's happiness score (a sum of the words' happiness scores). Include other features that you think might be important.

In [None]:
# Give the frame a fresh default integer index; drop=True discards the old
# index instead of keeping it as a column.
df = df.reset_index(drop=True)  # Reset the index of the data frame.

In [None]:
%%time
smallbiz_dummy = []
handle_count = []
happs_list = []
for i in range(0, len(df)):
    tweet = df.loc[i]['cleanTweetText']
    smallbiz_dummy.append(int('smallbiz' in set(tweet.split())))
    handle_count.append(tweet.count('@')) 
    tweet_df = pd.DataFrame(pd.Series(tweet.split()), columns=['word'])
    tweet_happs_df = pd.merge(tweet_df, word_happs_df, on='word')
    happs_list.append(tweet_happs_df['happs'].sum())
df['contains_smallbiz'] = smallbiz_dummy
df['handle_count'] = handle_count
df['happ_score'] = happs_list

In [None]:
# Preview the frame with the engineered feature columns added.
df.head()

# Find the important features

Fit a regression tree using log(1 + RT_Count_in_TimeWindow) as the dependent variable; adding 1 keeps the transformation defined for tweets with zero retweets. Thus, features will be important on the log scale (or, approximately, on a percentage basis).

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Predictors: follower count plus the three engineered features.
ind_variables_selected = [
    'Followers_Count',
    'contains_smallbiz',
    'handle_count',
    'happ_score',
]
X_train = df[ind_variables_selected]
# Target: log of (retweet count + 1), so zero-retweet tweets map to 0.
y_train = np.log(df['RT_Count_in_TimeWindow'] + 1)

# A node needs at least 20 samples before it may be split; the seed is fixed.
rt = DecisionTreeRegressor(random_state=201, min_samples_split=20)
rt_model = rt.fit(X_train, y_train)

Which features are the most important?

In [None]:
# One importance value per predictor, labelled with the predictor names in
# the order they were passed to the tree.
pd.DataFrame(rt_model.feature_importances_, index=ind_variables_selected)