# Importing Packages

In [None]:
from TwitterAPI import TwitterAPI
import pandas as pd
import json
import time
from textblob import TextBlob


# Twitter API

In [None]:
import os

# Twitter credentials are read from environment variables so that secrets are
# never stored in (or shared together with) the notebook. Export the four
# variables below before starting the kernel; a missing one raises KeyError.
# NOTE: any key that has ever been saved in plain text in a notebook or a
# repository should be treated as leaked and regenerated in the Twitter
# developer portal.
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']

access_token_key = os.environ['TWITTER_ACCESS_TOKEN_KEY']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']

# Shared client object used by the request helpers defined further down.
api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)

# Request data from Twitter API

In [None]:
def get_df_from_tweets(r):
    '''
    Converts the response from TwitterAPI into a Pandas dataframe.
            Parameters:
                    r (TwitterResponse): The response object after calling the request method from the TwitterAPI.
                                         Only its json() method is used; the payload must contain a 'results' list.
            Returns:
                    df (DataFrame): A pandas dataframe containing the data, one row per entry in 'results'.
            Raises:
                    KeyError: If the JSON payload has no 'results' key.
    '''
    from io import StringIO

    data_dict = r.json()['results']
    # Passing a literal JSON string to read_json is deprecated (and rejected by
    # newer pandas releases), so the serialized payload is wrapped in a
    # file-like buffer. read_json is kept so its usual type conversion of the
    # columns still applies.
    df = pd.read_json(StringIO(json.dumps(data_dict)), orient='records')
    return df


def get_df_from_search(params, product='30day', label='justintodata'):
    '''
    Sends the query to the Twitter Premium Search API.
    The request is sent through the module-level `api` object. The HTTP status
    code and the remaining quota are printed after every request.
            Parameters:
                    params (dict): A dictionary of parameters for the Twitter search API.
                                   See this page for the possible query parameters: https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/guides/premium-operators
                    product (str): The search product segment of the endpoint. Defaults to '30day'.
                    label (str): The dev environment label segment of the endpoint. Defaults to 'justintodata'.
            Returns:
                    df (DataFrame): A pandas dataframe containing the data.
                    next_ (str): The next token to get the next page of results from the Twitter search API,
                                 or None when the response carries no 'next' key.
                                 Read about pagination here: https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/api-reference/premium-search
    '''
    endpoint = 'tweets/search/{}/:{}'.format(product, label)
    r = api.request(endpoint, params)
    print(r.status_code)
    print(r.get_quota())
    # .get() is used because the 'next' key is optional in the payload.
    next_ = r.json().get('next')
    return get_df_from_tweets(r), next_


def get_data(search_term, api, max_queries=5):
    '''
    Downloads one or more result pages for a search term and stacks them.
    At least one request is always sent; further pages are requested while a
    'next' token is returned and the page budget is not used up.
            Parameters:
                    search_term (str): The string to search in the tweet.
                    api (TwitterAPI): The TwitterAPI object. Note that it is not used in this function:
                                      get_df_from_search sends its requests through the module-level `api`.
                    max_queries: The maximum number of queries (pages) to retrieve.
            Returns:
                    (DataFrame): A pandas dataframe containing all retrieved pages, with a fresh 0..n-1 index.
    '''
    pause_seconds = 2
    pages = []

    # 100 is the highest maxResults value allowed for our twitter account.
    query_params = {'query': search_term, 'maxResults': 100}
    pages_left = max_queries

    while True:
        page, next_token = get_df_from_search(query_params)
        pages.append(page)
        pages_left -= 1
        # pause after every request, otherwise Twitter complains about too many requests.
        time.sleep(pause_seconds)

        if pages_left <= 0 or not next_token:
            break
        # the token returned by the previous page selects the following page.
        query_params['next'] = next_token

    return pd.concat(pages).reset_index(drop=True)

In [None]:
# Get the data for keyword.
# Requests at most 30 result pages for the search term '@putin' and keeps them in one dataframe.
df_twitterkeyword = get_data('@putin', api, max_queries=30)

# Process the data and Apply the TextBlob model

In [None]:
# Each value of the 'user' column is a dictionary. This function spreads four of its
# entries (id, name, screen_name, location) over 4 separate, positionally numbered columns.
def flatten_user_info(df):
    '''
    Builds a dataframe with one row per tweet holding the user's id, name,
    screen name and location (in that order). Keys missing from a user
    dictionary yield a missing value. The index of `df` is preserved.
    '''
    wanted_keys = ('id', 'name', 'screen_name', 'location')
    rows = [tuple(user.get(key) for key in wanted_keys) for user in df['user']]
    return pd.DataFrame(rows, index=df.index)


def get_full_text(df):
    '''
    Extracts the 'full_text' entry of the 'extended_tweet' column.
    Rows whose 'extended_tweet' value is not a dictionary (or whose dictionary
    has no 'full_text' key) yield a missing value.
    '''
    return df['extended_tweet'].map(
        lambda ext: ext.get('full_text') if isinstance(ext, dict) else None
    )

def get_sentiment(df, txt_col):
    '''
    Scores every entry of the column `txt_col` with TextBlob and returns the
    polarity values as a series aligned with `df`.
    '''
    def polarity_of(text):
        return TextBlob(text).sentiment.polarity

    return df[txt_col].map(polarity_of)


def prepare_data(df):
    '''
    Cleans the raw tweets and adds the sentiment score.
    Works on a filtered copy; the dataframe passed in is not modified.
            Parameters:
                    df (DataFrame): The raw tweets. The columns 'text', 'user' and 'extended_tweet' are read.
            Returns:
                    df_filtered (DataFrame): The tweets without retweets, with the added columns
                                             'user_id', 'username', 'user_screen_name', 'user_location',
                                             'full_text' and 'textblob_sentiment'.
    '''
    # filter out retweets. We're only interested in the originals.
    # Retweets are rendered as 'RT @user: ...', so the whole 'RT @' prefix is matched;
    # a bare 'RT' would also throw away originals that merely begin with those letters (e.g. 'RTX ...').
    msk = (~df['text'].str.startswith('RT @'))
    df_filtered = df[msk].copy()

    # get the user information in separate columns.
    df_filtered[['user_id', 'username', 'user_screen_name', 'user_location']] = flatten_user_info(df_filtered)

    # get the full_text if it exists. Otherwise fill it in with the text.
    df_filtered['full_text'] = get_full_text(df_filtered)
    msk = df_filtered['full_text'].isnull()
    df_filtered.loc[msk, 'full_text'] = df_filtered.loc[msk, 'text']

    # get the sentiment of the full_text.
    df_filtered['textblob_sentiment'] = get_sentiment(df_filtered, 'full_text')
    return df_filtered
    

In [None]:
# Replace the raw download with the cleaned version (retweets removed, user info
# flattened, full text resolved, TextBlob polarity added).
df_twitterkeyword = prepare_data(df_twitterkeyword)

# Summary of the columns of the prepared dataframe.
df_twitterkeyword.info()

# Preview of the first rows.
df_twitterkeyword.head()

# Label a sample manually

In [None]:
# Write a random sample of 100 tweets to disk so that it can be labelled by hand.
df_twitterkeyword.sample(n=100).to_csv('twitter-data.csv', index=False)

# Load the labelled data. This file is not written by the notebook.
# NOTE(review): presumably it is 'twitter-data.csv' with a manually added 'label'
# column, saved under a new name — confirm.
df_labelled = pd.read_csv('twitter-data-labeled.csv')

# Preview of the first 10 labelled rows.
df_labelled.head(10)

In [None]:
from sklearn.preprocessing import label_binarize

# One indicator column per class of 'label', in the order -1, 0, 1.
# `classes` has to be passed by keyword: it is a keyword-only argument in current
# scikit-learn releases, where the positional form raises a TypeError.
df_labelled[['is_neg', 'is_neutral', 'is_pos']] = pd.DataFrame(label_binarize(df_labelled['label'], classes=[-1, 0, 1]), index=df_labelled.index)

df_labelled.head(10)

# Class balance of each indicator column (missing values are counted as well).
df_labelled['is_neg'].value_counts(dropna=False)

df_labelled['is_pos'].value_counts(dropna=False)

df_labelled['is_neutral'].value_counts(dropna=False)

# Evaluate the sentiment analysis results

In [None]:
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt

In [None]:
# Adapted from the scikit-learn ROC example:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
def plot_roc_curve(fpr, tpr, roc_auc):
    '''
    Draws a ROC curve together with the dashed diagonal reference line and shows the figure.
            Parameters:
                    fpr: The false positive rates (x values of the curve).
                    tpr: The true positive rates (y values of the curve).
                    roc_auc: The area under the curve, shown in the legend with two decimals.
    '''
    line_width = 2

    plt.figure()
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc
    plt.plot(fpr, tpr, color='darkorange', lw=line_width, label=curve_label)
    # diagonal from (0, 0) to (1, 1) as visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # axis ranges; the y axis gets a little headroom above 1.
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

## Negative Tweets

In [None]:
# ROC curve for detecting negative tweets. The polarity is negated so that a more
# negative polarity becomes a higher score for the positive label (is_neg == 1).
neg_fpr, neg_tpr, neg_thresholds = roc_curve(df_labelled['is_neg'], -df_labelled['textblob_sentiment'], pos_label=1)
neg_roc_auc = auc(neg_fpr, neg_tpr)

plot_roc_curve(neg_fpr, neg_tpr, neg_roc_auc)

In [None]:
# Print the accuracy of the "negative" prediction for every threshold returned by roc_curve.
# The thresholds refer to the negated polarity, so they are negated again when printed
# to show them on the original polarity scale.
neg_score = -df_labelled['textblob_sentiment']  # same for every threshold, so computed once.
for t in neg_thresholds:
    neg_pred = neg_score > t
    acc = accuracy_score(df_labelled['is_neg'], neg_pred)
    print('threshold: {}, accuracy: {}'.format(-t, acc))

## Positive Tweets

In [None]:
# ROC curve for detecting positive tweets. The polarity itself is used as the score
# for the positive label (is_pos == 1).
pos_fpr, pos_tpr, pos_thresholds = roc_curve(df_labelled['is_pos'], df_labelled['textblob_sentiment'], pos_label=1)
pos_roc_auc = auc(pos_fpr, pos_tpr)

plot_roc_curve(pos_fpr, pos_tpr, pos_roc_auc)

In [None]:
# Print the accuracy of the "positive" prediction for every threshold returned by roc_curve.
pos_score = df_labelled['textblob_sentiment']  # same for every threshold, so looked up once.
for t in pos_thresholds:
    pos_pred = pos_score > t
    acc = accuracy_score(df_labelled['is_pos'], pos_pred)
    print('threshold: {}, accuracy: {}'.format(t, acc))

In [None]:
# Turn the polarity score into one of three classes. Because right=False the intervals
# are closed on the left: [-2, -0.05) -> negative, [-0.05, 0.2857) -> neutral, [0.2857, 2) -> positive.
# NOTE(review): the cut-offs -0.05 and 0.2857 were presumably picked from the
# threshold/accuracy listings of the labelled sample — confirm.
df_twitterkeyword['predicted_sentiment'] = pd.cut(df_twitterkeyword['textblob_sentiment'],
                                             bins=[-2, -0.05, 0.2857, 2], 
                                             labels=['negative', 'neutral', 'positive'], 
                                             right=False)

In [None]:
# Spot check: 10 random tweets with their polarity score and predicted class.
df_twitterkeyword[['full_text', 'textblob_sentiment', 'predicted_sentiment']].sample(n=10)

# Explore the results

In [None]:
# Round the creation time of every tweet to the nearest full hour.
# The lower-case alias 'h' is used because the upper-case 'H' is deprecated in recent pandas releases.
df_twitterkeyword['created_at_hour'] = df_twitterkeyword['created_at'].dt.round('h')

# Number of tweets per hour and predicted sentiment.
aggregation = {'cnt': ('id', 'count')}
# 'predicted_sentiment' is categorical (it comes from pd.cut). observed=False spells out the
# behaviour relied on so far: every class is listed for every hour, also with a count of 0.
# Being explicit also avoids the FutureWarning about the changing default.
df_sentiment_by_time = df_twitterkeyword.groupby(['created_at_hour', 'predicted_sentiment'], observed=False).agg(**aggregation).reset_index()

df_sentiment_by_time

In [None]:
import plotly.express as px
# Line chart of the hourly tweet count, one line per predicted sentiment class.
fig = px.line(df_sentiment_by_time, x="created_at_hour", y="cnt", color="predicted_sentiment")
fig.show()

## WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
# stopwords do not appear in the wordcloud.
stopwords = STOPWORDS.copy()
# add url fragments and the search term itself, which would otherwise dominate the cloud.
stopwords.update(['http', 'https', 'co', 'putin'])

# make all the text lowercase and combine everything together.
# The tweets live in df_twitterkeyword; no dataframe called df_starbucks is defined in this notebook.
all_txt = ' '.join(txt.lower() for txt in df_twitterkeyword['full_text'])

# create and plot the wordcloud.
wordcloud = WordCloud(stopwords=stopwords, background_color="white", width=800, height=600).generate(all_txt)
plt.figure(figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()