# 1. Import Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import nx_altair as nxa
import tensorflow as tf
import keras as k
import pickle
import itertools

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import gc
import time
from IPython.display import clear_output
import gif

%matplotlib inline

# 2. Read in Data

In [2]:
df = pd.read_csv('../../GenerativeAI tweets.csv').iloc[:,1:]
df['Datetime'] = pd.to_datetime(df['Datetime'])
df['Date'] = pd.to_datetime(df['Datetime'].apply(lambda x:x.date()))
df

Unnamed: 0,Datetime,Tweet Id,Text,Username,Date
0,2023-04-19 21:27:19+00:00,1648800467206672384,From Studio Gangster to Synthetic Gangster 🎤.....,resembleai,2023-04-19
1,2023-04-19 21:27:09+00:00,1648800425540476929,Took me some time to find this. I build this #...,devaanparbhoo,2023-04-19
2,2023-04-19 21:26:57+00:00,1648800376479715328,Mind blowing next wave #generativeai platform...,timreha,2023-04-19
3,2023-04-19 21:26:49+00:00,1648800341193027584,Open Source Generative AI Image Specialist Sta...,VirtReview,2023-04-19
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle,2023-04-19
...,...,...,...,...,...
56216,2022-04-24 16:40:01+00:00,1518268535276904448,"Understanding Generative AI, Its Impacts and L...",analyticsinme,2022-04-24
56217,2022-04-23 07:23:24+00:00,1517766068592381952,Y ya puedes empezar a crear #arte con @thegeni...,iia_es,2022-04-23
56218,2022-04-22 08:20:21+00:00,1517418013812830208,"NVIDIA researchers have developed GANverse3D, ...",VideoGenAI,2022-04-22
56219,2022-04-21 13:15:21+00:00,1517129866403008512,Tech Trend 2022: เทรนด์เทคโนโลยีสำหรับปี 2022 ...,sitthinuntp,2022-04-21


# --- Top Level Analyses: Tweet Quantity / Rate Over time and Tweet Sentiment Over Time ---

# 3. Tweet Quantity Over Time

Hypothesis: tweet volume related to generative ai has increased over time between April 2022 and April 2023

### Data Prep

In [3]:
daily_volume = df.groupby('Date').count()['Datetime']
daily_volume.name = 'Daily Volume'

avg_tweet_rate = np.cumsum(daily_volume) / np.arange(1,len(daily_volume) + 1)
avg_tweet_rate.name = 'Average Tweet Rate'

avg_tweet_rate_7 = daily_volume.rolling(window = 7).mean()
avg_tweet_rate_7.name = '7 Day Rolling Tweet Rate'

tweet_quantity = pd.concat([daily_volume,avg_tweet_rate, avg_tweet_rate_7], axis = 1)
tweet_quantity.reset_index(inplace = True)

tweet_quantity = round(tweet_quantity, 2)

final_tweet_quantity = pd.DataFrame()

for col in ['Daily Volume','Average Tweet Rate','7 Day Rolling Tweet Rate']:
    
    subset = tweet_quantity[['Date',col]].copy()
    subset.columns = ['Date','Tweet Volume']
    subset['Metric'] = col
    final_tweet_quantity = pd.concat([final_tweet_quantity,subset])

final_tweet_quantity

Unnamed: 0,Date,Tweet Volume,Metric
0,2022-04-21,2.00,Daily Volume
1,2022-04-22,1.00,Daily Volume
2,2022-04-23,1.00,Daily Volume
3,2022-04-24,1.00,Daily Volume
4,2022-04-25,7.00,Daily Volume
...,...,...,...
336,2023-04-15,654.57,7 Day Rolling Tweet Rate
337,2023-04-16,676.43,7 Day Rolling Tweet Rate
338,2023-04-17,715.00,7 Day Rolling Tweet Rate
339,2023-04-18,745.57,7 Day Rolling Tweet Rate


### Chart Config

In [4]:
selection = alt.selection_multi(fields=['Metric'], bind='legend')
selection2 = alt.selection_multi(fields=['Metric'], bind='legend')
opacity_value = 0.8

chart = alt.Chart(final_tweet_quantity,title = '#GenerativeAI Tweet Volume Over Time').mark_line().encode(
    x = alt.X('Date'),
    y = alt.Y('Tweet Volume',title = 'Tweet Volume'),
    color = alt.Color('Metric',scale=alt.Scale(
        domain=['Daily Volume', '7 Day Rolling Tweet Rate','Average Tweet Rate','ChatGPT Release','GPT 4 Release'],
        range=['lightblue', 'blue','orange','black','grey'])),
    tooltip = ['Date','Tweet Volume','Metric'],
    opacity = alt.condition(selection, alt.value(opacity_value), alt.value(0.1))
).interactive()

chart = chart.add_selection(selection)

products = pd.DataFrame({
  'Date': ['2022-11-30', '2023-03-14'],
  'Metric': ['ChatGPT Release','GPT 4 Release']
})

products['Date'] = pd.to_datetime(products['Date'])

chatGPT_release = alt.Chart(products).mark_rule().encode(
  x='Date:T',
  color=alt.Color('Metric', scale=alt.Scale(
        domain=['ChatGPT Release', 'GPT 4 Release'],
        range=['black', 'gray'])),
    opacity = alt.condition(selection2, alt.value(opacity_value), alt.value(0.1)),
    tooltip = ['Date',alt.Tooltip('Metric', title = 'Product Release')]
).interactive()

chatGPT_release = chatGPT_release.add_selection(selection2)

### Chart

In [5]:
chart + chatGPT_release

# 4. Sentiment Over Time
Hypothesis: Sentiment towards generative ai has improved over time, especially since the chatGPT release

### Data Prep

In [6]:
def scorer_nn1(ytrue, ypred):
    
    ytrue = ytrue.numpy()
    ypred = (ypred.numpy()).argmax(axis = 1)
    
    confuse = confusion_matrix(ytrue, ypred)
    
    score = (confuse.diagonal() / confuse.sum(axis = 1)).mean()
    
    if pd.isnull(score):
        score = 0
    return score

model = k.models.load_model('../sentiment_analysis/EmbeddingModels/TweetSentimentBinary1.h5', compile = True,
                   custom_objects = {'scorer_nn1':scorer_nn1})
tweet_vectors = pickle.load(open('../../Vectorize Tweets/genai_tweet_embeddings.pkl','rb'))
probs = model.predict(tweet_vectors)[:,1].flatten()

2023-07-09 21:18:18.210604: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [7]:
df['Sentiment Score'] = 2*probs - 1
df

Unnamed: 0,Datetime,Tweet Id,Text,Username,Date,Sentiment Score
0,2023-04-19 21:27:19+00:00,1648800467206672384,From Studio Gangster to Synthetic Gangster 🎤.....,resembleai,2023-04-19,0.251432
1,2023-04-19 21:27:09+00:00,1648800425540476929,Took me some time to find this. I build this #...,devaanparbhoo,2023-04-19,-0.358057
2,2023-04-19 21:26:57+00:00,1648800376479715328,Mind blowing next wave #generativeai platform...,timreha,2023-04-19,0.820966
3,2023-04-19 21:26:49+00:00,1648800341193027584,Open Source Generative AI Image Specialist Sta...,VirtReview,2023-04-19,0.907774
4,2023-04-19 21:25:00+00:00,1648799883934203905,Are you an #HR leader considering which future...,FrozeElle,2023-04-19,0.879874
...,...,...,...,...,...,...
56216,2022-04-24 16:40:01+00:00,1518268535276904448,"Understanding Generative AI, Its Impacts and L...",analyticsinme,2022-04-24,0.074315
56217,2022-04-23 07:23:24+00:00,1517766068592381952,Y ya puedes empezar a crear #arte con @thegeni...,iia_es,2022-04-23,0.437956
56218,2022-04-22 08:20:21+00:00,1517418013812830208,"NVIDIA researchers have developed GANverse3D, ...",VideoGenAI,2022-04-22,0.332284
56219,2022-04-21 13:15:21+00:00,1517129866403008512,Tech Trend 2022: เทรนด์เทคโนโลยีสำหรับปี 2022 ...,sitthinuntp,2022-04-21,0.743980


In [8]:
tweet_quantity = df.groupby('Date').count()['Datetime']
tweet_quantity.name = 'Tweet Quantity'

daily_sentiment = df.groupby('Date').sum()['Sentiment Score']
daily_sentiment.name = 'Raw Daily Sentiment'

daily_sentiment_scaled = daily_sentiment/tweet_quantity
daily_sentiment_scaled.name = 'Daily Sentiment Score'

avg_sentiment = np.cumsum(daily_sentiment)/np.cumsum(tweet_quantity)
avg_sentiment.name = 'Average Sentiment Over Time'

sentiment_df = round(pd.concat([daily_sentiment_scaled, avg_sentiment], axis = 1),2)
sentiment_df

Unnamed: 0_level_0,Daily Sentiment Score,Average Sentiment Over Time
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-04-21,0.79,0.79
2022-04-22,0.33,0.64
2022-04-23,0.44,0.59
2022-04-24,0.07,0.49
2022-04-25,0.72,0.62
...,...,...
2023-04-15,0.57,0.58
2023-04-16,0.58,0.58
2023-04-17,0.53,0.58
2023-04-18,0.57,0.58


In [9]:
final_sentiment_df = pd.DataFrame()

for col in ['Daily Sentiment Score','Average Sentiment Over Time']:
    
    subset = sentiment_df[[col]].copy()
    subset.columns = ['Sentiment Score']
    subset['Score Type'] = col
    final_sentiment_df = pd.concat([final_sentiment_df,subset])

final_sentiment_df.reset_index(inplace = True)
final_sentiment_df['Metric'] = final_sentiment_df.apply(lambda x: 'Avg. Daily Sentiment Score - Positive'
                                                       if x['Sentiment Score'] >= 0 and x['Score Type'] == 'Daily Sentiment Score'
                                                       else 'Avg. Daily Sentiment Score - Negative' if x['Sentiment Score'] <= 0 and 
                                                       x['Score Type'] == 'Daily Sentiment Score' else 'Average Daily Sentiment Over Time',
                                                       axis = 1)

final_sentiment_df

Unnamed: 0,Date,Sentiment Score,Score Type,Metric
0,2022-04-21,0.79,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive
1,2022-04-22,0.33,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive
2,2022-04-23,0.44,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive
3,2022-04-24,0.07,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive
4,2022-04-25,0.72,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive
...,...,...,...,...
677,2023-04-15,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time
678,2023-04-16,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time
679,2023-04-17,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time
680,2023-04-18,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time


In [10]:
final_sentiment_df['Tweet Volume'] = np.array(pd.concat([tweet_quantity,np.cumsum(tweet_quantity)]))
final_sentiment_df

Unnamed: 0,Date,Sentiment Score,Score Type,Metric,Tweet Volume
0,2022-04-21,0.79,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive,2
1,2022-04-22,0.33,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive,1
2,2022-04-23,0.44,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive,1
3,2022-04-24,0.07,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive,1
4,2022-04-25,0.72,Daily Sentiment Score,Avg. Daily Sentiment Score - Positive,7
...,...,...,...,...,...
677,2023-04-15,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time,53119
678,2023-04-16,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time,53650
679,2023-04-17,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time,54450
680,2023-04-18,0.58,Average Sentiment Over Time,Average Daily Sentiment Over Time,55335


### Chart Config

In [11]:
selection = alt.selection_multi(fields=['Metric'], bind='legend')

chart1 = alt.Chart(final_sentiment_df[final_sentiment_df['Score Type'] == 'Daily Sentiment Score']
                  ,title = '#GenerativeAI Tweet Sentiment Over Time').mark_bar().encode(
    x = alt.X('Date'),
    y = alt.Y('Sentiment Score',title = 'Sentiment Score'),
    color = alt.Color('Metric', scale = alt.Scale(
    domain = ['Avg. Daily Sentiment Score - Positive','Avg. Daily Sentiment Score - Negative','Average Daily Sentiment Over Time'],
    range = ['#1f77b4','#ff7f0e','black'])),
    tooltip = ['Date',alt.Tooltip('Sentiment Score',title = 'Daily Sentiment Score'), 'Tweet Volume'],
    opacity = alt.condition(selection, alt.value(opacity_value), alt.value(0.1))
).interactive()

chart1 = chart1.add_selection(selection)

chart2 = alt.Chart(final_sentiment_df[final_sentiment_df['Score Type'] == 'Average Sentiment Over Time'],
                   title = '#GenerativeAI Tweet Sentiment Over Time').mark_line(color = 'black').encode(
    x = alt.X('Date'),
    y = alt.Y('Sentiment Score',title = 'Sentiment Score'),
    tooltip = ['Date',alt.Tooltip('Sentiment Score',title = 'Average Sentiment Over Time'),'Tweet Volume'],
).interactive()

chart = chart1 + chart2

### Chart

In [12]:
chart

# --- Popular Hashtags Over Time ---

# 5. Popular Hashtags

### Data Prep

#### Create Additional Attributes to Augment Tweet Embeddings (Sentiment, Topics, ETC)

In [13]:
tweet_vector_df = pd.DataFrame(tweet_vectors).astype('float32')
tweet_vector_df.index = df['Username']
n,p = tweet_vector_df.shape

tweet_vector_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
resembleai,-0.083710,-0.033990,0.000476,-0.048581,0.031476,0.023073,0.015883,-0.065359,-0.028512,-0.041127,...,0.095273,-0.062914,-0.047609,0.068421,-0.014478,-0.032130,0.084182,-0.023094,0.143637,-0.020737
devaanparbhoo,-0.139654,-0.043269,-0.017897,-0.021135,-0.015833,-0.033197,-0.097767,-0.045126,-0.062690,-0.015131,...,0.103957,0.001434,-0.010707,-0.071154,0.024355,0.008581,0.074046,0.016017,0.029310,0.022079
timreha,-0.063041,0.040816,0.016180,-0.080494,0.076523,-0.044910,0.046102,0.002218,-0.019142,-0.038368,...,0.095244,0.025227,-0.003190,0.013860,0.024257,0.032297,0.049790,0.042318,-0.024372,0.011937
VirtReview,-0.033588,-0.082275,-0.004023,0.020607,0.061441,-0.030838,-0.049936,0.005418,-0.023266,-0.046210,...,0.072074,-0.045915,0.085395,-0.046842,0.068073,0.081211,0.014357,0.045136,0.036358,-0.063432
FrozeElle,-0.055022,-0.037315,-0.001191,-0.000298,0.018914,0.027899,-0.037156,-0.007534,-0.023039,0.017566,...,0.086528,0.070066,0.017756,-0.051270,-0.047398,0.015272,0.125480,-0.143145,-0.037588,0.059479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
analyticsinme,-0.088851,-0.053970,0.021771,0.045861,0.052610,-0.016610,-0.037461,-0.018470,-0.036792,-0.034326,...,0.070930,-0.029019,0.031190,-0.007116,0.040318,0.000708,0.054249,0.022755,0.009548,-0.024321
iia_es,-0.073429,0.017285,0.055224,-0.037934,0.020334,0.031867,-0.045801,0.057453,-0.032207,0.005512,...,0.003128,-0.033143,0.043856,-0.051575,-0.044059,-0.044657,0.080109,0.044912,0.051408,-0.007229
VideoGenAI,-0.054944,-0.073963,-0.027312,-0.027142,0.001293,-0.080407,-0.159834,0.010603,-0.035030,-0.009567,...,0.018130,0.025474,0.045066,-0.020768,0.034581,0.007664,0.012724,-0.001266,0.009509,0.030493
sitthinuntp,-0.108644,0.045288,0.000328,-0.049945,0.011732,-0.078336,-0.016034,0.003274,-0.029378,0.024841,...,0.037495,0.039859,-0.024015,-0.052795,-0.018854,0.001163,0.089746,-0.052977,-0.018076,-0.040114


In [14]:
tweet_vector_df['Sentiment Score'] = np.array(df['Sentiment Score']).astype('float32')

#### Identify Useful Keywords (Hashtags, Key Topics) 

In [15]:
# Hashtag identification

def get_hashtags(x):
    x = str(x).lower()
    
    #replace characters with spaces for text preprocessing
    replace_chars = list("!@$%^&*()-_=+[]\{}|;:',./<>?`~")
    replace_chars.append('\n')
    
    for char in replace_chars:
        x = x.replace(char,' ')
    
    #replace double sapces
    x = x.replace('  ',' ')
    
    #ID hashtags
    hashtags = [tag for tag in x.split() if tag[0] == '#']
    
    #ID hashtags within hashtags
    hashtags = list(itertools.chain.from_iterable([[y for y in x.split('#') if y != ''] for x in hashtags]))
    
    #Get single list of hashtags
    hashtags = [f'#{x}' for x in hashtags]
    
    return hashtags

df['Hashtags'] = df['Text'].apply(lambda x:get_hashtags(x))

#unique hashtags
unique_hashtags = list(set(itertools.chain.from_iterable(list(df['Hashtags']))))

#### Generate binary attributes for each tweet that assess whether or not a hashtag appeared

In [16]:
run = False

In [17]:
# Binary indicators for each tweet that show whether or not a specific hashtag appeared in it
if run:
    count = 0
    hashtag_indicators = pd.DataFrame()
    for col in unique_hashtags:
        hashtag_indicators[col] = df['Hashtags'].apply(lambda x: True if col in x else False)
        count = count + 1
        if count%500 == 0:
            print(f'{count}/{len(unique_hashtags)} Complete')

#### Observe Root Hashtags and see if any additional hashtags should be eliminated (hashtag that exists within another such as #stablediffusion --> #stablediffusion2), Roots are preserved if length of root > 7 characters (smaller hashtags such as #genai could result in overelimination of other hashtags that are still highly informative

In [18]:
roots = {}
all_stems = []
for col in unique_hashtags:
    if col not in all_stems and '#gen' not in col:
        stems = [x for x in unique_hashtags if x != col and col in x]
        if len(stems) != 0:
            roots[col] = stems
        all_stems.extend(stems)

In [19]:
# Prune Roots if len(key) <= 7
root_keys = list(roots.keys()).copy()
for key in root_keys:
    if len(key) <= 7:
        roots.pop(key)

#### Propogate hashtag occurrences from stem into roots and eliminate stem hashtags

In [20]:
# Combine hashtag results where roots absorb stem hashtags
if run:
    drop_list = []

    for key,value in roots.items():

        hashtag_indicators[key] = (hashtag_indicators[key] + (hashtag_indicators[value].sum(axis = 1))) >=1
        drop_list.extend(value)

    drop_list = list(set(drop_list))
    hashtag_indicators.drop(drop_list, axis = 1, inplace = True)
    hashtag_indicators.info()

#### Identify subset of hashtags that appear in a high volume of tweets -- approach: observe a "scree plot" of # of hashtags that appear in at least x amount of tweets across variable quantities of x, look for natural break points

In [21]:
if run:
    hashtag_counts = hashtag_indicators.sum(axis = 0)

    tweet_thresh = []
    n_hashtags = []

    for num in range(1,1001):
        tweet_thresh.append(num)
        n_hashtags.append((hashtag_counts >= num).sum())

    scree_df = pd.DataFrame()
    scree_df['Tweet Threshold'] = tweet_thresh
    scree_df['Number of Hashtags'] = n_hashtags

    alt.Chart(scree_df, title = 'Number of Hashtags Present in At Least X Tweets').mark_line().encode(
        alt.X('Tweet Threshold'),
        alt.Y('Number of Hashtags'),
        tooltip = ['Tweet Threshold','Number of Hashtags']).interactive()

#### Only preserve hashtags that appear in at least 100 tweets, and top 1000 hashtags, add hashtag attributes for those that appear in at least 100 tweets to tweet embeddings

In [22]:
if run:
    top1000 = hashtag_counts.sort_values(ascending = False).head(1000)
    over100 = top1000[top1000 >= 100]

    display('Top 1000 Hashtags',top1000)
    display('Hashtags that appear in at least 100 tweets',over100)

    top1000_hashtags = list(top1000.index)
    over100_hashtags = list(over100.index)

    hashtag_indicators_top1000 = hashtag_indicators[top1000_hashtags].astype('int8')
    hashtag_indicators_over100 = hashtag_indicators[over100_hashtags].astype('int8')
    pickle.dump(hashtag_indicators_top1000,open('top1000_hashtags.pkl','wb'))
    pickle.dump(hashtag_indicators_over100,open('hashtags_over100tweets.pkl','wb'))

In [23]:
hashtag_indicators_top1000 = pickle.load(open('top1000_hashtags.pkl','rb'))
hashtag_indicators_over100 = pickle.load(open('hashtags_over100tweets.pkl','rb'))

In [24]:
#https://pypi.org/project/altair-data-server/

In [25]:
preserved_hashtags = list(hashtag_indicators_over100.columns)
hashtag_indicators = hashtag_indicators_over100.copy()
hashtag_indicators = hashtag_indicators.astype('int8')
hashtag_indicators

Unnamed: 0,#generativeai,#ai,#chatgpt,#aiart,#generativeart,#stablediffusion,#artificialintelligence,#machinelearning,#midjourney,#openai,...,#drugdiscovery,#fineart,#gans,#ecommerce,#bardai,#bigtech,#news,#moe2023,#employee,#sales
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56216,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56217,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56218,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56219,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
tweet_vector_df[preserved_hashtags] = np.array(hashtag_indicators)
tweet_vector_df

  self[col] = igetitem(value, i)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,#drugdiscovery,#fineart,#gans,#ecommerce,#bardai,#bigtech,#news,#moe2023,#employee,#sales
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
resembleai,-0.083710,-0.033990,0.000476,-0.048581,0.031476,0.023073,0.015883,-0.065359,-0.028512,-0.041127,...,0,0,0,0,0,0,0,0,0,0
devaanparbhoo,-0.139654,-0.043269,-0.017897,-0.021135,-0.015833,-0.033197,-0.097767,-0.045126,-0.062690,-0.015131,...,0,0,0,0,0,0,0,0,0,0
timreha,-0.063041,0.040816,0.016180,-0.080494,0.076523,-0.044910,0.046102,0.002218,-0.019142,-0.038368,...,0,0,0,0,0,0,0,0,0,0
VirtReview,-0.033588,-0.082275,-0.004023,0.020607,0.061441,-0.030838,-0.049936,0.005418,-0.023266,-0.046210,...,0,0,0,0,0,0,0,0,0,0
FrozeElle,-0.055022,-0.037315,-0.001191,-0.000298,0.018914,0.027899,-0.037156,-0.007534,-0.023039,0.017566,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
analyticsinme,-0.088851,-0.053970,0.021771,0.045861,0.052610,-0.016610,-0.037461,-0.018470,-0.036792,-0.034326,...,0,0,0,0,0,0,0,0,0,0
iia_es,-0.073429,0.017285,0.055224,-0.037934,0.020334,0.031867,-0.045801,0.057453,-0.032207,0.005512,...,0,0,0,0,0,0,0,0,0,0
VideoGenAI,-0.054944,-0.073963,-0.027312,-0.027142,0.001293,-0.080407,-0.159834,0.010603,-0.035030,-0.009567,...,0,0,0,0,0,0,0,0,0,0
sitthinuntp,-0.108644,0.045288,0.000328,-0.049945,0.011732,-0.078336,-0.016034,0.003274,-0.029378,0.024841,...,0,0,0,0,0,0,0,0,0,0


#### Popular Hashtags Over Time

In [27]:
hashtags_over_time = hashtag_indicators_top1000.copy()
hashtags_over_time.index = df['Date'].apply(lambda x: f'{x.month}-{x.year}')
hashtags_over_time.index.name = 'MonthYear'
hashtags_over_time

Unnamed: 0_level_0,#generativeai,#ai,#chatgpt,#aiart,#generativeart,#stablediffusion,#artificialintelligence,#machinelearning,#midjourney,#openai,...,#女子高生の日常,#europe,#jkブランド,#datascientis…,#tts,#thecube,#envision,#twitch,#waxcommunity,#aiapplications
MonthYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4-2023,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2023,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2023,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2023,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2023,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4-2022,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2022,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2022,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4-2022,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
def top_tags(x):
    
    hashtag_sums = x.sum(axis = 0).sort_values(ascending = False)
    hashtag_sums = hashtag_sums.iloc[1:51]
    hashtag_rates = round(100*hashtag_sums / len(x),2)
    
    return hashtag_rates

In [29]:
hashtags_over_time = pd.DataFrame(hashtags_over_time.groupby('MonthYear').apply(lambda x: top_tags(x)).loc[[
    '4-2022',
    '5-2022',
    '6-2022',
    '7-2022',
    '8-2022',
    '9-2022',
    '10-2022',
    '11-2022',
    '12-2022',
    '1-2023',
    '2-2023',
    '3-2023',
    '4-2023'
]]).reset_index()

hashtags_over_time.columns = ['MonthYear','Hashtags','Hashtag Rate (%)']

hashtags_over_time

Unnamed: 0,MonthYear,Hashtags,Hashtag Rate (%)
0,4-2022,#ai,45.83
1,4-2022,#artificialintelligence,16.67
2,4-2022,#machinelearning,16.67
3,4-2022,#fintech,12.50
4,4-2022,#aiart,12.50
...,...,...,...
645,4-2023,#futureofwork,1.19
646,4-2023,#techtrends,1.18
647,4-2023,#cybersecurity,1.17
648,4-2023,#startups,1.13


In [30]:
np.random.seed(50)
unq_hashes = list(hashtags_over_time['Hashtags'].unique())
color_codes = list([f'RGB({np.random.randint(0,256)},{np.random.randint(0,256)},{np.random.randint(0,256)})'
                   for x in unq_hashes])

### Chart Config

In [31]:
charts = []

for month_year in hashtags_over_time['MonthYear'].unique():

    subset = hashtags_over_time[hashtags_over_time['MonthYear'] == month_year].copy()
    chart = alt.Chart(subset, title = f'Popular Hashtags Over Time ({subset["MonthYear"].iloc[0]})').mark_bar(color = 'RGB(200,0,50)').encode(
        x = alt.X('Hashtags',sort = '-y'),
        y = alt.Y('Hashtag Rate (%)'),
        color = alt.Color('Hashtags', scale = alt.Scale(
            domain = unq_hashes,
            range = color_codes
        )),
        tooltip = [alt.Tooltip('Hashtags', title = 'Hashtag'), alt.Tooltip('Hashtag Rate (%)')]).interactive()

    charts.append(chart)

### Charts 

#### I THINK THIS WILL BE BETTER AS A TABLEAU CHART USING TABLEAU PAGES TO BETTER SHOW ANIMATIONS)

In [44]:
for chart in charts:
    display(chart)
    time.sleep(2)
    clear_output(wait=True)

# --- Network Graphs, Identify Relationship Between Content ---

- What Hashtags have a tendency to be used together
- Are there natural clusters of similars tweets and users that share similar content (sentiment, hashtag usage, semantic meaning of tweets)

# 6. Hashtag Co-Occurence

### Data Prep

#### Identify Hashtags that co-occur --> Co-Occurence Rate Matrix Used as Adjacency Matrix

In [33]:
# Adjacency Matrix
adjacency = pd.DataFrame(np.zeros((1000,1000)), index = hashtag_indicators_top1000.columns, columns = 
                                 hashtag_indicators_top1000.columns)


for col in adjacency.columns:
    
    #Find tweets where that hashtag occurred
    subset = hashtag_indicators_top1000[hashtag_indicators_top1000[col] == 1].copy()
    
    #Find Occurences Rate of other hashtags, conditional on that hashtag occuring
    counts = subset.mean(axis = 0)
    counts.loc[col] = 0
    
    adjacency[col] = list(counts)

adjacency

Unnamed: 0,#generativeai,#ai,#chatgpt,#aiart,#generativeart,#stablediffusion,#artificialintelligence,#machinelearning,#midjourney,#openai,...,#女子高生の日常,#europe,#jkブランド,#datascientis…,#tts,#thecube,#envision,#twitch,#waxcommunity,#aiapplications
#generativeai,0.000000,0.993825,0.992983,0.999405,0.999876,0.999610,0.994902,0.995574,0.998411,0.997835,...,1.0,1.000000,1.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000
#ai,0.386017,0.000000,0.509250,0.420557,0.437917,0.418066,0.666909,0.779819,0.341807,0.518404,...,0.0,0.684211,0.0,0.263158,0.631579,0.368421,0.000000,0.789474,0.0,0.736842
#chatgpt,0.195031,0.257512,0.000000,0.058380,0.037101,0.028635,0.376056,0.336800,0.164321,0.674296,...,0.0,0.368421,0.0,0.263158,0.052632,0.210526,0.421053,0.000000,0.0,0.473684
#aiart,0.180479,0.195530,0.053677,0.000000,0.832179,0.760901,0.116953,0.163753,0.517930,0.111970,...,0.0,0.000000,0.0,1.000000,0.000000,0.000000,0.000000,0.157895,0.0,0.000000
#generativeart,0.144716,0.163180,0.027340,0.666964,0.000000,0.686320,0.092630,0.111308,0.255334,0.063409,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
#thecube,0.000340,0.000323,0.000365,0.000000,0.000000,0.000000,0.000146,0.000000,0.000000,0.000619,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
#envision,0.000340,0.000000,0.000729,0.000000,0.000000,0.000260,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
#twitch,0.000340,0.000691,0.000000,0.000297,0.000000,0.001822,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.052632,0.000000,0.000000,0.000000,0.0,0.000000
#waxcommunity,0.000340,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.004312,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


#### Find Communities (clusters of hashtags that co-occur) Using a DBSCAN algorithm, Preserve NonOutlier Hashtags

In [34]:
def dbscan(D, min_pts, threshold, random_state, max_iter):
    
    #Capture shape of incoming adjacency / distance matrix
    n,p = D.shape
    
    #Results DataFrame storing cluster assignments
    results = pd.DataFrame(np.zeros((n,5)), columns = 
                           ['Visited','Point Type','Assignment','Co-Occurence to Nearest','Numerical Index'])
    
    #Instantiate results values and select starting point
    results['Visited'] = False
    results['Point Type'] = 'Outlier'
    results['Assignment'] = 'Outlier'
    results['Co-Occurence to Nearest'] = np.inf
    results['Numerical Index'] = range(n)
    results.index = D.index
    
    np.random.seed(random_state)
    visiting_point = np.random.randint(1,n)
    
    #prevent visitation to generative ai hashtag
    results.iloc[0,0] = True
    results.iloc[0,1] = '#generativeai'
    results.iloc[0,2] = '#generativeai'
    
    # Visit point, see if there are at least min_pts within the threshold, if so, label as core point and propogate
    # cluster label, if not, visit nearest unvisited point. Continue until all points have been visited
    stop = False
    stop_count = 0
    cluster_count = 0
    global_indexer = np.array(D.index != '#generativeai')
    
    while stop == False:
        
        #Go to visited point
        subset = D.iloc[:,visiting_point].copy()
        
        #Get hashtags that co-occur at least threshold % of the time
        within_thresh = subset[subset >= threshold].copy()
        
        try:
            within_thresh.drop('#generativeai',inplace = True)
        except:
            pass
        
        if len(within_thresh) < min_pts: # if point is not a core point
                        
            if results.iloc[visiting_point, 1] == 'Outlier':# point has not already been assigned to a cluster
                results.iloc[visiting_point, -2] = subset[global_indexer].max()
        
        else: #point is a core point
            
            # Identify label to propogate to other points
            if results.iloc[visiting_point, 2] == 'Outlier':
                cluster_count = cluster_count + 1
                label = f'Cluster {cluster_count}'
            else:
                label = results.iloc[visiting_point, 2]
            
            results.iloc[visiting_point, -2] = 0
            results.iloc[visiting_point, 1] = 'Core'
            results.iloc[visiting_point, 2] = label
                        
            #Update assignments to points within threshold as necessary (label propogation step)
            for index, occurence in zip(within_thresh.index, list(within_thresh)):
                if (results.loc[index, 'Point Type'] == 'Outlier') or (
                    results.loc[index, 'Point Type'] == 'Border' and 
                    results.loc[index, 'Co-Occurence to Nearest'] < occurence):
                    
                    results.loc[index, 'Point Type'] = 'Border'
                    results.loc[index, 'Assignment'] = label
                    results.loc[index, 'Co-Occurence to Nearest'] = occurence
        
        results.iloc[visiting_point, 0] = True
        
        #Identify new point to visit
        indexer = np.array(results['Visited'] == False)
        
        if indexer.sum() == 0:
            stop_count = stop_count + 1
            #Reset Small Clusters
            current_clust = results[results['Assignment'].apply(lambda x: True if 'Cluster' in x else True if 'Outlier' in x else False)]['Assignment']
            current_clust = current_clust.value_counts()
            reset_clusts = list(current_clust[current_clust < min_pts].index)
            
            if stop_count <= max_iter:
                for num in range(n):
                    if results.iloc[num,2] in reset_clusts:
                        results.iloc[num,0] = False
                        results.iloc[num,1] = 'Outlier'
                        results.iloc[num,2] = 'Outlier'
                        results.iloc[num,3] = np.inf
        
        
        indexer = np.array(results['Visited'] == False)
        if stop_count >= max_iter:
            stop = True
        else: #visit unvisited point with highest co-occurence
            un_visited = subset[indexer]
            visiting_point = results.loc[un_visited[un_visited == un_visited.max()].index[0],'Numerical Index']
    
    #Rename Clusters
    results['Assignment'].unique()
    
    renames = list(results['Assignment'].unique()).copy()
    c_names = [int(x.split(' ')[1]) for x in renames if ' ' in x]
    no_c_names = [x for x in renames if ' ' not in x]
    c_names.sort()
    
    new_names = {}
    cluster_count = 1
    for name in c_names:
        new_names[f'Cluster {name}'] = f'Cluster {cluster_count}'
        cluster_count = cluster_count + 1
    
    for name in no_c_names:
        new_names[name] = name
    
    results['Assignment'] = results['Assignment'].map(new_names)
    
    return results

In [35]:
results = dbscan(adjacency,5, 0.2, 50,5)
clustered_results = results[results['Assignment'] != 'Outlier'].copy()
adjacency_copy = adjacency.loc[clustered_results.index, clustered_results.index]

#### Find additional Hashtags to Prune

In [36]:
thresh = 0.5

connections = pd.DataFrame(
    np.array([0 if x <= thresh else 1 for x in np.array(adjacency_copy.copy()).flatten()]).reshape(adjacency_copy.shape),
    index = adjacency_copy.index,
    columns = adjacency_copy.columns)

prune_network = list(connections[connections.mean(axis = 1) != 0].index)
weights = adjacency_copy.loc[prune_network,prune_network].copy()

#### Baseline Network

In [37]:
#Create Network Graph
G = nx.from_pandas_adjacency(weights)
# Compute positions for viz.
pos = nx.spring_layout(G)
#centrality helper
helper = weights.mean(axis = 1)
helper_array = np.array(helper)

for n in G.nodes():
    G.nodes[n]['Hashtag'] = n
    G.nodes[n]['Community'] = clustered_results.loc[n,'Assignment']
    G.nodes[n]['Relative Centrality'] = round(100*(helper_array <= helper[n]).mean(),2)

### Chart Config

In [38]:
alt.data_transformers.disable_max_rows()

viz = nxa.draw_networkx(G,
                        pos=pos,
                        node_tooltip=['Hashtag', 'Community','Relative Centrality'],
                        node_color = 'Community:N',
                        node_label = 'Hashtag',
                        width = 'weight',
                        cmap = 'accent',
                        #node_size = 'Relative Centrality',
                        font_size = 8,
                        edge_color = 'lightgrey'
                       )

# Show it as an interactive plot!
viz = viz.interactive().properties(height = 800, width = 800,title = {'text':['Interaction Between Popular Hashtags',
                                                                              'Color-Coded by Hashtag Community']})

viz = viz.configure_title(
    fontSize=15,
    font='Courier',
    anchor='start',
    color='gray'
)

selection = alt.selection_single(on='mouseover', fields=['Hashtag'], empty='none')

viz = viz.encode(
    opacity = alt.condition(selection, alt.value(0.5), alt.value(1)),
).add_selection(selection)

viz = viz.configure_legend(disable = True)

In [40]:
chart = viz

### Chart

In [41]:
chart