<a href="https://colab.research.google.com/github/caiocmello/netzdg/blob/main/NetzDG_colours.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook has been built to analyse orange, red, green and grey clusters

Dataset: df_colours.csv



### Install and import: Run this cell to load the data

In [None]:
# @title Load Libraries and Data
import pandas as pd
import nltk
import spacy
import plotly.express as px
import plotly.graph_objects as go
import re
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from spacy.lang.en.stop_words import STOP_WORDS as en_stopwords
from spacy.lang.de.stop_words import STOP_WORDS as de_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Read the tweet export, keep only the columns used by this notebook and index the
# frame by tweet id.
TWEET_COLUMNS = ['tweet_id', 'user_username', 'text', 'retweet_count', 'like_count', 'created_at', 'cluster']
df = pd.read_csv('df_colours.csv')[TWEET_COLUMNS].set_index('tweet_id')

In [None]:
# @title Run this cell to see the list of users in dataframe and the number of tweets they posted
# Number of rows per distinct 'user_username' value.
tweets_per_user = df['user_username'].value_counts()
tweets_per_user

In [None]:
# @title Choose a user to see tweets written by them
# `user` is compared for exact equality with the 'user_username' column further down,
# so it has to be spelled exactly as it appears in the data.
user = "fdp" # @param {type:"string"} # Define word here

# Show only the tweets written by @user

def show_rows_with_user(df, user):
    """Return the rows of ``df`` whose 'user_username' value equals ``user``.

    The match is an exact equality test; rows keep their original index and order.
    """
    written_by_user = df['user_username'] == user
    return df.loc[written_by_user]

# Tweets of the chosen user, ordered from most to least retweeted.
result = show_rows_with_user(df, user).sort_values(by='retweet_count', ascending=False)

result

### List of n-grams

* **user:** *user name*
* **cluster:** orange, red, green or gray
* **content:** all, per_user or per_cluster
* **items_in_list:** *number*
* **see_top_words:** unigrams or bigrams

In [None]:
# @title Set parameters to see list of n-grams


# Work on an independent copy so the cleaning steps below leave `df` untouched.
df_NetzClean = df.copy(deep=True)

# Remove underscores from tweets (to avoid errors when deleting usernames like @balzer_sascha)

def cleaner(text):
    """Return ``text`` with every underscore character deleted."""
    return re.sub(r"_", "", text)
# Store the underscore-free version of every tweet in a new 'text_clean' column.
df_NetzClean['text_clean'] = df_NetzClean['text'].map(cleaner)

# Remove users, remove URLs, remove hashtag sign

def cleaner(text):
    """Strip user mentions, links and hashtag signs from ``text``.

    Steps, in order:
      1. delete every "@" followed by ASCII letters/digits (the mention itself,
         not just the "@" sign);
      2. delete any run of non-space characters that starts with "@", "http://",
         "https://" or "www";
      3. collapse all whitespace runs to single spaces and trim both ends;
      4. delete "#" characters while keeping the hashtag text.
    """
    without_mentions = re.sub("@[A-Za-z0-9]+", "", text)
    without_links = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", without_mentions)
    single_spaced = " ".join(without_links.split())
    return single_spaced.replace("#", "")
# Run the mention/URL/hashtag cleaner on the underscore-free 'text_clean' column.
# Reading from 'text' here would throw away the underscore removal done earlier, so a
# handle such as @balzer_sascha would leave "_sascha" behind after the mention regex.
df_NetzClean['text_clean'] = df_NetzClean['text_clean'].map(cleaner)

df_NetzClean = df_NetzClean.drop_duplicates(subset=['text']) #remove duplicated tweets in column 'text'

def _rank_terms(vectorizer, corpus, n):
    """Fit ``vectorizer`` on ``corpus`` and return its ``n`` highest-scoring terms.

    The score of a term is the sum of its column in the document-term matrix, i.e.
    its total count (count vectorizer) or total tf-idf weight (tf-idf vectorizer)
    over all documents. Returns a list of ``(term, score)`` tuples, highest first.
    """
    # fit_transform learns the vocabulary and builds the matrix in a single pass.
    term_matrix = vectorizer.fit_transform(corpus)
    column_totals = term_matrix.sum(axis=0)
    scored = [(term, column_totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    # list.sort is stable, so terms with equal scores keep their vocabulary order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:n]


def get_top_n_words(corpus, stopwords, n=20):
    """Return the ``n`` most frequent single words in ``corpus`` as (word, count) tuples.

    ``corpus`` is an iterable of documents (strings); ``stopwords`` is passed to
    ``CountVectorizer(stop_words=...)``.
    """
    return _rank_terms(CountVectorizer(stop_words = stopwords), corpus, n)


def get_top_n_bigram(corpus, stopwords, n=20):
    """Return the ``n`` most frequent two-word sequences in ``corpus`` as (bigram, count) tuples.

    ``corpus`` is an iterable of documents (strings); ``stopwords`` is passed to
    ``CountVectorizer(stop_words=...)``.
    """
    return _rank_terms(CountVectorizer(ngram_range=(2, 2), stop_words = stopwords), corpus, n)


def get_top_n_words_tfidf(corpus, stopwords, n=20):
    """Return the ``n`` single words with the highest summed tf-idf weight in ``corpus``.

    ``corpus`` is an iterable of documents (strings); ``stopwords`` is passed to
    ``TfidfVectorizer(stop_words=...)``. Result items are (word, weight) tuples.
    """
    return _rank_terms(TfidfVectorizer(stop_words = stopwords), corpus, n)


def get_top_n_bigram_tfidf(corpus, stopwords, n=20):
    """Return the ``n`` two-word sequences with the highest summed tf-idf weight in ``corpus``.

    ``corpus`` is an iterable of documents (strings); ``stopwords`` is passed to
    ``TfidfVectorizer(stop_words=...)``. Result items are (bigram, weight) tuples.
    """
    return _rank_terms(TfidfVectorizer(ngram_range=(2, 2), stop_words = stopwords), corpus, n)

# Turn the spaCy stop-word collections into lists

stopwords_de = [*de_stopwords]
stopwords_en = [*en_stopwords]

# Edit your list of stopwords manually

add_to_stopwords = ['rt']

# Final mixed list: German entries, then English entries, then the manual additions
stopwords = [*stopwords_de, *stopwords_en, *add_to_stopwords]

# Define content
user = 'fdp' # @param {type:"string"}
cluster = 'orange' # @param {type:"string"}
# NOTE: `all` shadows the Python builtin of the same name for the rest of the session.
# The name is kept because it is one of the values offered for the `content` field.
all = df_NetzClean[['text_clean']]
df_user = df_NetzClean[df_NetzClean['user_username'] == user]
per_user = df_user[['text_clean']]
df_cluster = df_NetzClean[df_NetzClean['cluster'] == cluster]
per_cluster = df_cluster[['text_clean']]

content = per_cluster # @param {type:"raw"}
# Keep one cleaned tweet per document (a flat list of strings). Joining everything into
# a single string makes the vectorizer count bigrams that start at the end of one tweet
# and finish at the beginning of the next.
content = content['text_clean'].tolist()
if not content:
    raise ValueError("No tweets found for the selected content; check `user`, `cluster` and `content`.")

# All selected tweets as one newline-separated text (not used for the n-gram counts).
string = '\n'.join(content)

items_in_list = 10 # @param {type:"number"}

unigrams = get_top_n_words(content, stopwords=stopwords, n=items_in_list)
bigrams = get_top_n_bigram(content, stopwords=stopwords, n=items_in_list)
see_top_words = bigrams # @param {type:"raw"}

see_top_words

In [None]:
# @title Chart 1: Temporal distribution of tweets
# Keep the first ten characters of each timestamp and parse them as dates. This updates
# `df` itself, so later cells see the parsed 'created_at' column.
df['created_at'] = pd.to_datetime(df['created_at'].astype(str).str[:10])

# Number of tweets (non-null 'text' values) per date.
df_group = (
    df.groupby('created_at')['text']
    .count()
    .reset_index(name='count')
)

daily_bars = go.Bar(
    x=df_group['created_at'],
    y=df_group['count'],
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure(data=[daily_bars])
fig.update_layout(title_text='Temporal distribution of tweets')

In [None]:
# @title Chart 2: Temporal distribution of tweets by month
# Build the month key on a new frame. A plain `df_month = df_group` only creates a second
# name for the same object, so overwriting 'created_at' would also truncate the dates
# stored in df_group.
df_month = df_group.assign(created_at=df_group['created_at'].astype(str).str[:7])
# Add up the per-date counts that share the same seven-character key.
df_month = df_month.groupby('created_at')['count'].sum().to_frame(name='count_month')
df_month = df_month.reset_index()
fig = go.Figure()
fig.add_trace(go.Bar(x=df_month['created_at'],
                y=df_month['count_month'],
                marker_color='rgb(55, 83, 109)'
                ))
fig.update_layout(title_text='Temporal distribution of tweets per month')

In [None]:
# @title Chart 3: Temporal distribution of tweets by user (per month)
user = 'fdp' # @param {type:"string"}
pd.set_option('mode.chained_assignment', None)

# Rows of the selected account, with 'created_at' cut down to its first seven characters
# so that grouping on it yields the per-month buckets named in the chart title.
df_user = df.loc[df['user_username'] == user]
df_user = df_user.assign(created_at=df_user['created_at'].astype(str).str[:7])
df_user = (
    df_user.groupby('created_at')['text']
    .count()
    .reset_index(name='count')
)

user_bars = go.Bar(
    x=df_user['created_at'],
    y=df_user['count'],
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure(data=[user_bars])
fig.update_layout(title_text='Temporal distribution of tweets for user: ' + user)

In [None]:
# @title Chart 4: Temporal distribution of tweets by cluster (per month)

def _monthly_cluster_counts(tweets, cluster):
    """Count the tweets of one cluster per seven-character 'created_at' prefix.

    Selects the rows of ``tweets`` whose 'cluster' value equals ``cluster``, shortens
    'created_at' to its first seven characters (presumably 'YYYY-MM' -- assumes
    ISO-style dates) and returns a frame with the columns 'created_at' and 'count'.
    """
    in_cluster = tweets[tweets['cluster'] == cluster]
    # assign() builds a new frame, so nothing is written into a filtered slice of `tweets`.
    by_month = in_cluster.assign(created_at=in_cluster['created_at'].astype(str).str[:7])
    return by_month.groupby('created_at')['text'].count().to_frame(name='count').reset_index()

df_orange = _monthly_cluster_counts(df, 'orange')
df_red = _monthly_cluster_counts(df, 'red')
df_green = _monthly_cluster_counts(df, 'green')
# The grey cluster is selected with the value 'gray'.
df_grey = _monthly_cluster_counts(df, 'gray')

fig = go.Figure()
# One bar series per cluster; `name` makes the legend show the cluster instead of "trace N".
for trace_name, cluster_counts, bar_colour in [
    ('orange', df_orange, 'orange'),
    ('red', df_red, 'red'),
    ('green', df_green, 'green'),
    ('grey', df_grey, 'grey'),
]:
    fig.add_trace(go.Bar(x=cluster_counts['created_at'],
                    y=cluster_counts['count'],
                    marker_color=bar_colour,
                    name=trace_name
                    ))
fig.update_layout(title_text='Temporal distribution of tweets by cluster (per month)')

In [None]:
# @title Chart 5: Likes and Retweets of Tweets Over Time

# Bubble chart: one point per tweet, placed by 'created_at' and 'like_count', sized by
# 'retweet_count' and coloured by 'cluster'; hovering shows the author's username.
scatter_options = dict(
    x="created_at",
    y="like_count",
    size="retweet_count",
    color="cluster",
    hover_name="user_username",
    title="Likes and Retweets of Tweets Over Time",
)
layout_labels = dict(
    xaxis_title="Date",
    yaxis_title="Number of Likes",
    legend_title="Cluster",
)

fig = px.scatter(df, **scatter_options)
fig.update_layout(**layout_labels)
fig.show()