<a href="https://colab.research.google.com/github/caiocmello/netzdg/blob/main/NetzDG_Analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NetzDG Analyser**
This notebook helps you explore the NetzDG tweet dataset. The dataset to be used is 'df_netzdg_blue&purple.csv'.


### Install and import: Run this cell to load the data

In [None]:
# @title Load Libraries and Data
import pandas as pd
import nltk
import spacy
import plotly.express as px
import plotly.graph_objects as go
import re
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from spacy.lang.en.stop_words import STOP_WORDS as en_stopwords
from spacy.lang.de.stop_words import STOP_WORDS as de_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the data frame into colab.
# `url` is a relative file name, so the CSV has to be reachable from the
# current working directory.
url = 'df_netzdg_blue&purple.csv'
# index_col=0: the first CSV column becomes the DataFrame index.
df = pd.read_csv(url, index_col=0)

In [None]:
# Read the CSV at `url` again; this replaces `df` with a freshly loaded frame
# (first CSV column used as the index).
df = pd.read_csv(url, index_col=0)

### Data filtering

In [None]:
# @title Run this cell to see the list of users in dataframe and the number of tweets they posted
# Number of rows (tweets) per username; value_counts() lists the largest first.
tweets_per_user = df['user_username'].value_counts()
tweets_per_user

In [None]:
# @title Choose a user to see tweets written by them
user = "golem" # @param {type:"string"} # Username whose tweets are shown below

# See only tweets written by @user

def show_rows_with_user(df, user):
    """Return the rows of `df` whose 'user_username' column equals `user`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'user_username' column.
    user : str
        Username to keep (exact match).

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels; empty when no
        row matches.
    """
    written_by_user = df['user_username'].eq(user)
    return df.loc[written_by_user]

# Tweets of the selected account, ordered from most to least retweeted.
result = (
    show_rows_with_user(df, user)
    .sort_values(by='retweet_count', ascending=False)
)

result

### List of n-grams

* **user:** *user name*
* **cluster:** blue or purple
* **content:** all, per_user or per_cluster
* **items_in_list:** *number*
* **see_top_words:** unigrams or bigrams

In [None]:
# @title Set parameters to see list of n-grams


# Work on a copy so the loaded frame `df` keeps its original columns.
df_NetzClean = df.copy()

# First cleaning pass: delete underscores from the tweets (meant to avoid
# errors when deleting users such as @balzer_sascha).
# NOTE(review): 'text_clean' is assigned a second time further down in this
# cell, computed from the raw 'text' column, so the column written here is
# replaced there; `cleaner` is also redefined below.

def cleaner(text):
    """Return `text` with every underscore character removed."""
    return re.sub(r"_", "", text)

df_NetzClean['text_clean'] = df_NetzClean['text'].map(cleaner)

# Remove users, remove URLs, remove hashtag sign

def cleaner(text):
    """Strip Twitter-specific noise from one tweet.

    Steps, in this order:
      1. delete @mentions; a handle may contain letters, digits and
         underscores, so '@balzer_sascha' is removed as a whole instead of
         leaving '_sascha' behind;
      2. delete every remaining run of non-space characters that starts with
         '@', 'http://', 'https://' or 'www' (links);
      3. collapse all whitespace (including line breaks) to single spaces and
         trim both ends;
      4. drop the '#' character but keep the hashtag word.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = re.sub("@[A-Za-z0-9_]+", "", text)  # remove mentions, underscores included
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)  # remove links
    text = " ".join(text.split())  # normalise whitespace
    text = text.replace("#", "")  # remove hashtag sign but keep the text
    return text
# (Re)build 'text_clean' from the raw tweet text with the cleaner defined just above.
df_NetzClean['text_clean'] = df_NetzClean['text'].map(cleaner)

# Remove duplicated tweets: keep the first row for each distinct raw 'text'.
df_NetzClean = df_NetzClean.drop_duplicates(subset='text')

def get_top_n_words(corpus, stopwords, n=20):
    """Return the `n` single words with the highest total count in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to CountVectorizer.
    stopwords : list of str
        Passed to CountVectorizer as `stop_words`.
    n : int, default 20
        Maximum number of entries returned.

    Returns
    -------
    list of (str, number)
        (word, count summed over all documents), largest count first.
    """
    vectorizer = CountVectorizer(stop_words=stopwords)
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis=0)  # one row holding the per-term column sums
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def get_top_n_bigram(corpus, stopwords, n=20):
    """Return the `n` two-word sequences with the highest total count in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to CountVectorizer.
    stopwords : list of str
        Passed to CountVectorizer as `stop_words`.
    n : int, default 20
        Maximum number of entries returned.

    Returns
    -------
    list of (str, number)
        (bigram, count summed over all documents), largest count first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words=stopwords)
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis=0)  # one row holding the per-bigram column sums
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def get_top_n_words_tfidf(corpus, stopwords, n=20):
    """Return the `n` single words with the highest summed TF-IDF weight.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to TfidfVectorizer.
    stopwords : list of str
        Passed to TfidfVectorizer as `stop_words`.
    n : int, default 20
        Maximum number of entries returned.

    Returns
    -------
    list of (str, number)
        (word, TF-IDF weight summed over all documents), largest first.
    """
    vectorizer = TfidfVectorizer(stop_words=stopwords)
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis=0)  # one row holding the per-term column sums
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def get_top_n_bigram_tfidf(corpus, stopwords, n=20):
    """Return the `n` two-word sequences with the highest summed TF-IDF weight.

    Parameters
    ----------
    corpus : iterable of str
        Documents handed to TfidfVectorizer.
    stopwords : list of str
        Passed to TfidfVectorizer as `stop_words`.
    n : int, default 20
        Maximum number of entries returned.

    Returns
    -------
    list of (str, number)
        (bigram, TF-IDF weight summed over all documents), largest first.
    """
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words=stopwords)
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    totals = doc_term.sum(axis=0)  # one row holding the per-bigram column sums
    ranked = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

# Turn the imported stop-word collections into lists.
stopwords_de = list(de_stopwords)
stopwords_en = list(en_stopwords)

# Mixed list: German entries first, English entries after them.
stopwords = [*stopwords_de, *stopwords_en]

# Edit your list of stopwords manually: extra terms are appended at the end.
add_to_stopwords = ['rt']
stopwords.extend(add_to_stopwords)

# Define content
user = 'netzpolitik' # @param {type:"string"}
cluster = 'blue' # @param {type:"string"}
# Three selectable corpora, each a one-column frame holding 'text_clean'.
# NOTE: the name `all` shadows Python's built-in all() from here on; it is
# kept because it is one of the values the `content` form field accepts.
all = df_NetzClean[['text_clean']]
df_user = df_NetzClean[df_NetzClean['user_username'] == user]
per_user = df_user[['text_clean']]
df_cluster = df_NetzClean[df_NetzClean['cluster'] == cluster]
per_cluster = df_cluster[['text_clean']]

content = all # @param {type:"raw"}
# Renumber the rows on a new frame (no inplace=True), so the frame selected
# above (all / per_user / per_cluster) is left unmodified.
content = content.reset_index(drop=True)
content = content.values.tolist()  # list of rows, each row a one-element list

# Join the cell values themselves, one tweet per line. Calling str() on each
# row list instead would embed the list repr (brackets, quotes and escape
# sequences such as \u200d for non-printable characters) into the corpus,
# and those escape sequences can end up as spurious tokens.
string = '\n'.join(' '.join(str(cell) for cell in row) for row in content)

items_in_list = 10 # @param {type:"number"}

# The whole selection is passed as a single document.
unigrams = get_top_n_words([string], stopwords=stopwords, n=items_in_list)
bigrams = get_top_n_bigram([string], stopwords=stopwords, n=items_in_list)
see_top_words = bigrams # @param {type:"raw"}

see_top_words

## **Visualisation**

In [None]:
# @title Chart 1: Temporal distribution of tweets
# Keep only the first 10 characters of each timestamp and parse them as dates.
# Assumes the values start with 'YYYY-MM-DD' - TODO confirm against the CSV.
df['created_at'] = pd.to_datetime(df['created_at'].astype(str).str[:10])

# One row per day with the number of non-null 'text' entries.
df_group = (
    df.groupby('created_at')['text']
    .count()
    .to_frame(name='count')
    .reset_index()
)

daily_bars = go.Bar(
    x=df_group['created_at'],
    y=df_group['count'],
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure()
fig.add_trace(daily_bars)
fig.update_layout(title_text='Temporal distribution of tweets')

In [None]:
# @title Chart 2: Temporal distribution of tweets by month
# Build the monthly frame from a modified copy of the daily counts.
# `df_month = df_group` would only create a second name for the same frame,
# so truncating 'created_at' afterwards would also overwrite the daily dates
# held in df_group; assign() returns a new frame and leaves df_group intact.
df_month = df_group.assign(created_at=df_group['created_at'].astype(str).str[:7])
# Sum the daily counts within each 'YYYY-MM' key.
df_month = df_month.groupby('created_at')['count'].sum().to_frame(name='count_month')
df_month = df_month.reset_index()
fig = go.Figure()
fig.add_trace(go.Bar(x=df_month['created_at'],
                y=df_month['count_month'],
                marker_color='rgb(55, 83, 109)'
                ))
fig.update_layout(title_text='Temporal distribution of tweets per month')

In [None]:
# @title Chart 3: Temporal distribution of tweets by user (per month)
user = 'netzpolitik' # @param {type:"string"}
# Session-wide setting: silences pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment', None)

# Tweets of the chosen account, counted per month (first 7 characters of the date).
df_user = (
    df[df['user_username'] == user]
    .assign(created_at=lambda rows: rows['created_at'].astype(str).str[:7])
    .groupby('created_at')['text']
    .count()
    .to_frame(name='count')
    .reset_index()
)

monthly_bars = go.Bar(
    x=df_user['created_at'],
    y=df_user['count'],
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure()
fig.add_trace(monthly_bars)
fig.update_layout(title_text='Temporal distribution of tweets for user: ' + user)

In [None]:
# @title Chart 4: Temporal distribution of tweets by cluster (per month)

def _monthly_tweet_counts(frame):
    """Count the non-null 'text' entries of `frame` per month.

    The month key is the first 7 characters of 'created_at' rendered as a
    string. Returns a new frame with the columns 'created_at' and 'count';
    `frame` itself is not modified.
    """
    month_key = frame['created_at'].astype(str).str[:7]
    return (
        frame.assign(created_at=month_key)
        .groupby('created_at')['text']
        .count()
        .to_frame(name='count')
        .reset_index()
    )

df_blue = _monthly_tweet_counts(df[df['cluster'] == 'blue'])
df_purple = _monthly_tweet_counts(df[df['cluster'] == 'purple'])

fig = go.Figure()
# Each trace gets a name so the legend shows the cluster instead of the
# default "trace 0" / "trace 1" labels.
fig.add_trace(go.Bar(x=df_blue['created_at'],
                y=df_blue['count'],
                name='blue',
                marker_color='#0073e6'
                ))
fig.add_trace(go.Bar(x=df_purple['created_at'],
                y=df_purple['count'],
                name='purple',
                marker_color='#9B8BF4'
                ))
fig.update_layout(title_text='Temporal distribution of tweets by cluster (per month)',
                  legend_title="Cluster")

In [None]:
# @title Chart 5: Likes and Retweets of Tweets Over Time

# One marker per tweet: date on x, likes on y, marker size from the retweet
# count, colour from the cluster, username shown on hover.
fig = px.scatter(
    df,
    x="created_at",
    y="like_count",
    size="retweet_count",
    color="cluster",
    hover_name="user_username",
    title="Likes and Retweets of Tweets Over Time",
)
axis_and_legend_labels = dict(
    xaxis_title="Date",
    yaxis_title="Number of Likes",
    legend_title="Cluster",
)
fig.update_layout(**axis_and_legend_labels)
fig.show()


## **What happened in January 2018?**



**Statistics:**

*   **560** tweets were posted in January 2018 by **27** users
*   **13** users belong to cluster blue and **14** to cluster purple
*   **234** tweets were posted by cluster: **purple**
*   **326** tweets were posted by cluster: **blue**






In [None]:
# @title Run this cell to generate data about January 2018

# Rows whose date, rendered as a string, starts with '2018-01'.
# .copy() makes df_2018 an independent frame, so columns added to it in later
# cells are not written to a slice of `df` (which triggers pandas'
# SettingWithCopyWarning unless that warning has been switched off).
df_2018 = df[df['created_at'].astype(str).str.startswith('2018-01')].copy()
print('Complete!')

In [None]:
# @title Number of tweets per user in January 2018

# Number of January 2018 rows (tweets) per username, largest first.
tweets_per_user_2018 = df_2018['user_username'].value_counts()
tweets_per_user_2018


In [None]:
# @title Set parameters to see list of n-grams for Jan 2018

# Clean the raw tweet text with `cleaner` (defined in the n-gram cell above:
# it strips @mentions, links and '#' signs) and remove duplicated tweets,
# keeping the first row for each distinct raw 'text'.
# assign() builds a new frame instead of writing a column into df_2018.
df_2018 = (
    df_2018
    .assign(text_clean=df_2018['text'].map(cleaner))
    .drop_duplicates(subset=['text'])
)

# Edit your list of stopwords manually

add_to_stopwords = ['rt','twitter']
# Append only the terms that are not in the list yet, so running this cell
# repeatedly does not keep growing `stopwords` with duplicates.
stopwords = stopwords + [word for word in add_to_stopwords if word not in stopwords]

# Define content
user_2018 = 'netzpolitik' # @param {type:"string"}
cluster_2018 = 'blue' # @param {type:"string"}
all_2018 = df_2018[['text_clean']]
df_user_2018 = df_2018[df_2018['user_username'] == user_2018]
per_user_2018 = df_user_2018[['text_clean']]
df_cluster_2018 = df_2018[df_2018['cluster'] == cluster_2018]
per_cluster_2018 = df_cluster_2018[['text_clean']]

content_2018 = all_2018 # @param {type:"raw"}
# Renumber the rows on a new frame so the selected frame is left unmodified.
content_2018 = content_2018.reset_index(drop=True)
content_2018 = content_2018.values.tolist()  # list of one-element rows

# Join the cell values themselves, one tweet per line; str() on each row list
# would embed the list repr (brackets, quotes, escape sequences) in the corpus.
string_2018 = '\n'.join(' '.join(str(cell) for cell in row) for row in content_2018)

items_in_list = 20 # @param {type:"number"}

# The whole selection is passed as a single document.
unigrams_2018 = get_top_n_words([string_2018], stopwords=stopwords, n=items_in_list)
bigrams_2018 = get_top_n_bigram([string_2018], stopwords=stopwords, n=items_in_list)
see_top_words = bigrams_2018 # @param {type:"raw"}

see_top_words

In [None]:
# @title See dataset sorted by like or retweet count
sort_by = "retweet_count" # @param ["like_count","retweet_count"]
# January 2018 tweets, highest value of the chosen column first.
df_2018.sort_values(by=sort_by, ascending=False)

In [None]:
# @title See dataset sorted by date
sort_by = "created_at"
# January 2018 tweets in chronological order (ascending is the default).
df_2018.sort_values(by=sort_by)

In [None]:
# @title Chart 6: Temporal distribution of tweets per day
# Number of non-null 'text' entries for each distinct 'created_at' value.
df_2018viz = (
    df_2018.groupby('created_at')['text']
    .count()
    .to_frame(name='count')
    .reset_index()
)
january_bars = go.Bar(
    x=df_2018viz['created_at'],
    y=df_2018viz['count'],
    marker_color='rgb(55, 83, 109)',
)
fig = go.Figure()
fig.add_trace(january_bars)
fig.update_layout(title_text='Temporal distribution of tweets in January 2018')

In [None]:
# @title Chart 7: Temporal distribution of tweets in Jan 2018 (by user)

# Rows: 'created_at' values; columns: usernames; cells: tweet counts.
counts_by_date_and_user = df_2018.groupby(['created_at', 'user_username'])['text'].count()
df_user_date = counts_by_date_and_user.unstack('user_username')
# One stacked bar per date, one segment per username.
fig = px.bar(df_user_date, barmode='stack')
fig.show()


In [None]:
# @title Chart 8: Likes and Retweets of Tweets in Jan 2018

# One marker per January 2018 tweet: date on x, likes on y, marker size from
# the retweet count, colour from the cluster, username shown on hover.
# The figure title names the month so it cannot be confused with Chart 5,
# which plots the whole dataset.
fig = px.scatter(df_2018, x="created_at", y="like_count", size="retweet_count", color="cluster", hover_name="user_username",
                 title="Likes and Retweets of Tweets in Jan 2018")
fig.update_layout(
    xaxis_title="Date",
    yaxis_title="Number of Likes",
    legend_title="Cluster",
)
fig.show()

## **Comparison between June 2017, January 2018 and May 2019**


**Statistics:**

June 2017:
*   **376** tweets were posted in June 2017 by **26** users
*   **14** users belong to cluster blue and **12** to cluster purple
*   **132** tweets were posted by cluster: **purple**
*   **244** tweets were posted by cluster: **blue**


January 2018:
*   **560** tweets were posted in January 2018 by **27** users
*   **13** users belong to cluster blue and **14** to cluster purple
*   **234** tweets were posted by cluster: **purple**
*   **326** tweets were posted by cluster: **blue**

May 2019:
*   **74** tweets were posted in May 2019 by **14** users
*   **5** users belong to cluster blue and **9** to cluster purple
*   **33** tweets were posted by cluster: **purple**
*   **41** tweets were posted by cluster: **blue**






In [None]:
# @title Run this cell to generate data about June 2017 and May 2019

# Rows whose date, rendered as a string, starts with the given 'YYYY-MM' prefix.
# .copy() makes each subset an independent frame, so columns added to it in
# later cells are not written to a slice of `df` (which triggers pandas'
# SettingWithCopyWarning unless that warning has been switched off).
df_2017 = df[df['created_at'].astype(str).str.startswith('2017-06')].copy()
df_2019 = df[df['created_at'].astype(str).str.startswith('2019-05')].copy()
print('Complete!')

In [None]:
# @title Number of tweets per user in June 2017

# Number of June 2017 rows (tweets) per username, largest first.
tweets_per_user_2017 = df_2017['user_username'].value_counts()
tweets_per_user_2017

In [None]:
# @title Number of tweets per user in May 2019

# Number of May 2019 rows (tweets) per username, largest first.
tweets_per_user_2019 = df_2019['user_username'].value_counts()
tweets_per_user_2019

In [None]:
# @title Set parameters to see list of n-grams for June 2017

# Clean the raw tweet text with `cleaner` (defined in the n-gram cell above:
# it strips @mentions, links and '#' signs) and remove duplicated tweets,
# keeping the first row for each distinct raw 'text'.
# assign() builds a new frame instead of writing a column into df_2017.
df_2017 = (
    df_2017
    .assign(text_clean=df_2017['text'].map(cleaner))
    .drop_duplicates(subset=['text'])
)

# Edit your list of stopwords manually

add_to_stopwords = ['rt','twitter']
# Append only the terms that are not in the list yet, so running this cell
# repeatedly does not keep growing `stopwords` with duplicates.
stopwords = stopwords + [word for word in add_to_stopwords if word not in stopwords]

# Define content
user_2017 = 'netzpolitik' # @param {type:"string"}
cluster_2017 = 'blue' # @param {type:"string"}
all_2017 = df_2017[['text_clean']]
df_user_2017 = df_2017[df_2017['user_username'] == user_2017]
per_user_2017 = df_user_2017[['text_clean']]
df_cluster_2017 = df_2017[df_2017['cluster'] == cluster_2017]
per_cluster_2017 = df_cluster_2017[['text_clean']]

content_2017 = all_2017 # @param {type:"raw"}
# Renumber the rows on a new frame so the selected frame is left unmodified.
content_2017 = content_2017.reset_index(drop=True)
content_2017 = content_2017.values.tolist()  # list of one-element rows

# Join the cell values themselves, one tweet per line; str() on each row list
# would embed the list repr (brackets, quotes, escape sequences) in the corpus.
string_2017 = '\n'.join(' '.join(str(cell) for cell in row) for row in content_2017)

items_in_list = 20 # @param {type:"number"}

# The whole selection is passed as a single document.
unigrams_2017 = get_top_n_words([string_2017], stopwords=stopwords, n=items_in_list)
bigrams_2017 = get_top_n_bigram([string_2017], stopwords=stopwords, n=items_in_list)
see_top_words = bigrams_2017 # @param {type:"raw"}

see_top_words

In [None]:
# @title Set parameters to see list of n-grams for May 2019

# Clean the raw tweet text with `cleaner` (defined in the n-gram cell above:
# it strips @mentions, links and '#' signs) and remove duplicated tweets,
# keeping the first row for each distinct raw 'text'.
# assign() builds a new frame instead of writing a column into df_2019.
df_2019 = (
    df_2019
    .assign(text_clean=df_2019['text'].map(cleaner))
    .drop_duplicates(subset=['text'])
)

# Edit your list of stopwords manually

add_to_stopwords = ['rt','twitter']
# Append only the terms that are not in the list yet, so running this cell
# repeatedly does not keep growing `stopwords` with duplicates.
stopwords = stopwords + [word for word in add_to_stopwords if word not in stopwords]

# Define content
user_2019 = 'netzpolitik' # @param {type:"string"}
cluster_2019 = 'blue' # @param {type:"string"}
all_2019 = df_2019[['text_clean']]
df_user_2019 = df_2019[df_2019['user_username'] == user_2019]
per_user_2019 = df_user_2019[['text_clean']]
df_cluster_2019 = df_2019[df_2019['cluster'] == cluster_2019]
per_cluster_2019 = df_cluster_2019[['text_clean']]

content_2019 = all_2019 # @param {type:"raw"}
# Renumber the rows on a new frame so the selected frame is left unmodified.
content_2019 = content_2019.reset_index(drop=True)
content_2019 = content_2019.values.tolist()  # list of one-element rows

# Join the cell values themselves, one tweet per line; str() on each row list
# would embed the list repr (brackets, quotes, escape sequences) in the corpus.
string_2019 = '\n'.join(' '.join(str(cell) for cell in row) for row in content_2019)

items_in_list = 20 # @param {type:"number"}

# The whole selection is passed as a single document.
unigrams_2019 = get_top_n_words([string_2019], stopwords=stopwords, n=items_in_list)
bigrams_2019 = get_top_n_bigram([string_2019], stopwords=stopwords, n=items_in_list)
see_top_words = bigrams_2019 # @param {type:"raw"}

see_top_words

In [None]:
# @title Chart 9: Temporal distribution of tweets in June 2017 (by user)

# Rows: 'created_at' values; columns: usernames; cells: tweet counts.
counts_by_date_and_user_2017 = df_2017.groupby(['created_at', 'user_username'])['text'].count()
df_user_date7 = counts_by_date_and_user_2017.unstack('user_username')
# One stacked bar per date, one segment per username.
fig = px.bar(df_user_date7, barmode='stack')
fig.show()

In [None]:
# @title Chart 10: Temporal distribution of tweets in May 2019 (by user)

# Rows: 'created_at' values; columns: usernames; cells: tweet counts.
counts_by_date_and_user_2019 = df_2019.groupby(['created_at', 'user_username'])['text'].count()
df_user_date9 = counts_by_date_and_user_2019.unstack('user_username')
# One stacked bar per date, one segment per username.
fig = px.bar(df_user_date9, barmode='stack')
fig.show()

In [None]:
# Whole dataset ordered by retweet count, most retweeted first.
df.sort_values(by='retweet_count', ascending=False)

---