<a href="https://colab.research.google.com/github/caiocmello/netzdg/blob/main/NetzDG_context.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NetzDG Context**
This notebook helps you explore the context in which words are used in the NetzDG debate data. It uses two datasets:
1. 'df_netzdg_blue&purple.csv'
2. 'df_colours.csv'

In [None]:
# @title Run this cell to load the necessary libraries and data
import pandas as pd
import nltk
import spacy
import plotly.express as px
import plotly.graph_objects as go
import re
import os
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from spacy.lang.en.stop_words import STOP_WORDS as en_stopwords
from spacy.lang.de.stop_words import STOP_WORDS as de_stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the data frames into Colab.
# The first dataset is read through a relative path, so the file has to be
# available in the notebook's working directory.
url = 'df_netzdg_blue&purple.csv'
df = pd.read_csv(url, index_col=0)
# The second dataset is downloaded from Google Drive through its file id.
file_id = '1NJBXn4o4TBA6mh1gbABjEtUwZRHczstW'
url2 = f'https://drive.google.com/uc?id={file_id}'
df_colours = pd.read_csv(url2, encoding='latin1')
# Display the complete tweet text instead of truncating long cells.
pd.set_option('display.max_colwidth', None)
# Remove repeated tweets (identical 'text') within each dataset; the frames are
# reassigned rather than mutated in place.
df = df.drop_duplicates(subset='text')
df_colours = df_colours.drop_duplicates(subset='text')
df_colours = df_colours[['user_username', 'text','retweet_count', 'like_count', 'created_at', 'cluster']]
# Stack both datasets. ignore_index=True gives every row a unique label: keeping
# the original labels could leave duplicates whenever the two frames overlap,
# and the column-wise pd.concat in the context cell cannot align on a
# duplicated index.
conc_df = pd.concat([df, df_colours], axis=0, ignore_index=True)

## Context Analysis

In [None]:
# @title Choose a word to see context (column 'word' shows the match exactly as it is written in the tweet)
word = "Freiheit" # @param {type:"string"}

# Keep only the tweets that mention the word. `word` is interpreted as a regular
# expression and matched case-insensitively; missing texts count as "no match".
sentences_with_word = conc_df[conc_df['text'].str.contains(word, case=False, na=False)]

# Locate the first occurrence in every tweet with the same case-insensitive
# pattern as the filter above. Splitting with a case-sensitive pattern would
# leave tweets such as "freiheit" or "#FREIHEIT" unsplit although they passed
# the filter.
word_regex = re.compile(word, flags=re.IGNORECASE)
first_matches = [word_regex.search(text) for text in sentences_with_word['text']]

# Text before the match, the match itself, and the text after it.
# Building the frame from lists also works when no tweet matched at all.
split_sentences = pd.DataFrame(
    {
        'Before': [m.string[:m.start()] for m in first_matches],
        'word': [m.group(0) for m in first_matches],
        'After': [m.string[m.end():] for m in first_matches],
    },
    index=sentences_with_word.index,
)

# Put the tweet metadata next to the split text. Both sides are renumbered
# 0..n-1 first, so the rows are paired by position and the result does not
# depend on conc_df having unique index labels.
tweet_columns = ['user_username', 'retweet_count', 'like_count', 'created_at', 'cluster']
merged_df = pd.concat(
    [
        split_sentences.reset_index(drop=True),
        sentences_with_word[tweet_columns].reset_index(drop=True),
    ],
    axis=1,
)
merged_df = merged_df[['user_username','Before', 'word', 'After', 'retweet_count', 'like_count', 'created_at', 'cluster']]
merged_df

In [None]:
# Number of tweets containing the searched word, per author (most active first)
tweets_per_user = merged_df['user_username'].value_counts()
tweets_per_user

In [None]:
# @title Generate a wordcloud with the most mentioned words in text where the word chosen above is mentioned
# Uses `word` from the context cell and `conc_df` from the setup cell.

from wordcloud import WordCloud
import matplotlib.pyplot as plt

stopwords_de = list(de_stopwords)
stopwords_en = list(en_stopwords)
stopwords = stopwords_de + stopwords_en #create a mixed list of stopwords (German and English)
# Edit your list of stopwords manually
add_to_stopwords = ['rt', 'https', 't', 'co']
stopwords = stopwords + add_to_stopwords

# Tweets mentioning the word (case-insensitive; missing texts are skipped),
# joined into a single string for the word cloud.
sentences_with_word = conc_df[conc_df['text'].str.contains(word, case=False, na=False)]
text = " ".join(sentences_with_word['text'].tolist())

if sentences_with_word.empty:
    # WordCloud.generate raises a ValueError when it receives no words, so
    # report the empty result instead of crashing the cell.
    print(f"No tweets mention '{word}' - there is nothing to plot.")
else:
    wordcloud = WordCloud(width=800, height=600, background_color='white', stopwords=stopwords).generate(text)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# @title Choose a cluster (e.g. blue or purple) to see the context of the word chosen above
cluster = 'orange' # @param {type:"string"}

# Tweets of the selected cluster that mention the word. `word` is interpreted
# as a regular expression and matched case-insensitively.
df_cluster = conc_df[conc_df['cluster'] == cluster]
sentences_with_word_cluster = df_cluster[df_cluster['text'].str.contains(word, case=False, na=False)]

# Locate the first occurrence in every tweet with the same case-insensitive
# pattern as the filter above, so that tweets written with a different casing
# (e.g. inside a hashtag) are split as well.
word_regex = re.compile(word, flags=re.IGNORECASE)
first_matches = [word_regex.search(text) for text in sentences_with_word_cluster['text']]

# Text before the match, the match as written in the tweet, and the text after.
# Building the frame from lists also works when the cluster has no matching tweet.
split_sentences_cluster = pd.DataFrame(
    {
        'Before': [m.string[:m.start()] for m in first_matches],
        'word': [m.group(0) for m in first_matches],
        'After': [m.string[m.end():] for m in first_matches],
    },
    index=sentences_with_word_cluster.index,
)
split_sentences_cluster

In [None]:
# @title Generate a wordcloud for BLUE
# Word cloud of the blue-cluster tweets that mention `word`.
# WordCloud, plt and stopwords come from the general wordcloud cell above.

df_blue = conc_df[conc_df['cluster'] == 'blue']
df_purple = conc_df[conc_df['cluster'] == 'purple']
sentences_with_word_blue = df_blue[df_blue['text'].str.contains(word, case=False, na=False)]
sentences_with_word_purple = df_purple[df_purple['text'].str.contains(word, case=False, na=False)]

# One long string per cluster; text_purple is consumed by the PURPLE cell.
text_blue = " ".join(sentences_with_word_blue['text'].tolist())
text_purple = " ".join(sentences_with_word_purple['text'].tolist())

if sentences_with_word_blue.empty:
    # WordCloud.generate raises a ValueError when it receives no words, so
    # report the empty result instead of crashing the cell.
    print(f"No tweets in the blue cluster mention '{word}' - there is nothing to plot.")
else:
    wordcloud_b = WordCloud(width=800, height=600, background_color='white', stopwords=stopwords).generate(text_blue)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud_b, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
# @title Generate a wordcloud for PURPLE
# Word cloud of the purple-cluster tweets that mention `word`.
# WordCloud, plt and stopwords come from the general wordcloud cell above.

# Build the purple text in this cell so that it does not depend on the BLUE
# cell having been run first.
df_purple = conc_df[conc_df['cluster'] == 'purple']
sentences_with_word_purple = df_purple[df_purple['text'].str.contains(word, case=False, na=False)]
text_purple = " ".join(sentences_with_word_purple['text'].tolist())

if sentences_with_word_purple.empty:
    # WordCloud.generate raises a ValueError when it receives no words, so
    # report the empty result instead of crashing the cell.
    print(f"No tweets in the purple cluster mention '{word}' - there is nothing to plot.")
else:
    wordcloud_p = WordCloud(width=800, height=600, background_color='white', stopwords=stopwords).generate(text_purple)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud_p, interpolation='bilinear')
    plt.axis('off')
    plt.show()