# ACIT5900: Master Thesis
### *Exploratory Data Analysis*

>-------------------------------------------
> *Spring 2025*

>--------------------------------------------

<a id="top"></a>
1. [**Basic Statistics**](#statistics)<br>
2. [**Visualize Content Column**](#content-visualization)<br>
3. [**Visualize Other Columns**](#others-visualization)<br>
4. [**Co-authorship Knowledge Graph**](#knowledge-graph)<br>

In [None]:
!pip install networkx matplotlib

In [None]:
# import modules needed
import re
import string
import pandas as pd
import networkx as nx
from ast import literal_eval
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from itertools import combinations
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
# read data
# load 'df_cleaned.csv' (path relative to the working directory) into `df`
df = pd.read_csv('df_cleaned.csv')
# preview the first rows as the cell's output
df.head()

## <a id="statistics"></a> 1) Basic Statistics

[⬆️ Back to Top](#top)

In [None]:
# investigate basic statistics
# .str.len() is vectorized and leaves a missing document as NaN, whereas
# apply(len) raises TypeError as soon as one cell is NaN (a float);
# describe() then summarizes only the documents that actually have content
df['text_length'] = df['cleaned_content'].str.len()
df['text_length'].describe()

## <a id="content-visualization"></a> 2) Visualize Content Column

[⬆️ Back to Top](#top)

In [None]:
# list of words to remove
stop_words = set(ENGLISH_STOP_WORDS)
punctuation_chars = set(string.punctuation)

# tokenize the whole corpus on whitespace
words = ' '.join(df['cleaned_content']).split()

# remove stop words, numbers, and tokens made up solely of punctuation.
# `word not in string.punctuation` was a substring test against one long
# string, so multi-character tokens such as '--' or '...' slipped through;
# checking every character against the punctuation set removes them too
filtered_words = [
    word for word in words
    if word.lower() not in stop_words
    and not word.isdigit()
    and not all(char in punctuation_chars for char in word)
]

# count word frequencies
word_counts = Counter(filtered_words)

# top 20 most common words
common_words = word_counts.most_common(20)
print(common_words)

In [None]:
# build the word cloud from the word frequencies
cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(word_counts)

# render the cloud on a 10x5 figure with the axes hidden
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# count bigrams (English stop words removed), capped at 20 features
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    stop_words='english',
    max_features=20,
)
X = vectorizer.fit_transform(df['cleaned_content'])

# extract top 20 bigrams: total each bigram column and list them by frequency
bigram_names = vectorizer.get_feature_names_out()
bigram_df = pd.DataFrame(X.toarray(), columns=bigram_names)
bigram_totals = bigram_df.sum(axis=0)
print(bigram_totals.sort_values(ascending=False).head(20))


In [None]:
# numpy is only needed here, to flatten the column sums of the sparse matrix
import numpy as np

# words to exclude from bigram
stop_words = ['et al', '10 10', 'doi']

# extract bigrams without limiting the number of features
vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
X = vectorizer.fit_transform(df['cleaned_content'])

# get all bigrams: sum each column of the sparse matrix directly.
# With an unbounded vocabulary, X.toarray() would allocate a dense
# documents x bigrams matrix, so no dense DataFrame is built in this cell.
bigram_totals = np.asarray(X.sum(axis=0)).ravel()
all_bigrams = pd.Series(bigram_totals, index=vectorizer.get_feature_names_out())

# filter out the unwanted bigrams.
# The terms are escaped and anchored on word boundaries so that only whole
# words match: a plain substring pattern also dropped unrelated bigrams such
# as 'market also' (contains 'et al') or 'doing research' (contains 'doi').
exclude_pattern = r'\b(?:' + '|'.join(re.escape(term) for term in stop_words) + r')\b'
filtered_bigrams = all_bigrams[~all_bigrams.index.str.contains(exclude_pattern, regex=True)]

# create word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(filtered_bigrams.to_dict())

# plot the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

## <a id="others-visualization"></a> 3) Visualize Other Columns

[⬆️ Back to Top](#top)

In [None]:
# number of articles per publication year, in chronological order
articles_per_year = df['year_published'].value_counts().sort_index()

# bar chart of the yearly counts
ax = articles_per_year.plot(kind='bar', figsize=(10, 5))
ax.set_title('Distribution of Articles by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
plt.show()

## <a id="knowledge-graph"></a> 4) Co-authorship Knowledge Graph

[⬆️ Back to Top](#top)

In [None]:
# inspect the author column on its own (double brackets select a one-column DataFrame)
df[['cleaned_authors']]

In [None]:
# convert to python list
def _to_author_list(raw_value):
    """Evaluate a string cell as a Python literal; return any other value unchanged."""
    if isinstance(raw_value, str):
        return literal_eval(raw_value)
    return raw_value


df['cleaned_authors'] = df['cleaned_authors'].apply(_to_author_list)

In [None]:
def normalize_authors(authors):
    """
    Clean and normalize names to 'Firstname Lastname' format,
    splitting up incorrectly merged names if needed.

    Parameters
    ----------
    authors : iterable of str, str, or a missing value
        Author entries of one article. A bare string is treated as a single
        entry; a non-iterable value (None, NaN) is treated as "no authors".

    Returns
    -------
    list of str
        Normalized, non-empty names in their original order. Entries that
        are not strings or that are blank after cleaning are dropped.
    """
    # a bare string is one author entry, not an iterable of characters
    if isinstance(authors, str):
        authors = [authors]

    try:
        entries = list(authors)
    except TypeError:
        # None / NaN: nothing to normalize
        return []

    normalized = []
    for name in entries:
        if not isinstance(name, str):
            continue

        # split if multiple names got merged into one string
        if ',' in name and len(name.split()) > 4:
            parts = re.split(r',| and ', name)
        else:
            parts = [name]

        for part in parts:
            part = part.strip()
            if ',' in part:
                # 'Last, First' -> 'First Last'; the outer strip() covers a
                # missing half such as 'Smith,' which would otherwise give ' Smith'
                last, first = part.split(',', 1)
                part = f"{first.strip()} {last.strip()}".strip()
            # ', and ' yields an empty fragment between the two separators;
            # never emit blank names
            if part:
                normalized.append(part)
    return normalized

# run normalize_authors on each row's author value and keep the result in a new column
df['normalized_authors'] = df['cleaned_authors'].apply(normalize_authors)

In [None]:
# lower-cased spellings that are mapped to 'Gustavo Mello' by hand
_GUSTAVO_MELLO_ALIASES = frozenset({
    "mello", "g. mello", "g. b. m. mello", "gustavo b. m.",
    "gustavo borges mello", "gustavo moreno mello",
    "gustavo borges moreno e.", "gustavo borges moreno e. mello",
})


def _initial_and_surname(name):
    """Return the fuzzy key (first initial, last token) of a non-blank name, lower-cased."""
    tokens = name.lower().split()
    return tokens[0][0], tokens[-1]


def merge_variants(authors):
    """
    Merge name variants using a mix of fuzzy initials + last name,
    plus manual aliasing for known cases like 'Gustavo Mello'.

    Names sharing the same first initial and the same last token are replaced
    by the longest spelling found in `authors` (the first one on a tie).
    Only the names passed in are compared, so variants are merged within one
    author list, not across lists.

    Parameters
    ----------
    authors : iterable of str, str, or a missing value
        Author names of one article. A bare string is treated as a single
        name; a non-iterable value (None, NaN) is treated as "no authors".

    Returns
    -------
    list of str
        One canonical name per non-blank input name, in input order.
        Blank and non-string entries are dropped.
    """
    if isinstance(authors, str):
        authors = [authors]

    try:
        # blank names have no initial or surname (they used to raise IndexError)
        names = [n.strip() for n in authors if isinstance(n, str) and n.strip()]
    except TypeError:
        return []

    # longest spelling per fuzzy key, built in one pass. The surname must
    # match as a whole token: a suffix test would merge 'J. Smith' into
    # 'Jane Blacksmith'.
    longest = {}
    for name in names:
        key = _initial_and_surname(name)
        if key not in longest or len(name) > len(longest[key]):
            longest[key] = name

    merged = []
    for name in names:
        name_clean = name.lower()

        # manual fix for Gustavo Mello variants
        if ("gustavo" in name_clean and "mello" in name_clean) or name_clean in _GUSTAVO_MELLO_ALIASES:
            canonical = "Gustavo Mello"
        else:
            # choose name from longest in fuzzy group
            canonical = longest[_initial_and_surname(name)]

        merged.append(canonical)
    return merged

# run merge_variants on each row's normalized author list and keep the result in a new column
df['merged_authors'] = df['normalized_authors'].apply(merge_variants)

In [None]:
# collect one edge per pair of distinct co-authors on each article
edges = []
for authors in df['merged_authors']:
    # sort the de-duplicated names so a pair is always emitted as the same
    # (a, b) tuple. Set iteration order is not stable, so the same two authors
    # could otherwise appear as both (a, b) and (b, a); Counter would count
    # them separately and the later add_edge call would overwrite the earlier
    # weight instead of adding to it.
    unique_authors = sorted(set(authors))
    edges.extend(combinations(unique_authors, 2))

# count co-authorship frequency
edge_weights = Counter(edges)

# build the graph: one undirected edge per pair, weighted by shared articles
G = nx.Graph()
for (a1, a2), weight in edge_weights.items():
    G.add_edge(a1, a2, weight=weight)

# draw the graph
plt.figure(figsize=(18, 18))
pos = nx.spring_layout(G, k=0.7, seed=42)
degrees = dict(G.degree())

# node size grows with the number of co-authors; edge width is the edge weight
node_sizes = [300 + 100 * degrees[node] for node in G.nodes]
edge_widths = [attrs['weight'] for _, _, attrs in G.edges(data=True)]

nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightblue', edgecolors='black')
nx.draw_networkx_edges(G, pos, width=edge_widths, alpha=0.5)
nx.draw_networkx_labels(G, pos, font_size=10)

plt.title("🧹 Final Co-authorship Graph — Cleaned & Merged", fontsize=18)
plt.axis('off')
plt.tight_layout()
plt.show()
