<a href="https://colab.research.google.com/github/andrybrew/IHT-SEM1302-30Okt/blob/main/practice_material/005_text_network.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Text Network Analysis**

## **Importing Required Libraries**

In [None]:
import re
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import nltk
import pandas as pd
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('indonesian')
# can be loaded in the preprocessing step below.
nltk.download('stopwords')

## **Importing Dataset**

In [None]:
# Location of the raw tweet CSV in the course's GitHub repository.
data_url = "https://raw.githubusercontent.com/andrybrew/IHT-SEM1302-30Okt/main/data/001_suku-bunga.csv"

# pandas can read a CSV straight from a URL, so no separate download step
# is needed before building the DataFrame.
df_tweet = pd.read_csv(filepath_or_buffer=data_url)

## **Text Network Analysis Process**

### Dataset Preprocessing

In [None]:
# Clean the tweet text with vectorised string methods. Missing tweets are
# treated as empty text so that neither this cell nor later cells fail on NaN.
text_clean = (
    df_tweet['text']
    .fillna('')
    # Convert to lowercase
    .str.lower()
    # Remove URLs
    .str.replace(r'https?://\S+|www\.\S+|\S+\.\S+/\S+', '', regex=True)
    # Remove mentions
    .str.replace(r'@\S+', '', regex=True)
    # Remove non-word characters except for spaces and %
    .str.replace(r'[^\w\s%]', '', regex=True)
)

# Remove stopwords including custom stop words
stop_words = stopwords.words('indonesian')
custom_stop_words = ['dgn', 'sdh', 'yg', 'the', 'gak', 'ga', 'a', 'krn', 'thd', 'nya', 'ya', 'n', 'kalo', 'aja', 'deh', 'tuh', 'udah', 'dll.', '2', '25', '20', '1.', '2.', '7.', 'u', '5', 'gua', '•']

# Punctuation has already been stripped from the text, so entries such as
# 'dll.' or '1.' can only match once the same rule is applied to them.
# Entries that become empty (e.g. '•') are dropped, and duplicates are removed
# while keeping the original order.
stop_terms = list(dict.fromkeys(
    term
    for term in (re.sub(r'[^\w\s%]', '', word.lower()).strip()
                 for word in stop_words + custom_stop_words)
    if term
))

# re.escape keeps every stop word literal. Unescaped, the '.' in '1.' is a
# regex wildcard and silently deletes two-character tokens such as '10' or '15'.
stop_words_pattern = r'\b(?:{})\b'.format('|'.join(re.escape(term) for term in stop_terms))

# Whitespace is normalised *after* stop-word removal, because deleting words
# leaves runs of spaces (and leading/trailing spaces) behind.
text_clean = (
    text_clean
    .str.replace(stop_words_pattern, '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df_tweet['text'] = text_clean

# Copy the processed text to a new DataFrame
df_tna = df_tweet[['text']].copy()
df_tna

### Building the Network Graph

In [None]:
# Create an empty undirected graph; the next cell adds the words as nodes
# and their co-occurrences as edges.
G = nx.Graph()

In [None]:
# Link every pair of distinct words that appear together in the same text.
for text in df_tna['text'].dropna():

    # Split the text into words on any run of whitespace. Unlike split(' '),
    # this never yields empty-string tokens, which would otherwise become a
    # spurious '' node connected to most other words.
    # dict.fromkeys drops repeated words while keeping first-occurrence order.
    words = list(dict.fromkeys(text.split()))

    # The graph is undirected, so each unordered pair is needed only once.
    G.add_edges_from(combinations(words, 2))

## Visualizing the Network


In [None]:
# Visualize the graph.
# nx.draw falls back to a randomly initialised spring layout when no node
# positions are given, so the picture changes on every run. Computing the
# same layout with a fixed seed makes the figure reproducible.
pos = nx.spring_layout(G, seed=42)

# Explicit figure/axes so the plot has a readable size and a title.
fig, ax = plt.subplots(figsize=(12, 12))
nx.draw(G, pos=pos, ax=ax, font_size=7, with_labels=True)
ax.set_title("Word co-occurrence network")
plt.show()

In [None]:
# Export the edges as comma-separated "source,target" rows; data=False
# leaves edge attributes out of the file.
nx.write_edgelist(
    G,
    "text_network_edgelist.csv",
    delimiter=",",
    data=False,
)