In [None]:
%matplotlib inline

from collections import Counter
import html
from itertools import combinations
import re

import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from wordcloud import WordCloud

In [None]:
# Register tqdm's pandas integration so that progress_apply (used for the
# text cleaning further down) is available and shows a progress bar.
tqdm.pandas()

# Exploratory data analysis of tags.csv

In [None]:
# Load the tag assignments (one row per Id/Tag pair).
# By default read_csv converts strings such as "null", "nan", "NA" or "None"
# into NaN. Those can be legitimate tag names, so only treat genuinely empty
# cells as missing and keep every other value as a literal string.
df_tags = pd.read_csv(
    "data/tags.csv",
    encoding="ISO-8859-1",
    keep_default_na=False,
    na_values=[""],
)

In [None]:
# Total number of rows in the tags table (1.9 million)
df_tags.shape[0]

In [None]:
# Preview the first 20 rows of the tags table
df_tags.head(20)

In [None]:
# Preview the last 20 rows of the tags table
df_tags.tail(20)

In [None]:
# Number of distinct Ids that have at least one tag row (607k)
df_tags["Id"].nunique()

In [None]:
# Number of distinct tags (17k); nunique() ignores missing values
df_tags["Tag"].nunique()

In [None]:
# How often each tag occurs across all rows; value_counts() orders the
# result by count, most frequent first, which the head/tail cells rely on.
tag_counts = df_tags["Tag"].value_counts()

In [None]:
# The 20 most common tags (many python related tags)
tag_counts.head(20)

In [None]:
# The 20 least common tags (many tags occur only once)
tag_counts.tail(20)

In [None]:
# Count number of tags per ID: one entry per Id whose value is the number
# of rows (i.e. tag assignments) that Id has in the tags table
tags_per_id = df_tags.groupby("Id").size()

In [None]:
# Plot distribution of the number of tags per ID.
# The counts are small integers, so use exactly one bin per integer value
# (centred on that value) instead of 50 mostly empty bins.
max_tags = int(tags_per_id.max())
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(tags_per_id, bins=list(range(1, max_tags + 2)), align="left", rwidth=0.8)
ax.set_xticks(list(range(1, max_tags + 1)))
ax.set_xlabel("Number of Tags")
ax.set_ylabel("Number of occurrences")
ax.set_title("Distribution of tags per ID")
plt.show()

In [None]:
# Average number of tags per ID (3.1)
tags_per_id.mean()

In [None]:
# Largest number of tags attached to a single ID (5)
tags_per_id.max()

In [None]:
# Function to calculate tag co-occurrence using the Counter class
def calculate_cooccurrence(dataframe: pd.DataFrame) -> Counter:
    """Count how often each unordered pair of tags shares the same Id.

    Args:
        dataframe: Frame with an "Id" column and a "Tag" column, one row per
            (Id, Tag) assignment.

    Returns:
        Counter mapping ``(tag_a, tag_b)`` to the number of Ids carrying both
        tags. Each pair is stored in sorted order (``tag_a < tag_b``), so the
        same two tags always land on the same key regardless of row order.
    """
    cooccurrence = Counter()
    for _, tags in dataframe.groupby("Id")["Tag"]:
        # Missing tags cannot be paired (nor sorted next to strings), and a
        # repeated tag would only yield a meaningless (tag, tag) self-pair.
        unique_tags = sorted(tags.dropna().unique())
        # Sorting first makes combinations() emit a canonical key per pair,
        # so ("a", "b") and ("b", "a") are no longer counted separately.
        cooccurrence.update(combinations(unique_tags, 2))
    return cooccurrence

In [None]:
# Calculate tag co-occurrence
# most_common() converts the Counter into a list of (tag_pair, count)
# tuples ordered from the most to the least frequent pair.
tag_cooccurrence = calculate_cooccurrence(df_tags).most_common()

In [None]:
# The 20 tag pairs that co-occur most often
tag_cooccurrence[:20]

In [None]:
# The 20 tag pairs that co-occur least often (end of the descending list)
tag_cooccurrence[-20:]

# Exploratory data analysis of questions.csv

In [None]:
# Load the questions table; the file is read as ISO-8859-1 (Latin-1), the
# same encoding used for tags.csv above.
df_quest = pd.read_csv("data/questions.csv", encoding="ISO-8859-1")

In [None]:
# Total number of rows (607k, should be identical to the number of IDs in tags.csv)
df_quest.shape[0]

In [None]:
# Preview the first 20 questions
df_quest.head(20)

In [None]:
# Preview the last 20 questions
df_quest.tail(20)

In [None]:
# Smallest CreationDate value.
# NOTE(review): the column is not parsed as datetime when the file is loaded,
# so for text values this is a string comparison; it equals the earliest date
# only if the timestamps use a sortable format such as ISO-8601 (TODO confirm).
df_quest["CreationDate"].min()

In [None]:
# Largest CreationDate value.
# NOTE(review): compared as loaded (not parsed as datetime); this is the
# latest date only if the timestamp format sorts chronologically (TODO confirm).
df_quest["CreationDate"].max()

**Since the task is to train an NLP model that predicts the tags of questions, and due to time constraints, I will do an EDA only for the columns Title and Body.**

In [None]:
# Do not truncate long strings when displaying frames/series, so titles and
# bodies can be read in full. This is a global display option and stays in
# effect for all following cells.
pd.set_option("display.max_colwidth", None)

In [None]:
# First 20 raw titles
df_quest["Title"].head(20)

In [None]:
# First 20 raw bodies
df_quest["Body"].head(20)

Looking at the examples there are some findings:
- There are many HTML tags which are irrelevant for the classification task
- In addition, there are also many newline and carriage return characters that can be removed
- The characters < and > occur in some source code snippets and are HTML-escaped (as `&lt;` and `&gt;`), so they need to be decoded
- One could think about removing special characters and numbers

In [None]:
# Basic preprocessing of the text columns
def preprocess_text(text: str) -> str:
    """Turn a raw question title/body into plain, lowercase text.

    Steps, in order:
      1. Replace every HTML tag with a space.
      2. Decode HTML entities (``&lt;``, ``&gt;``, ``&amp;``, ``&quot;``, ...).
      3. Collapse all whitespace runs (including newlines and carriage
         returns) into a single space and trim the ends.
      4. Lowercase the result.

    Args:
        text: Raw text, possibly containing HTML markup. Non-string values
            (e.g. NaN for a missing cell) are treated as empty text.

    Returns:
        The cleaned text; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Tags are stripped BEFORE entities are decoded, so escaped source code
    # such as "&lt;div&gt;" survives as the literal text "<div>".
    # "[^>]*" also matches tags that span several lines, and substituting a
    # space (not "") keeps "<p>a</p><p>b</p>" from collapsing into "ab".
    text = re.sub(r"<[^>]*>", " ", text)

    # Decode all HTML entities, not only &lt; and &gt;
    text = html.unescape(text)

    # Replace any whitespace run (spaces, tabs, \n, \r) with a single space
    text = re.sub(r"\s+", " ", text).strip()

    # Lowercase text
    return text.lower()

In [None]:
# Clean every title with preprocess_text; progress_apply shows a tqdm progress bar
df_quest["Title_Clean"] = df_quest["Title"].progress_apply(preprocess_text)

In [None]:
# Clean every body with preprocess_text; progress_apply shows a tqdm progress bar
df_quest["Body_Clean"] = df_quest["Body"].progress_apply(preprocess_text)

In [None]:
# First 20 cleaned titles
df_quest["Title_Clean"].head(20)

In [None]:
# First 20 cleaned bodies
df_quest["Body_Clean"].head(20)

In [None]:
# Last 20 cleaned bodies
df_quest["Body_Clean"].tail(20)

In [None]:
# Calculating length (characters) and word count (whitespace-separated tokens)
# with the vectorised .str accessor; missing values propagate as NaN instead
# of raising inside a per-row Python callback.
# NOTE(review): these are measured on the raw columns, so the Body figures
# include HTML markup; switch to Title_Clean/Body_Clean if the cleaned
# text is what should be measured.
df_quest["Title_length"] = df_quest["Title"].str.len()
df_quest["Body_length"] = df_quest["Body"].str.len()
df_quest["Title_word_count"] = df_quest["Title"].str.split().str.len()
df_quest["Body_word_count"] = df_quest["Body"].str.split().str.len()

In [None]:
# Plotting: one histogram per metric on a 2x2 grid
# (top row: character lengths, bottom row: word counts).
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# (column, panel title, x-axis label, colour, log-scaled y-axis)
# Both Body panels use a logarithmic count axis.
histogram_specs = [
    ("Title_length", 'Title Length Distribution', "Characters", 'blue', False),
    ("Body_length", 'Body Length Distribution', "Characters", 'green', True),
    ("Title_word_count", 'Title Word Count Distribution', "Words", 'red', False),
    ("Body_word_count", 'Body Word Count Distribution', "Words", 'orange', True),
]

# axes.flat walks the grid row by row, matching the order of the specs above
for ax, (column, title, xlabel, color, use_log) in zip(axes.flat, histogram_specs):
    ax.hist(df_quest[column], bins=30, color=color, alpha=0.7, log=use_log)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Number of questions (log scale)" if use_log else "Number of questions")

plt.tight_layout()
plt.show()

In [None]:
# Generate word cloud for Title column from all cleaned titles joined into
# one text. random_state fixes the otherwise random word placement and
# colours, so re-running the notebook reproduces the same figure.
wordcloud_title = WordCloud(
    width=800,
    height=800,
    background_color='white',
    max_words=200,
    random_state=42,
).generate(" ".join(df_quest['Title_Clean']))

In [None]:
# Render the title word cloud as an image, with the axes hidden
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(wordcloud_title, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud for Title')
plt.show()