# Exploratory Data Analysis

# Get necessary packages and Data
Check your data

In [51]:
import re
import string
import spacy
nlp = spacy.load('en_core_web_lg')
import pickle
import pandas as pd
pd.set_option('display.max_rows', 10)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/bilgesipal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bilgesipal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the pickled corpus and its labels from the parent directory.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('../df_data.pkl', "rb") as data_file:
    df_data = pickle.load(data_file)
print(len(df_data))

with open('../label.pkl', "rb") as label_file:
    label = pickle.load(label_file)

In [None]:
print('type of the data:', type(df_data))
print('size of the data:', len(df_data))

In [None]:
df_data[:2]

In [None]:
str(label[0])

In [None]:
# Keep the raw labels as a plain list, then rebind `label` to their string forms.
label_list = label.tolist()
label = list(map(str, label_list))

## Convert the Corpus to a Data Frame

In [None]:
# Pair every document with its label as a {'data': ..., 'label': ...} record.
# NOTE(review): this pairs with the raw `label_list`, not the stringified
# `label` built earlier — confirm that is intended.
column_names = ('data', 'label')
stp_corpus = [dict(zip(column_names, pair)) for pair in zip(df_data, label_list)]

# Build the DataFrame from the records and preview the first rows.
df = pd.DataFrame(stp_corpus)

df.head()


In [None]:
data = df['data'].iloc[0]
data

# Clean the Data

Text data cleaning, also known as text pre-processing, is essential for improving model performance. Since text cleaning can be an endless process, we will start simple and refine iteratively.
Basic Cleaning Steps:

    Convert text to lowercase
    Remove punctuation and numbers
    Eliminate nonsensical text (e.g., \n)
    Tokenize text
    Remove stop words

Advanced Cleaning After Tokenization:

    Stemming/Lemmatization
    POS tagging
    Bigram/Trigram creation
    Handling typos

More refinements can be applied later to enhance results.




## Examples --> Playground

In [None]:
import re

# Original text
text = """From: bil@okcforum.osrhe.edu (Bill Conner)\nSubject: Re: Not the Omni!\nNntp-Posting-Host: okcforum.osrhe.edu\nOrganization: Okcforum Unix Users Group\nX-Newsreader: TIN [version 1.1 PL6]\nLines: 18\n\nCharley Wingate (mangoe@cs.umd.edu) wrote:\n: \n: >> Please enlighten me.  How is omnipotence contradictory?\n: \n: >By definition, all that can occur in the universe is governed by the rules\n: >of nature. Thus god cannot break them. Anything that god does must be allowed\n: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts\n: >the rules of nature.\n: \n: Obviously, an omnipotent god can change the rules.\n\nWhen you say, "By definition", what exactly is being defined;\ncertainly not omnipotence. You seem to be saying that the "rules of\nnature" are pre-existant somehow, that they not only define nature but\nactually cause it. If that\'s what you mean I\'d like to hear your\nfurther thoughts on the question.\n\nBill\n"""

# Substitutions applied in order; every match of (pattern, flags) is deleted.
removal_steps = [
    # Header lines (From, Subject, ...), including their trailing newline.
    (r"^(From|Subject|Nntp-Posting-Host|Organization|X-Newsreader|Lines):.*\n?", re.MULTILINE),
    # Email addresses.
    (r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", 0),
    # Every < and > character.
    (r"[<>]", 0),
    # Quote markers (: and >) and whitespace at the start of each line.
    (r"^[:>\s]+", re.MULTILINE),
]

cleaned_text = text
for pattern, flags in removal_steps:
    cleaned_text = re.sub(pattern, "", cleaned_text, flags=flags)

# Collapse each run of newlines into one space, then trim both ends.
cleaned_text = re.sub(r"\n+", " ", cleaned_text).strip()

# Display cleaned text
print(cleaned_text)


In [None]:
doc = nlp(cleaned_text)

In [None]:

# Keep the lemma of every token that is neither a stop word nor punctuation.
kept_lemmas = [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]
lemmatized_text = " ".join(kept_lemmas)

# Show the lemmatized text with stop words and punctuation removed.
print(lemmatized_text)

In [None]:
type(lemmatized_text)

In [None]:
# # Pickle files for later use

#     with open("data.txt", "wb") as file:
#         pickle.dump(transcripts[i], file)

## The Cleaner Functions

In [None]:
def clean_text(text):
    """
    Clean a raw newsgroup/email message with a fixed sequence of regex steps.

    The steps run in this order:

    1. **Remove metadata headers**: deletes lines that start with
       'From', 'Subject', 'Nntp-Posting-Host', 'Organization',
       'X-Newsreader', or 'Lines' followed by a colon.
    2. **Remove email addresses**: deletes any email-like pattern.
    3. **Remove leading quote markers**: deletes `:`, `>` and whitespace
       at the beginning of every line.
    4. **Remove digits**: deletes all numeric characters.
    5. **Collapse newlines**: replaces each run of newlines with one space.
    6. **Remove angle brackets**: deletes every remaining `<` and `>`.
    7. **Trim**: strips leading and trailing whitespace from the result.

    Runs of spaces inside the text are not collapsed.

    Args:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text with unwanted elements removed.
    """
    cleaned_text = re.sub(r"^(From|Subject|Nntp-Posting-Host|Organization|X-Newsreader|Lines):.*\n?", "", text, flags=re.MULTILINE)
    cleaned_text = re.sub(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", "", cleaned_text)
    # `\s` also matches newlines, so blank lines before a quoted line are consumed too.
    cleaned_text = re.sub(r"^[:>\s]+", "", cleaned_text, flags=re.MULTILINE)
    cleaned_text = re.sub(r"\d+", "", cleaned_text)
    cleaned_text = re.sub(r"\n+", " ", cleaned_text)
    cleaned_text = re.sub(r"[<>]", "", cleaned_text)

    # Strip last: removing a bracket at either end can expose whitespace
    # that an earlier strip would have missed.
    return cleaned_text.strip()


In [None]:
def lemmatizer(cleaned_text):
    """
    Lemmatize text with the module-level spaCy pipeline ``nlp``.

    The text is run through ``nlp``. Every token flagged as a stop word
    or as punctuation is skipped, and the lemmas of the remaining tokens
    are joined with single spaces.

    Args:
        cleaned_text (str): The input text that has already been preprocessed.

    Returns:
        str: The space-separated lemmas of the kept tokens.

    Example (exact output depends on the loaded spaCy model):
        lemmatizer("The dogs are running quickly towards the park.")
        # e.g. 'dog run quickly park'
    """
    kept_lemmas = []
    for token in nlp(cleaned_text):
        # Skip stop words and punctuation; everything else contributes its lemma.
        if token.is_stop or token.is_punct:
            continue
        kept_lemmas.append(str(token.lemma_))

    return " ".join(kept_lemmas)

In [None]:
df.head()

In [None]:
# Clean every raw document; the function is passed directly, no lambda wrapper needed.
df['clean_text'] = df['data'].apply(clean_text)

In [None]:
df.clean_text.iloc[2]

In [None]:
# Lemmatize every cleaned document into the `new_data` column.
df['new_data'] = df['clean_text'].apply(lemmatizer)

In [None]:
# Persist only the first 100 rows of the processed frame.
with open("../clean_df.pkl", "wb") as out_file:
    pickle.dump(df[:100], out_file)

## Visualise --EDA

### WORD CLOUDS

A Word Cloud (also called a tag cloud) is a visual representation of text data, where:

    More frequent words appear larger, while
    Less frequent words appear smaller

It is commonly used in NLP and text analysis to identify important words in a dataset.
Example Use Cases

    Analyzing customer reviews (e.g., finding common themes in feedback)
    Summarizing large text data (e.g., extracting key terms from news articles)
    Exploring social media trends (e.g., identifying frequently used words in tweets)

In [None]:
# Reload the saved sample of the processed frame.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open("../clean_df.pkl", "rb") as in_file:
    data_clean = pickle.load(in_file)

In [None]:
data_clean.head()

In [None]:
data_clean.label.value_counts()

In [None]:
# Concatenate all lemmatized documents of each label into one space-separated string.
texts_by_label = data_clean.groupby('label')['new_data']
grouped_df = texts_by_label.agg(' '.join).reset_index().copy()
grouped_df

In [None]:
# Let's make some word clouds!
from wordcloud import WordCloud

# One generator, reused for every topic, configured from a single settings mapping.
wc_settings = {
    "stopwords": stop_words,
    "background_color": "white",
    "colormap": "Dark2",
    "max_font_size": 150,
    "random_state": 42,
}
wc = WordCloud(**wc_settings)

In [None]:
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 6]

full_names = ['0', '1']

# Draw one word cloud per topic, side by side in a single figure.
for i, name in enumerate(full_names):
    wc.generate(grouped_df.new_data.iloc[i])

    # Give each topic its own panel; subplot indices are 1-based.
    # (The previous constant index `2+1` put every cloud in the same slot.)
    plt.subplot(1, len(full_names), i + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(name)

# Render once, after all panels are drawn, so they share one figure.
plt.show()

## Create the Document-Term Matrix and Do Further Analysis

In [None]:
# Let's create our document-term matrix
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the per-label texts and count each term per text.
cv = CountVectorizer()
data_cv = cv.fit_transform(grouped_df['new_data'])



In [None]:
# Dense document-term table: one row per fitted text, one column per vocabulary term.
data_new = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
# NOTE(review): the former `data_new.index = data_new.index` was a no-op
# self-assignment and has been dropped. Presumably the rows were meant to be
# labelled (e.g. with grouped_df.label) — confirm before changing the index.


In [None]:
data_new.head()

In [None]:
data_new.columns[:10]

In [None]:
data_new['abbreviations'].sort_values(ascending=False)

In [None]:
# Flip to a term-by-document layout: one row per term, one column per document.
data = data_new.T
data.head()


In [None]:
# Find the top 10 words in each topic
# First rank every column by descending count and keep the 10 highest entries.
ranked = {c: data[c].sort_values(ascending=False).head(10) for c in data}
# Then store them as (word, count) pairs keyed by column.
top_dict = {c: list(zip(top.index, top.values)) for c, top in ranked.items()}

top_dict

In [None]:
# Print the top 10 words per topic
for topic, top_words in top_dict.items():
    # Keep only the word from each (word, count) pair.
    words = [pair[0] for pair in top_words[0:10]]
    print(topic, ', '.join(words), '---', sep='\n')

In [None]:
# Find the number of unique words that each topic has

# A word counts for a topic when its entry in that topic's column is non-zero,
# i.e. the word occurs at least once.
unique_list = [int((data[topic] != 0).sum()) for topic in data.columns]

# Pair each topic name with its unique-word count, then rank ascending.
topic_rows = list(zip(full_names, unique_list))
data_words = pd.DataFrame(topic_rows, columns=['topic', 'unique_words'])
data_unique_sort = data_words.sort_values('unique_words')
data_unique_sort

In [None]:
# Calculate the words per mail of each topic

# Total number of word occurrences per topic (the sum of each column).
total_list = [data[topic].sum() for topic in data.columns]

# Number of mails per topic, counted from the cleaned data rather than
# hard-coded, so it stays correct if the saved sample changes.
# groupby orders by label, the same ordering used to build grouped_df.
mails = data_clean.groupby('label').size().tolist()

# Let's add some columns to our dataframe
data_words['total_words'] = total_list
data_words['mail_number'] = mails
data_words['words_per_mail'] = data_words['total_words'] / data_words['mail_number']

# Sort the dataframe by words per mail, to see which topics are more verbose
data_wpm_sort = data_words.sort_values(by='words_per_mail')
data_wpm_sort