# Exploratory Analysis

### 1- Extracting the education-related words from the Library of Congress Subject Headings (Education)

In [None]:
# Import necessary libraries
import requests
import pandas as pd

In [28]:
# Fetch the JSON file directly from URL
# NOTE(review): id.loc.gov serves Library of Congress authority records — confirm this is the intended vocabulary source.
educationGlossaryURL = "https://id.loc.gov/authorities/subjects/sh85040989.json"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
URLResponse = requests.get(educationGlossaryURL, timeout=30)
# Surface HTTP errors here rather than as a confusing JSON decoding failure below.
URLResponse.raise_for_status()
educationGlossaryData = URLResponse.json()

# Collect every "@value" entry from the JSON, as the education-related words are encoded within them.
education_related_terms = []
for entry in educationGlossaryData:
    # Only the values are inspected; the keys are irrelevant here.
    for values in entry.values():
        if not isinstance(values, list):
            continue
        for value in values:
            if isinstance(value, dict) and "@value" in value:
                education_related_terms.append(value["@value"])
# Build a dataframe of the collected terms (duplicates are still present at this stage)
df_education_related_terms = pd.DataFrame(education_related_terms, columns=["Education-related Key Word"])
print(df_education_related_terms.head(20))
print(df_education_related_terms.tail(20))
print(df_education_related_terms.size)

                      Education-related Key Word
0                     Human resource development
1                          Pictures in education
2                          Pictures in education
3   150  $aEducation$xAustralian states$xFinance
4          Education--Australian states--Finance
5          Education--Australian states--Finance
6                            Inclusive education
7                            Inclusive education
8           150  $aEducation$xEconometric models
9                  Education--Econometric models
10                 Education--Econometric models
11             150  $aEducation$xReference books
12                    Education--Reference books
13                    Education--Reference books
14                               Home and school
15                               Home and school
16                        Compensatory education
17                        Compensatory education
18                       Libraries and education
19                  

In [None]:
# Drop repeated terms, keeping the first occurrence of each one
df_education_related_terms_clean = df_education_related_terms.drop_duplicates(keep="first")
# Preview the first rows, then report the number of remaining elements
preview_rows = df_education_related_terms_clean.head(20)
print(preview_rows)
print(df_education_related_terms_clean.size)

In [None]:
# Report whether the de-duplicated table contains any missing value
has_null_entry = df_education_related_terms_clean.isna().to_numpy().any()
print(f"Null entry exists? {has_null_entry}")

In [None]:
# Remove the entries that start with a digit
# `str.match` anchors at the start of each string; `na=False` makes empty or
# missing entries evaluate to False (i.e. they are kept) instead of NaN,
# which would break the `~` negation below.
starts_with_digit = df_education_related_terms_clean["Education-related Key Word"].str.match(r"\d", na=False)
# `.copy()` yields an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning on pandas versions without copy-on-write.
df_education_related_terms_clean = df_education_related_terms_clean[~starts_with_digit].copy()
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)


In [None]:
# Turn the double-dash ('--') separator into a single white space
keyword_column = "Education-related Key Word"
keywords = df_education_related_terms_clean[keyword_column]
df_education_related_terms_clean[keyword_column] = keywords.str.replace("--", " ")
# Preview the result and report the number of elements
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)

In [None]:
# Normalize all the whitespaces (i.e., only one space between words) & lowercase all entries
# `\s+` collapses each run of whitespace into one space; a bare `\s` would only
# swap whitespace characters one-for-one and leave consecutive spaces in place.
df_education_related_terms_clean["Education-related Key Word"] = (
    df_education_related_terms_clean["Education-related Key Word"]
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
    .str.lower()
)
print(df_education_related_terms_clean.head(20))
print(df_education_related_terms_clean.size)


In [31]:
# Convert all characters to Unicode? Keep the text as is for now, until the UN corpus has been inspected.

## 2- Loading the speech texts into a table