In [13]:
import seaborn as sn
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd
import os
import json


def preprocess(newspaper="sun"):
    """
        Preprocesses text data from JSON files for three different newspapers (The Times, The Sun, and The Guardian),
        including tokenisation, removal of stopwords and punctuation, part-of-speech tagging, and lemmatisation.
        Returns a Pandas DataFrame containing the preprocessed data.

        Parameters:
        -----------
        newspaper : str, default="sun"
            Name of the newspaper to preprocess data for. Must be one of "times", "sun", or "guardian".

        Raises:
        -------
        ValueError:
            If the 'newspaper' argument is not a string or is not one of the supported newspapers.

        Returns:
        --------
        pandas.DataFrame:
            DataFrame containing the preprocessed text data. The DataFrame has columns for the original article content,
            the preprocessed text, and additional columns for the sentences, tokens, part-of-speech tags, and lemmas.
    """

    if not isinstance(newspaper, str):
        raise ValueError('newspaper argument must be a string')
    newspaper = newspaper.lower()
    if newspaper not in ["times", "sun", "guardian"]:
        raise ValueError('newspaper argument must be one of "times", "sun", or "guardian"')

    if newspaper == "times":
        df = pd.read_json('times_articles.json')
        # content = df.loc[:, "content"]

    elif newspaper == "sun":
        df = pd.read_json('sun_articles.json')
        # content = df.loc[:, "content"]

    elif newspaper == "guardian":
        # for the guardian
        # Create an empty list to store the contents of each JSON file
        json_data = []
        # Set the path to the directory containing the JSON files
        path_to_json_files = "crawler/guardian_articles"
        # Loop through each file in the directory
        for filename in os.listdir(path_to_json_files):
            if filename.endswith(".json"):
                # Open the file and load its contents as a JSON object
                with open(os.path.join(path_to_json_files, filename)) as f:
                    data = json.load(f)
                    # Append the contents to the list
                    json_data.append(data["response"]["results"])

        new_dict_list = []
        for item in json_data:
            for article in item:
                new_dict = {
                    "title": article["webTitle"],
                    "date": article["webPublicationDate"],
                    "content": article["fields"]["body"]
                }
                new_dict_list.append(new_dict)
        # convert the list of dictionaries to a JSON string
        # json_string = json.dumps(new_dict_list)

        # print the JSON string
        # print(json_string)
        df = pd.DataFrame(new_dict_list)

    else:
        raise ValueError('Input newspaper is not processable')

    # preprocessing starts here
    df['sentences'] = df['content'].apply(lambda x: nltk.sent_tokenize(x))
    # Tokenize each document into words and remove punctuation
    df['tokens'] = df['content'].apply(lambda x: [word.lower() for word in word_tokenize(x) if word not in punctuation])
    # Remove stopwords
    stopwords_list = stopwords.words('english')
    df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word not in stopwords_list])
    # Remove symbols
    df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.isalpha()])

    df['pos_tags'] = df['tokens'].apply(lambda x: nltk.pos_tag(x))
    # Lemmatise each token
    lemmatiser = WordNetLemmatizer()
    df['lemmas'] = df['tokens'].apply(lambda x: [lemmatiser.lemmatize(token) for token in x])
    # Convert the list of lemmas back to text
    df['lemmatised_text'] = df['lemmas'].apply(lambda x: ' '.join(x))

    return df


def df_to_dtm(df):
    """
       Convert a pandas DataFrame of preprocessed text data (obtained using preprocessing() ) into a Document-Term Matrix
       (DTM) using a CountVectorizer.

       Parameters:
           df (pandas DataFrame): A DataFrame containing preprocessed text data, including a column named 'lemmatised_text'
                                  containing the preprocessed text data as strings.

       Returns:
           pandas DataFrame: A DataFrame representation of the DTM with document IDs as the index and individual terms as
                             columns. The cells of the DataFrame contain the term frequencies (counts) for each document.
    """

    # Create a CountVectorizer object
    vectoriser = CountVectorizer()
    dtm = vectoriser.fit_transform(df['lemmatised_text'])
    # Create a dataframe from the DTM
    df_dtm = pd.DataFrame(dtm.toarray(), columns=vectoriser.get_feature_names_out())
    # Add the original text column back to the dataframe
    df_dtm['content'] = df['content']

    return df_dtm


def df_to_tfidf(df):
    """
        Convert a pandas DataFrame of preprocessed text data (obtained using preprocessing() ) into a Term Frequency-Inverse
        Document Frequency (TF-IDF) matrix using a CountVectorizer and a TfidfTransformer.

        Parameters:
            df (pandas DataFrame): A DataFrame containing preprocessed text data, including a column named 'lemmatised_text'
                                   containing the preprocessed text data as strings.

        Returns:
            pandas DataFrame: A DataFrame representation of the TF-IDF matrix with document IDs as the index and individual
                              terms as columns. The cells of the DataFrame contain the TF-IDF scores for each document.
    """

    vectoriser = CountVectorizer()
    dtm = vectoriser.fit_transform(df['lemmatised_text'])
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(dtm)
    # Create a dataframe from the TF-IDF matrix
    df_tfidf = pd.DataFrame(tfidf.toarray(), columns=vectoriser.get_feature_names_out())
    df_tfidf = df_tfidf[df_tfidf.sum().sort_values(ascending=False).index]
    # Add the original text column back to the dataframe
    df_tfidf['content'] = df['content']
    pd.set_option('display.max_columns', 100)
    # Print the resulting dataframe

    return df_tfidf

dataframe = preprocess("guardian")
dtm_dataframe = df_to_dtm(dataframe)
#print(dtm_dataframe)
print(df_to_tfidf(dataframe))

corpus = df_to_tfidf(dataframe)


def build_co_occurrence_matrix(corpus, window_size):
    # build unique words
    unique_words = set()
    for text in corpus:
        for word in word_tokenize(text):
            unique_words.add(word)

    word_search_dict = {word: np.zeros(shape=(len(unique_words))) for word in unique_words}
    word_list = list(word_search_dict.keys())
    for text in corpus:
        text_list = word_tokenize(text)
        for idx, word in enumerate(text_list):
            # pick word in the size range
            i = max(0, idx - window_size)
            j = min(len(text_list) - 1, idx + window_size)
            search = [text_list[idx_] for idx_ in range(i, j + 1)]
            search.remove(word)
            for neighbor in search:
                # get neighbor idx in word_search_dict
                nei_idx = word_list.index(neighbor)
                word_search_dict[word][nei_idx] += 1
    return word_search_dict

In [17]:
print(df_to_tfidf(dataframe))

         world       cup      said   england      game      team    player  \
0     0.047536  0.046775  0.039605  0.000000  0.000000  0.013709  0.000000   
1     0.093164  0.073338  0.020699  0.000000  0.000000  0.000000  0.000000   
2     0.090707  0.119007  0.083971  0.000000  0.017625  0.000000  0.000000   
3     0.046395  0.045652  0.000000  0.000000  0.010818  0.000000  0.000000   
4     0.074733  0.073536  0.066416  0.000000  0.000000  0.000000  0.018234   
...        ...       ...       ...       ...       ...       ...       ...   
2878  0.034102  0.016778  0.028413  0.012596  0.000000  0.009834  0.010401   
2879  0.038818  0.025464  0.028748  0.000000  0.090509  0.059704  0.047357   
2880  0.048427  0.047651  0.013449  0.000000  0.056457  0.069828  0.014770   
2881  0.042987  0.014100  0.095507  0.042340  0.033410  0.000000  0.087405   
2882  0.041681  0.010253  0.046303  0.000000  0.000000  0.000000  0.000000   

           one  football      time      year       say     woul

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Convert a collection of text documents to a matrix of token counts
cv = CountVectorizer(ngram_range=(1,1), stop_words = 'english')
X = cv.fit_transform(corpus)
Xc = (X.T * X) # matrix manipulation
Xc.setdiag(0)
names = cv.get_feature_names_out() # This are the entity names (i.e. keywords)
df = pd.DataFrame(data = Xc.toarray(), columns = names, index = names)
print(df)

MemoryError: Unable to allocate 18.8 GiB for an array with shape (50297, 50297) and data type int64

In [27]:
def build_vocabulary(page:list) -> list:
    '''
    Builds vocabulary with all the words
    present in the list page.
    '''
    vocab = list(set(page))
    vocab.sort()

    vocab_dict = {}
    for index, word in enumerate(vocab):
        vocab_dict[word] = index
    return vocab_dict

In [29]:
def build_context(
    page:str,
    co_occurrence_vectors: pd.DataFrame
) -> pd.DataFrame:

    for index, element in enumerate(page):
        # Build start and finish of context
        start = 0 if index-2 < 0 else index-2
        finish = len(page) if index+2 > len(page) else index+3
        # Retrieve Context for word
        context = page[start:index]+page[index+1:finish]
        for word in context:
            # Update Co-Occurrence Matrix
            co_occurrence_vectors.loc[element, word] = (
                co_occurrence_vectors.loc[element, word]+1
            )

    return co_occurrence_vectors


In [32]:
vocab_dict = build_vocabulary(corpus)

co_occurrence_vectors = pd.DataFrame(
    np.zeros([len(vocab_dict), len(vocab_dict)]),
    index = vocab_dict.keys(),
    columns = vocab_dict.keys()
)

co_occurrence_vectors = build_context(
  corpus,
  co_occurrence_vectors
)

similarity_words = pd.DataFrame(
    cosine_similarity(co_occurrence_vectors),
    columns = vocab_dict.keys(),
    index = vocab_dict.keys()
)

# Example of Top 10 words by similarity
similarity_words.loc['qatar'].sort_values(ascending=False).head(10)

MemoryError: Unable to allocate 19.0 GiB for an array with shape (50506, 50506) and data type float64

In [26]:
corpus.dtypes

world         float64
cup           float64
said          float64
england       float64
game          float64
               ...   
denselow      float64
ameliorate    float64
ararat        float64
depict        float64
sassanid      float64
Length: 50506, dtype: object

In [15]:
coo_dict=build_co_occurrence_matrix(corpus,window_size=4)

MemoryError: Unable to allocate 395. KiB for an array with shape (50506,) and data type float64

In [10]:
print(pd.DataFrame(coo_dict,index=coo_dict.keys()).astype('int'))

MemoryError: Unable to allocate 9.50 GiB for an array with shape (2550856036,) and data type int32

In [11]:
dataframe = pd.DataFrame(coo_dict,index=coo_dict.keys()).astype('int')

MemoryError: Unable to allocate 19.0 GiB for an array with shape (50506, 50506) and data type float64

In [14]:
sn.heatmap(dataframe)

ValueError: could not convert string to float: 'Qatar World Cup whistleblower was tortured, claims family'