In [10]:
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords_nmf(text, num_keywords=5):
    """Rank candidate keywords for one document using TF-IDF followed by NMF.

    The text is vectorised as a single document of English unigrams and
    bigrams (stop words removed). An NMF model with ``num_keywords``
    components is fitted on that one-row matrix, and the ``num_keywords``
    highest-weighted terms of every component are collected.

    Parameters
    ----------
    text : str
        Raw document text.
    num_keywords : int, default 5
        Used both as the number of NMF components and as the number of top
        terms taken from each component.

    Returns
    -------
    pandas.DataFrame
        Columns ``Keyword`` and ``Frequency``, sorted by ``Frequency`` in
        descending order (ties keep first-seen order). ``Frequency`` is the
        number of NMF components whose top-``num_keywords`` list contains the
        term; it is NOT the number of times the term occurs in ``text``.
        The frame is empty (same columns) when no keyword was collected.

    Notes
    -----
    Errors raised by ``TfidfVectorizer`` or ``NMF`` (for example on text that
    yields no usable tokens) are not caught here and propagate to the caller.
    """
    # TF-IDF vectorisation of the single document.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = vectorizer.fit_transform([text])

    # Fixed random_state keeps the random initialisation reproducible.
    nmf_model = NMF(n_components=num_keywords, init='random', random_state=42)
    nmf_model.fit(tfidf_matrix)

    feature_names = np.array(vectorizer.get_feature_names_out())

    # Per component: feature indices ordered by weight, highest first,
    # truncated to the top `num_keywords` entries.
    ranked_indices = np.argsort(nmf_model.components_, axis=1)[:, ::-1]
    top_indices = ranked_indices[:, :num_keywords]

    # Row-major flattening visits component 0's terms first, then component 1's, ...
    all_keywords = feature_names[top_indices].ravel().tolist()

    # Count in how many components each term made the top list.
    # Counter keeps first-seen order and most_common() is a stable sort on the
    # count, so equally frequent terms stay in the order they were collected.
    component_hits = Counter(all_keywords)

    return pd.DataFrame(component_hits.most_common(), columns=['Keyword', 'Frequency'])



In [None]:
import os
from collections import Counter

import pandas as pd

# Folder of summarised speeches (one text file per speech) and the output table.
SUMMARIZED_SPEECH_DIRECTORY = './Data/Summarized Speech/'
NMF_KEYWORD_CSV = './Data/NMFKeyword.csv'

NUM_KEYWORDS = 30   # Change this to the desired number of keywords
MIN_FREQUENCY = 2   # keep only keywords whose Frequency is strictly greater than this

# Collect one frame per file and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
keyword_frames = []

# os.listdir returns entries in arbitrary order, so sort for a reproducible CSV.
for f in sorted(os.listdir(SUMMARIZED_SPEECH_DIRECTORY)):
    file_path = os.path.join(SUMMARIZED_SPEECH_DIRECTORY, f)
    if not os.path.isfile(file_path):
        # Sub-directories (e.g. notebook checkpoint folders) cannot be opened as text.
        continue

    with open(file_path, 'r') as file:
        summary = file.read()

    keywords = extract_keywords_nmf(summary, NUM_KEYWORDS)

    # Build a new frame instead of mutating a filtered slice, which would
    # trigger pandas' SettingWithCopyWarning.
    file_keywords = (
        keywords.loc[keywords['Frequency'] > MIN_FREQUENCY]
        .rename(columns={'Frequency': 'Count'})
        .assign(FileName=f)
        [['FileName', 'Keyword', 'Count']]
    )
    if not file_keywords.empty:
        keyword_frames.append(file_keywords)

if keyword_frames:
    final_df = pd.concat(keyword_frames, ignore_index=True)
else:
    # No file produced any keyword: still write a header-only table.
    final_df = pd.DataFrame(columns=['FileName', 'Keyword', 'Count'])

final_df.to_csv(NMF_KEYWORD_CSV, sep='\t', index=False)

Sentence Extraction from Summarized Speech


In [40]:
SUMMARIZED_SPEECH_DIRECTORY = './Data/Summarized Speech/'


# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Load the tab-separated keyword table (columns: FileName, Keyword, Count).
# Force text columns to str and disable the default NA markers: otherwise
# keywords such as "null", "nan" or "na" are parsed as float NaN (and
# all-digit values as integers), which breaks `.lower()` and the path
# concatenation below.
filtered_keywords = pd.read_csv(
    "./Data/NMFKeyword.csv",
    delimiter='\t',
    dtype={'FileName': str, 'Keyword': str},
    keep_default_na=False,
)
file_list = filtered_keywords['FileName'].unique().tolist()

# One record per (keyword, matching sentence); the DataFrame is built once at
# the end instead of being grown with pd.concat inside the loop.
extracted_rows = []
for f in file_list:
    with open(SUMMARIZED_SPEECH_DIRECTORY + f, 'r') as file:
        text = file.read()

    # Process the text with spaCy and lower-case every sentence once,
    # rather than once per keyword.
    doc = nlp(text)
    sentences = [(sentence.text, sentence.text.lower()) for sentence in doc.sents]

    # Distinct keywords recorded for this file, in their order of appearance.
    file_keywords = filtered_keywords.loc[
        filtered_keywords['FileName'] == f, 'Keyword'
    ].unique()

    # Rows are grouped by keyword, then ordered by sentence position in the text.
    for keyword in file_keywords:
        needle = keyword.lower()
        # NOTE(review): this is a plain case-insensitive substring test, so a
        # short keyword like 'eps' also matches inside 'steps'. Consider
        # word-boundary matching if that is not intended.
        for sentence_text, sentence_lower in sentences:
            if needle in sentence_lower:
                extracted_rows.append(
                    {"Keyword": keyword, "Extracted Sentence": sentence_text}
                )

# Explicit columns keep the CSV header even when nothing matched.
df_filtered = pd.DataFrame(extracted_rows, columns=["Keyword", "Extracted Sentence"])

df_filtered.to_csv("./Data/Extracted_sentence.csv", index=False)


{'providing prompt': [], 'states hopes': [], 'need cope': [], '242 1967': [], 'arise': [], 'united nations': [], 'assembly': [], 'peaceful': [], 'security': [], 'hopes': [], 'program': [], 'cope': [], 'placed jeopardy': [], 'second make': [], 'lusaka declaration': [], 'donor nations': [], 'leader': [], 'spokesman': [], 'deficit': [], 'established united': [], 'practicable way': [], 'violation': [], 'self determination': [], 'requir tokyo': [], 'eps': [], 'planning future': [], 'tensions arise': [], 'settle exists': [], 'moving': [], 'directly': [], 'fields': [], 'mankind lived': []}
{'assembly': [], 'nations': [], 'canal': [], 'capitals': [], 'arrest': [], 'efforts united': [], 'people pakistan': [], 'weaken': [], 'continuing deterioration': [], 'states': [], 'peace': [], 'united states': [], 'china': [], 'united nations': [], 'years': [], 'republic china': [], 'relations': [], 'people republic': [], 'suez canal': [], 'president': [], 'israeli withdrawal': [], 'course': [], 'pragmatism