In [2]:
import csv
import string
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')


# Function to read the CSV file and preprocess the data
def read_csv_and_preprocess(file_path):
    data = defaultdict(dict)

    # Define punctuation and stopwords
    punctuations = set(string.punctuation)
    stop_words = set(stopwords.words('english'))

    # Read CSV file
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            article_id = int(row['ARTICLE_ID'])
            section_text = row['SECTION_TEXT']

            # Tokenize and preprocess section text
            tokens = word_tokenize(section_text.lower())
            filtered_tokens = [token for token in tokens if token not in stop_words and token not in punctuations]

            # Store preprocessed data in dictionary
            data[article_id] = filtered_tokens

    return data

# Example usage
csv_file_path = 'sampled.csv'
preprocessed_data = read_csv_and_preprocess(csv_file_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bilal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Function to calculate word count and word frequency for each article
def calculate_word_stats(data):
    word_stats = defaultdict(dict)

    for article_id, tokens in data.items():
        word_count = len(tokens)
        word_frequency = defaultdict(int)
        for word in tokens:
            word_frequency[word] += 1
        word_stats[article_id] = {'word_count': word_count, 'word_frequency': word_frequency}


    return word_stats

# Example usage
word_stats = calculate_word_stats(preprocessed_data)
word_stats


defaultdict(dict,
            {55627: {'word_count': 23,
              'word_frequency': defaultdict(int,
                          {'delavan': 3,
                           'founded': 1,
                           'group': 1,
                           'settlers': 1,
                           'new': 2,
                           'england': 1,
                           'city': 1,
                           'derives': 1,
                           'name': 1,
                           'edward': 1,
                           'c.': 1,
                           'temperance': 1,
                           'advocate': 1,
                           'albany': 1,
                           'york': 1,
                           'post': 1,
                           'office': 1,
                           'operation': 1,
                           'since': 1,
                           '1840': 1})},
             63340: {'word_count': 28,
              'word_frequency': defaultdict(int,
       

In [5]:
from collections import Counter

def perform_apriori_analysis(word_stats):
    # Perform apriori analysis here
    # Here, let's find the most frequent words across all articles
    
    # Combine word frequencies from all articles
    all_word_freq = Counter()
    for stats in word_stats.values():
        all_word_freq.update(stats['word_frequency'])
    
    # Find frequent words (e.g., those that appear in at least 3 articles)
    frequent_words = {word: freq for word, freq in all_word_freq.items() if freq >= 3}
    
    # Store results in a dictionary
    apriori_results = {
        'frequent_words': frequent_words
    }
    
    return apriori_results

# Example usage
apriori_results = perform_apriori_analysis(word_stats)
print("Apriori analysis results:")
print(apriori_results)


Apriori analysis results:
{'frequent_words': {'delavan': 3, 'group': 3, 'new': 25, 'city': 5, 'c.': 4, 'york': 5, 'since': 4, "''": 114, 'township': 4, 'county': 3, 'united': 3, 'states': 3, 'state': 3, '``': 68, 'kragerø': 3, 'municipality': 4, '1': 3, 'january': 3, 'one': 14, "'s": 26, 'port': 4, 'along': 3, 'also': 10, 'august': 5, 'killed': 4, 'service': 7, 'former': 4, 'time': 7, 'known': 4, 'two': 12, 'three': 4, 'primary': 4, 'vesicles': 3, 'formed': 3, 'second': 5, 'become': 8, 'third': 3, 'byng': 4, 'born': 6, 'family': 6, 'son': 3, 'daughter': 3, 'enter': 3, 'form': 5, 'first': 10, 'received': 4, 'poor': 3, 'later': 5, 'died': 5, 'leszczyński': 3, 'western': 5, 'australian': 4, 'maritime': 5, 'museum': 13, '2006': 4, 'oberon': 11, 'survived': 3, 'preserved': 6, 'converted': 3, 'partially': 3, 'another': 5, 'awaiting': 3, 'work': 4, 'navy': 9, 'submarines': 6, 'moved': 5, 'towed': 3, 'located': 4, 'island': 3, 'british': 3, 'vessel': 4, 'australia': 3, 'six': 4, 'national': 3,

In [8]:
# Function to calculate Jaccard similarity between two sets
def jaccard_similarity(set1, set2):
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    similarity = len(intersection) / len(union)
    return similarity

# Function to find relevant articles based on user query
def find_relevant_articles(query, word_stats):
    relevant_articles = []

    # Preprocess query
    query_tokens = word_tokenize(query.lower())
    query_set = set(query_tokens)

    # Iterate through word stats of each article
    for article_id, stats in word_stats.items():
        article_tokens = set(stats['word_frequency'].keys())
        similarity = jaccard_similarity(query_set, article_tokens)
        if similarity >= 0.5:
            relevant_articles.append(article_id)

    return relevant_articles

# Example usage
user_query = input("Enter your query: ")
relevant_articles = find_relevant_articles(user_query, word_stats)
print("Relevant articles based on the query:")
for article_id in relevant_articles:
    print(f"Article {article_id}")


Relevant articles based on the query:
