In [None]:
# (STEP 1) Count Phrase Occurrences

## NEED: method to determine which phrases to keep track of

## METHOD: build a frequency matrix over all of our articles that
## tracks how often each phrase occurs

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load articles: lower-case the text and replace the literal token 'period' with '.'.
# NOTE(review): str.replace works on substrings, so words that merely contain
# 'period' (e.g. 'periodic') are rewritten as well — confirm this is intended.
articles_df = pd.read_csv('10k_articles_formatted.csv')
articles = articles_df['content'].astype(str).apply(lambda x: x.lower().replace('period', '.'))

# Load the phrases whose occurrences we want to count.
# A forward slash is used so the path resolves on Windows and POSIX alike.
phrases_df = pd.read_csv('Gensim_Python_Notebook_Results/remaining_phrases.csv')
phrases = phrases_df['phrase'].tolist()

# Count every 1- to 3-word n-gram in every article. The result is a sparse
# (n_articles x n_ngrams) matrix; it must stay sparse until it has been cut
# down to the phrase columns, because the full n-gram vocabulary is huge.
vectorizer = CountVectorizer(ngram_range=(1, 3))
frequency_matrix = vectorizer.fit_transform(articles)

# Keep only the phrases the vectorizer actually saw. `vocabulary_` maps each
# n-gram to its column index, so membership is a dict lookup instead of a
# linear scan over a freshly rebuilt feature-name array for every phrase.
vocabulary = vectorizer.vocabulary_
matched_phrases = [phrase for phrase in phrases if phrase in vocabulary]
matched_columns = [vocabulary[phrase] for phrase in matched_phrases]

# Select the phrase columns while the matrix is still sparse, then densify
# only that small slice (previously the whole matrix was densified first).
frequency_df = pd.DataFrame(
    frequency_matrix[:, matched_columns].toarray(),
    columns=matched_phrases,
)

# Optional: Set newspaper names as the index
frequency_df.index = articles_df['name']

# NOTE(review): index=False drops the newspaper names set just above, so the
# saved CSV holds counts only. Left unchanged to keep the file's schema —
# confirm whether the names should be written out.
frequency_df.to_csv('frequency_matrix.csv', index=False)


# Display the first few rows of the frequency matrix
print(frequency_df.head())

KeyboardInterrupt: 

In [5]:
#### TESTING SCRIPT

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Assuming articles_df and phrases_df are loaded from somewhere before this point

# Sample a subset of the data for testing; fixed random_state keeps the
# sample identical between runs.
sample_articles_df = articles_df.sample(n=200, random_state=42)
sample_phrases_df = phrases_df.sample(n=50, random_state=42)
phrases = sample_phrases_df['phrase'].tolist()

# Process articles: lower-case and replace the literal token 'period' with '.'
sample_articles = sample_articles_df['content'].astype(str).apply(lambda x: x.lower().replace('period', '.'))

# Count every 1- to 3-word n-gram in the sampled articles (sparse result)
vectorizer = CountVectorizer(ngram_range=(1, 3))
frequency_matrix = vectorizer.fit_transform(sample_articles)

# Keep only the sampled phrases the vectorizer actually saw. `vocabulary_`
# maps each n-gram to its column index, so membership is a dict lookup rather
# than a scan over a feature-name array rebuilt for every phrase.
vocabulary = vectorizer.vocabulary_
matched_phrases = [phrase for phrase in phrases if phrase in vocabulary]
matched_columns = [vocabulary[phrase] for phrase in matched_phrases]

# Select the phrase columns while the matrix is still sparse and densify only
# that slice, instead of materialising the full n-gram matrix first.
frequency_df = pd.DataFrame(
    frequency_matrix[:, matched_columns].toarray(),
    columns=matched_phrases,
)

# Optional: Set newspaper names as the index from the sample
# (assigned positionally: row i corresponds to the i-th sampled article)
frequency_df.index = sample_articles_df['name']

# Save the test frequency matrix to a CSV file.
# NOTE(review): index=False drops the newspaper names set just above —
# confirm whether they should be written out.
frequency_df.to_csv('test_frequency_matrix.csv', index=False)

# Display the first few rows of the frequency matrix
print(frequency_df.head(100))

                                    tennis  restaurants  world series  \
name                                                                    
SUNY Stony Brook                         0            0             0   
Capital University                       0            0             0   
University of Pittsburgh                 0            0             0   
University of Missouri                   0            1             0   
University of Utah                       0            0             0   
...                                    ...          ...           ...   
Pennsylvania State University            0            0             0   
Eastern Illinois University              0            0             0   
University of Oklahoma                   0            0             0   
University of California San Diego       0            0             0   
Washington State University              0            0             0   

                                    swimming  tax 

In [None]:
# (STEP 2) Use Poisson Distribution for Analysis
## Use a Poisson Distribution to accurately keep track of our
## frequency data (Nij)

import numpy as np

# Simulation parameters
RANDOM_SEED = 42  # fixed seed so the simulated counts are identical on every run
num_articles = 10000
num_keywords = 366
lambda_per_article = 0.5  # Average rate (λ) of keyword occurrence per article, to be adjusted

# Simulate occurrences of each keyword in each article.
# For simplicity, we assume each keyword has the same λ across all articles.
# A dedicated seeded generator is used so the draw does not depend on the
# global NumPy random state left behind by other cells.
rng = np.random.default_rng(RANDOM_SEED)
keyword_occurrences = rng.poisson(lambda_per_article, size=(num_articles, num_keywords))

print(f"Shape of occurrences matrix: {keyword_occurrences.shape}")
print(f"Total occurrences of the first keyword across all articles: {keyword_occurrences[:, 0].sum()}")

In [None]:
# (STEP 3) SVD application and (STEP 4) Modifying the SVD to address
# shortcomings relating to the specific nature of this project

In [None]:
# (STEP 5) Data transformation using the log of the Poisson likelihood
# function to best explain observed phrase counts

## PURPOSE: Adjusts the model to enable the predicted phrase counts to closely
## mirror the actual phrase counts

In [None]:
# (STEP 6) -> Gradient Descent for data optimization
# (STEP 7) -> Getting rid of negative counts