# Weekly Entry Analyzer

## Extract, Clean and Expand the Data

### Extraction of the weekly entry data from PDF files

In [None]:
import pdfplumber
import os
import pandas as pd

# Folder holding one PDF per weekly entry
pdf_folder = './weekly_entries'

# One {'date', 'text'} record is collected per PDF
data = []

for filename in os.listdir(pdf_folder):
    # Ignore everything that is not a PDF
    if not filename.endswith('.pdf'):
        continue

    file_path = os.path.join(pdf_folder, filename)
    # The part of the file name before the first dot is used as the date
    date = filename.partition('.')[0]

    with pdfplumber.open(file_path) as pdf:
        # Concatenate the text of all pages; a page that yields no text
        # contributes an empty string
        full_text = ''.join(page.extract_text() or '' for page in pdf.pages)
        data.append({'date': date, 'text': full_text})

# Gather the records in a DataFrame and preview the first rows
df = pd.DataFrame(data)
display(df.head())


### Creating the sentiment, main topic, main keyword and character count of the weekly entries

#### Get the main topic

In [None]:
import openai

def get_main_topic(text):
    """Ask the OpenAI chat model for the main topic of a text.

    Parameters
    ----------
    text : str
        The text to analyse; it is embedded verbatim in the prompt.

    Returns
    -------
    str
        The model's answer with surrounding whitespace stripped, or the
        string "Error retrieving topic" if the request raised an exception
        (the exception is printed, not re-raised).
    """
    api_key = ""  # Your OpenAI API key here
    # Only override the library's key when one was actually filled in.
    # Assigning the empty placeholder on every call would replace a key
    # configured elsewhere (e.g. the OPENAI_API_KEY environment variable the
    # openai client falls back to) with an empty string.
    if api_key:
        openai.api_key = api_key

    messages = [
        {
            "role": "system",
            "content": "This is a conversation with an AI."
        },
        {
            "role": "user",
            "content": f"What is the main topic of this text: \"{text}\"?"
        }
    ]
    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=60,  # keeps the answer short
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: one failing entry must not abort the whole DataFrame
        # apply, so report the problem and return a sentinel value instead.
        print(f"An error occurred: {e}")
        return "Error retrieving topic"
    
# Run the topic extraction on every entry text and store the answers in a
# new 'main_topic' column
main_topics = df['text'].apply(get_main_topic)
df['main_topic'] = main_topics
display(df.head())

#### Get the sentiment

In [None]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score every entry text and keep the polarity in a new 'sentiment' column
sentiment_scores = df['text'].apply(analyze_sentiment)
df['sentiment'] = sentiment_scores

# Preview the first rows to check the new column
display(df.head())

#### Get the main keyword

In [None]:
def get_most_important_keyword(main_topic):
    """Ask the OpenAI chat model for a single keyword describing a topic.

    Parameters
    ----------
    main_topic : str
        The topic description to condense; it is embedded verbatim in the
        prompt.

    Returns
    -------
    str
        The model's answer with surrounding whitespace stripped, or the
        string "Error retrieving keyword" if no keyword could be obtained
        (the topic itself is the failure marker "Error retrieving topic",
        or the request raised an exception, which is printed).
    """
    # A topic that is only a failure marker carries no content; asking the
    # model to summarise an error message would yield a meaningless keyword.
    if main_topic == "Error retrieving topic":
        return "Error retrieving keyword"

    api_key = ''  # Your OpenAI API key here
    # Only override the library's key when one was actually filled in.
    # Assigning the empty placeholder on every call would replace a key
    # configured elsewhere (e.g. the OPENAI_API_KEY environment variable the
    # openai client falls back to) with an empty string.
    if api_key:
        openai.api_key = api_key

    messages = [
        {
            "role": "system",
            "content": "This is a conversation with an AI."
        },
        {
            "role": "user",
            "content": f"Select a keyword that describes the best the following topic (just output the actual keyword): \"{main_topic}\""
        }
    ]
    try:
        response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=10,  # a single keyword is expected
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best effort: one failing row must not abort the whole DataFrame
        # apply, so report the problem and return a sentinel value instead.
        print(f"An error occurred: {e}")
        return "Error retrieving keyword"
    
# Condense every main topic into one keyword and store it in a new
# 'main_keyword' column
main_keywords = df['main_topic'].apply(get_most_important_keyword)
df['main_keyword'] = main_keywords
display(df.head())


#### Get the character count

In [None]:
# Number of characters in each entry text
character_counts = df['text'].str.len()
df['text_length'] = character_counts
display(df.head())

### Format the dates and sort the entries from oldest to newest

In [None]:
# The date strings are in day-month-year form; turn them into timestamps
parsed_dates = pd.to_datetime(df['date'], format='%d-%m-%Y')
df['date'] = parsed_dates

# Order the entries chronologically, oldest first
df = df.sort_values(by='date')
display(df.head())

## Calculations on the data

### Calculate the main statistical indicators for the sentiment

In [None]:
import pandas as pd

# Every indicator below is computed on the sentiment column
sentiment = df['sentiment']

# Central tendency
mean_sentiment = sentiment.mean()
median_sentiment = sentiment.median()

# Spread
std_sentiment = sentiment.std()
min_sentiment = sentiment.min()
max_sentiment = sentiment.max()
range_sentiment = max_sentiment - min_sentiment

# Quartiles
percentile_25 = sentiment.quantile(0.25)
percentile_75 = sentiment.quantile(0.75)

# Collect the indicators in a one-row table (one column per indicator)
indicators = {
    'mean': mean_sentiment,
    'median': median_sentiment,
    'std': std_sentiment,
    'min': min_sentiment,
    'max': max_sentiment,
    'range': range_sentiment,
    '25th percentile': percentile_25,
    '75th percentile': percentile_75,
}
summary = pd.DataFrame([indicators])

display(summary)

## Visualize the Data

### Visualize the change in sentiment over time

In [None]:
import matplotlib.pyplot as plt

# A single 12x6 inch figure with one set of axes
fig, ax = plt.subplots(figsize=(12, 6))

# Sentiment of each entry against its date
ax.plot(df['date'], df['sentiment'])

# Axis labels and title
ax.set_xlabel('Date')
ax.set_ylabel('Sentiment')
ax.set_title('Change in Sentiment over Time')

# Tilt the date labels so they stay readable
plt.xticks(rotation=45)

plt.show()

### Visualize the average sentiment per main keyword

In [None]:
import matplotlib.pyplot as plt

# Mean sentiment of all entries sharing a main keyword
# (index: keyword, value: average sentiment). This has to be computed here:
# the plot below reads it and it is not defined anywhere else.
average_sentiment = df.groupby('main_keyword')['sentiment'].mean()

# Bar chart with one bar per keyword
plt.figure(figsize=(12, 6))
plt.bar(average_sentiment.index, average_sentiment)
plt.xlabel('Main Keyword')
plt.ylabel('Average Sentiment')
plt.title('Average Sentiment per Main Keyword')

# One tick per bar, labelled with its keyword; rotated, right-aligned and in
# a small font so long keywords do not overlap
plt.xticks(
    range(len(average_sentiment.index)),
    average_sentiment.index,
    rotation=45,
    ha='right',
    fontsize=8,
)

# Fit the layout only after the tick labels have their final appearance
plt.tight_layout()

plt.show()

### Visualize the keyword frequency

In [None]:
import matplotlib.pyplot as plt

# How many entries were assigned each main keyword
keyword_frequency = df['main_keyword'].value_counts()
keywords = keyword_frequency.index

# Bar chart with one bar per keyword
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(keywords, keyword_frequency)
ax.set_xlabel('Keyword')
ax.set_ylabel('Frequency')
ax.set_title('Keyword Frequency')

# Tilt and right-align the keyword labels, then fit the layout around them
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Put one tick under each bar, labelled with its keyword in a small font
tick_positions = range(len(keywords))
plt.xticks(tick_positions, keywords, fontsize=8)

plt.show()

### Visualize the total characters written on keywords

In [None]:
import matplotlib.pyplot as plt

# Total characters written per keyword: add up the character counts of all
# entries that share a main keyword. (Measuring the length of the keyword
# string itself would not reflect how much was written about it.)
characters_per_keyword = (
    df.groupby('main_keyword')['text_length']
    .sum()
    .sort_values(ascending=False)
)

# Bar heights, in the same order as the keywords on the x-axis
keyword_lengths = characters_per_keyword.tolist()

# Characters written across all keywords
total_characters = int(characters_per_keyword.sum())

# Bar chart with one bar per keyword
plt.figure(figsize=(12, 6))
plt.bar(characters_per_keyword.index, keyword_lengths)
plt.xlabel('Keyword')
plt.ylabel('Total Characters')
plt.title('Total Characters Written on Keywords')

# One tick per bar, labelled with its keyword; rotated, right-aligned and in
# a small font so long keywords do not overlap
plt.xticks(
    range(len(characters_per_keyword.index)),
    characters_per_keyword.index,
    rotation=45,
    ha='right',
    fontsize=8,
)

# Fit the layout only after the tick labels have their final appearance
plt.tight_layout()

plt.show()
print(total_characters)

### Visualize the correlation between a scaled version of the sentiment and the character count

In [None]:
import matplotlib.pyplot as plt

# Work on an independent copy of the plotted columns, so that adding the
# helper column below neither touches df nor triggers pandas'
# SettingWithCopyWarning
df_plot = df[['date', 'sentiment', 'text_length']].copy()

# Largest character count; the sentiment is stretched up to this value
max_text_length = df_plot['text_length'].max()

# Take the sentiment bounds from the plotted data itself, so this cell does
# not depend on min_sentiment / max_sentiment left over from an earlier cell
sentiment_min = df_plot['sentiment'].min()
sentiment_max = df_plot['sentiment'].max()
sentiment_range = sentiment_max - sentiment_min

# Min-max scale the sentiment onto [0, max_text_length] so both series share
# one y-axis
if sentiment_range == 0:
    # All sentiments are identical: dividing by the zero range would give
    # NaN everywhere and the line would vanish, so draw a flat line instead
    df_plot['scaled_sentiment'] = 0.0
else:
    df_plot['scaled_sentiment'] = (
        (df_plot['sentiment'] - sentiment_min) / sentiment_range
    ) * max_text_length

# Set the figure size
plt.figure(figsize=(12, 6))

# Plot the character count and the scaled sentiment over time
plt.plot(df_plot['date'], df_plot['text_length'], label='Characters')
plt.plot(df_plot['date'], df_plot['scaled_sentiment'], label='Scaled Sentiment')

# Set the labels and title
plt.xlabel('Date')
plt.ylabel('Value')
plt.title('Correlation between Characters and Scaled Sentiment')
plt.legend()

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Show the plot
plt.show()


## Data Storage

### Store the data in a CSV file

In [None]:
# Write the analysed entries to disk; the row index is left out of the file
output_path = 'entry_analysis_results.csv'
df.to_csv(output_path, index=False)