In [None]:
import os
import pdfplumber
import pandas as pd
import re

# Function to extract specific text from PDF
def extract_info_from_pdf(pdf_file):
    """Pull the labelled fields out of a transaction-notification PDF.

    The text of every page is concatenated and each field is located with a
    regular expression anchored on its (French) label.

    Args:
        pdf_file: Path (or any object accepted by ``pdfplumber.open``) of the
            PDF to read.

    Returns:
        dict: String values for the keys ``name``, ``position``,
        ``company_name``, ``date_transaction``, ``date_notification``,
        ``nature``, ``price`` and ``volume`` (in that order). A field whose
        pattern does not match is returned as an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # A page without extractable text can yield None instead of a string;
        # treat it as empty so one such page does not abort the whole file.
        text = "".join(page.extract_text() or "" for page in pdf.pages)

    # Label-anchored patterns, listed in the order of the returned keys.
    field_patterns = {
        'name': r"PERSONNE ETROITEMENT LIEE :(.*?)(?=,|$)",
        # NOTE(review): this anchors on the first comma of the whole document,
        # not on the comma following the name - confirm that is intended.
        'position': r",(.*?)(?=NOTIFICATION|$)",
        # Stops at LEI or DETAIL, whichever comes first.
        'company_name': r"NOM :(.*?)(?=LEI|DETAIL|$)",
        'date_transaction': r"DATE DE LA TRANSACTION :(.*?)(?=LIEU|$)",
        'date_notification': r"DATE DE RECEPTION DE LA NOTIFICATION :(.*?)(?=COMMENTAIRES|$)",
        'nature': r"NATURE DE LA TRANSACTION :(.*?)(?=DESCRIPTION|$)",
        'price': r"PRIX :(.*?)(?=Euro|Dollar|Livre|Franc|$)",
        # Matches a "VOLUME :" that is followed by "TRANSACTION" with no other
        # "VOLUME :" in between.
        'volume': r"VOLUME :((?:(?!VOLUME :|TRANSACTION).)*)\s*TRANSACTION",
    }

    def first_capture(pattern):
        """Return the stripped first capture group, or "" when nothing matches."""
        match = re.search(pattern, text, re.DOTALL)
        return match.group(1).strip() if match else ""

    return {field: first_capture(pattern) for field, pattern in field_patterns.items()}

folder_path = "./amf-pdfs/"

# Find all PDF files in the folder. os.listdir() returns names in arbitrary
# order, so sort them to keep the row order reproducible between runs, and
# compare the extension case-insensitively so "X.PDF" is not skipped.
pdf_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.lower().endswith(".pdf")
)
print(f'list of PDF files: {pdf_files}')
total_files = len(pdf_files)
print(f'number of PDF files: {total_files}')

data = []

# Extract information from each PDF; the DataFrame is built once at the end.
for i, file in enumerate(pdf_files, 1):
    # end='' lets the leading carriage return overwrite the previous counter
    # instead of printing one line per file.
    print(f'\r {i}/{total_files}', end='', flush=True)
    data.append(extract_info_from_pdf(file))
print()  # finish the progress line

df = pd.DataFrame(data)

df.head(10)

In [None]:
from datetime import datetime
import locale

# Parse the French month names ('%B') under a French LC_TIME locale.
# Calling setlocale() without a value returns the current setting, which is
# restored afterwards - even if parsing raises - instead of switching the
# process to the user default.
previous_lc_time = locale.setlocale(locale.LC_TIME)
locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')
try:
    # Convert the date columns to datetime; unparseable values become NaT.
    for date_column in ('date_transaction', 'date_notification'):
        df[date_column] = pd.to_datetime(df[date_column], format='%d %B %Y', errors='coerce')
finally:
    locale.setlocale(locale.LC_TIME, previous_lc_time)

df.head(10)


In [None]:
# Ensure price and volume are float values.
# Fields that were not found in the PDF arrive as empty strings, which
# astype(float) cannot convert; to_numeric(errors='coerce') turns them (and any
# other unparseable value) into NaN instead of aborting the cell.
for numeric_column in ('price', 'volume'):
    cleaned = (
        df[numeric_column]
        .astype(str)
        # \s also covers non-breaking spaces used as thousands separators
        .str.replace(r'\s+', '', regex=True)
        # accept a decimal comma as well as a decimal point
        .str.replace(',', '.', regex=False)
    )
    # astype(float) keeps the dtype float64 even when every value is integral
    df[numeric_column] = pd.to_numeric(cleaned, errors='coerce').astype(float)
df.tail(10)

In [None]:
# One-off scaffold, kept for reference: it prints a sorted dict that maps every
# distinct company name to the placeholder '.PA'.
#
# company_names = df['company_name'].unique()

# ticker_mapping = {}

# for company_name in company_names:
#     ticker_mapping[company_name] = '.PA'

# ticker_mapping = dict(sorted(ticker_mapping.items()))

# print(ticker_mapping)

# Bind the name directly to the mapping defined in the local ticker_mapping module.
from ticker_mapping import ticker_mapping

In [None]:
# Report the companies found in the data that have no entry in ticker_mapping
company_names = df['company_name'].unique()
missing_keys = list(filter(lambda name: name not in ticker_mapping, company_names))
print(f'missing keys: {missing_keys}')

# Look up each row's ticker from its company name
df['ticker'] = df['company_name'].map(ticker_mapping)

df.head(15)

In [None]:
import yfinance as yf
from datetime import timedelta

# Get current and future stock prices and variations
def get_stock_prices(row, horizons=(1, 3, 5, 10, 20, 30, 60), window_days=90):
    """Fetch opening prices after a transaction and express them relative to day 0.

    The price history is requested from yfinance for the ``window_days``
    calendar days starting at the transaction date. ``var_dN`` is the 'Open'
    value of the N-th returned row divided by the 'Open' value of the first
    returned row (presumably one row per trading day - confirm against
    yfinance).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``ticker``
            and ``date_transaction``.
        horizons: Row offsets for which a ``var_d<offset>`` ratio is computed.
        window_days: Number of calendar days of history to request.

    Returns:
        pandas.Series: ``stock_price_open_d0`` followed by one ``var_d<offset>``
        entry per horizon. The type and index are the same on success and on
        failure, so ``DataFrame.apply(..., axis=1)`` always expands to the same
        columns. Values that cannot be computed (missing ticker or date, failed
        download, history shorter than the horizon) are NaN; a short history
        only blanks the horizons it cannot reach.
    """
    columns = ['stock_price_open_d0'] + [f'var_d{offset}' for offset in horizons]
    result = pd.Series(float('nan'), index=columns)

    ticker = row.get('ticker')
    start_day = row.get('date_transaction')
    if pd.isna(ticker) or pd.isna(start_day):
        # Unmapped company or unparseable date: nothing to look up.
        return result

    try:
        end_day = start_day + timedelta(days=window_days)
        history = yf.Ticker(ticker).history(
            start=start_day.strftime('%Y-%m-%d'),
            end=end_day.strftime('%Y-%m-%d'),
        )
        opens = history['Open'].to_numpy()
    except Exception:
        # Best effort: a failed lookup leaves the row as NaN. Unlike a bare
        # except, this lets KeyboardInterrupt / SystemExit propagate.
        return result

    if len(opens) == 0:
        return result

    base_price = opens[0]
    result['stock_price_open_d0'] = base_price
    for offset in horizons:
        if offset < len(opens):
            result[f'var_d{offset}'] = opens[offset] / base_price
    return result

# Applying the function to new columns. result_type='expand' makes apply()
# return a DataFrame with one column per returned key, whatever the first row
# returned.
price_columns = df.apply(get_stock_prices, axis=1, result_type='expand')

# Drop the columns left by a previous run of this cell so that re-running it
# replaces them instead of appending duplicates.
df = pd.concat(
    [df.drop(columns=price_columns.columns, errors='ignore'), price_columns],
    axis=1,
)

# Persist first: a cell only renders its last expression, so the preview has
# to come after the save to be displayed.
df.to_pickle('amf-data.pkl')

df.head(15)

In [None]:
import pandas as pd

# Compare the forward price variation after cessions with that after every
# other kind of transaction (mean and median per horizon).
# The pickle is the one written by this notebook; do not load untrusted files.
amf_data = pd.read_pickle('amf-data.pkl')
amf_df = pd.DataFrame(amf_data)

# Titles kept for the analysis (exact, case-sensitive matches)
_executive_titles = [
    'Président Directeur Général', 'PDG',
    'Président-Directeur Général', 'président-directeur général',
    'DIRECTEUR', 'Directeur Général', 'DIRIGEANT',
    'Directeur général', 'P.D.-G.']

# Single row mask: transaction in Jan-Apr 2022, non-zero price, selected title
_keep = (
    (amf_df['date_transaction'] >= '2022-01-01')
    & (amf_df['date_transaction'] <= '2022-04-30')
    & (amf_df['price'] != 0)
    & amf_df['position'].isin(_executive_titles)
)
amf_df = amf_df[_keep]

print(f'data length: {len(amf_df)}')

df_cession = amf_df[amf_df['nature'] == 'Cession']
# df_acquisition = amf_df[amf_df['nature'] == 'Acquisition']
# "acquisition" here means every nature other than 'Cession'
df_acquisition = amf_df[amf_df['nature'] != 'Cession']

_horizons = ('d1', 'd3', 'd5', 'd10', 'd20', 'd30', 'd60')
_groups = {'cession': df_cession, 'acquisition': df_acquisition}

# Statistics per group and horizon, rounded to two decimals
var_means = {
    label: {horizon: round(frame[f'var_{horizon}'].mean(), 2) for horizon in _horizons}
    for label, frame in _groups.items()
}
var_medians = {
    label: {horizon: round(frame[f'var_{horizon}'].median(), 2) for horizon in _horizons}
    for label, frame in _groups.items()
}

print(var_means)
print(var_medians)

In [None]:
import pandas as pd

# Median forward variation per (position, nature), with the number of rows
# behind var_d5 and the number of distinct companies in each group.
amf_data = pd.read_pickle('amf-data.pkl')
amf_df = pd.DataFrame(amf_data).sort_values(by='var_d5', ascending=False)

_variation_columns = ['var_d1', 'var_d3', 'var_d5', 'var_d10', 'var_d20', 'var_d30']
# Median everywhere; var_d5 additionally carries the row count
_aggregations = {
    column: ['median', 'count'] if column == 'var_d5' else 'median'
    for column in _variation_columns
}
_aggregations['company_name'] = 'nunique'

pivot_table = amf_df.pivot_table(
    values=_variation_columns + ['company_name'],
    index=['position', 'nature'],
    aggfunc=_aggregations,
)
pivot_table = pivot_table.sort_values(by=('var_d3', 'median'), ascending=False)

pd.set_option('display.max_rows', None)  # Show all rows
# amf_df.head(200)
pivot_table.head(50)


In [None]:
# Median forward variation per (company, nature); var_d5 also carries the row
# count. Groups are ordered by their median var_d5, highest first.
_variation_columns = ['var_d1', 'var_d3', 'var_d5', 'var_d10', 'var_d20', 'var_d30']
_aggregations = {
    column: ['median', 'count'] if column == 'var_d5' else 'median'
    for column in _variation_columns
}

pivot_table2 = amf_df.pivot_table(
    values=_variation_columns,
    index=['company_name', 'nature'],
    aggfunc=_aggregations,
)
pivot_table2 = pivot_table2.sort_values(by=('var_d5', 'median'), ascending=False)

pivot_table2.head(50)

In [None]:
# Median forward variation per (person, company, nature); var_d5 also carries
# the row count. Groups are ordered by their median var_d5, highest first.
_variation_columns = ['var_d1', 'var_d3', 'var_d5', 'var_d10', 'var_d20', 'var_d30']
_aggregations = {
    column: ['median', 'count'] if column == 'var_d5' else 'median'
    for column in _variation_columns
}

pivot_table3 = amf_df.pivot_table(
    values=_variation_columns,
    index=['name', 'company_name', 'nature'],
    aggfunc=_aggregations,
)
pivot_table3 = pivot_table3.sort_values(by=('var_d5', 'median'), ascending=False)

pivot_table3.head(50)

In [None]:
# Rows of a single company, highest var_d3 first
selection_df = amf_df[amf_df['company_name'] == 'AMA CORPORATION PLC']
# sort_values() returns a new frame; keep the result so the preview below is
# actually sorted.
selection_df = selection_df.sort_values(by='var_d3', ascending=False)
selection_df.head(30)