In [15]:
import os
import pdfplumber
import pandas as pd
import re

# Function to extract specific text from PDF
def extract_info_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    
    # Define regex patterns for information extraction
    name_pattern = r"PERSONNE ETROITEMENT LIEE :(.*?)(?=,|$)"
    position_pattern = r",(.*?)(?=NOTIFICATION|$)"
    company_name_pattern = r"NOM :(.*?)(?=LEI|DETAIL|$)" # LEI or DETAIL to stop
    date_pattern = r"DATE DE LA TRANSACTION :(.*?)(?=LIEU|$)"
    nature_pattern = r"NATURE DE LA TRANSACTION :(.*?)(?=DESCRIPTION|$)"
    price_pattern = r"PRIX :(.*?)(?=Euro|Dollar|Livre|Franc|$)"
    volume_pattern = r"VOLUME :((?:(?!VOLUME :|TRANSACTION).)*)\s*TRANSACTION" # last occurence
    date_notification_pattern = r"DATE DE RECEPTION DE LA NOTIFICATION :(.*?)(?=COMMENTAIRES|$)"

    name = re.search(name_pattern, text, re.DOTALL)
    position = re.search(position_pattern, text, re.DOTALL)
    company_name = re.search(company_name_pattern, text, re.DOTALL)
    date_transaction = re.search(date_pattern, text, re.DOTALL)
    nature = re.search(nature_pattern, text, re.DOTALL)
    price = re.search(price_pattern, text, re.DOTALL)
    volume = re.search(volume_pattern, text, re.DOTALL)
    date_notification = re.search(date_notification_pattern, text, re.DOTALL)

    # Extracted information
    def convert_to_text(re_search_result):
        return re_search_result.group(1).strip() if re_search_result else ""
    
    name_text = convert_to_text(name)
    position_text = convert_to_text(position)
    company_name_text = convert_to_text(company_name)
    date_transaction_text = convert_to_text(date_transaction)
    nature_text = convert_to_text(nature)
    price_text = convert_to_text(price)
    volume_text = convert_to_text(volume)
    date_notification_text = convert_to_text(date_notification)

    text_dict = {'name': name_text, 'position': position_text, 'company_name': company_name_text,
                 'date_transaction': date_transaction_text, 'date_notification': date_notification_text,
                 'nature': nature_text, 'price': price_text, 'volume': volume_text}
    
    return text_dict

folder_path = "./amf-pdfs/"

# Find all PDF files in the folder
pdf_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".pdf")]
print(f'list of PDF files: {pdf_files}')
print(f'number of PDF files: {len(pdf_files)}')

data = []

# Extract information from each PDF and store it in the dataframe
for file in pdf_files:
    new_info_dict = extract_info_from_pdf(file)
    data.append(new_info_dict)
df = pd.DataFrame(data)

df.head(10)

list of PDF files: ['./amf-pdfs/DD_22_873197_9545767.pdf', './amf-pdfs/DD_22_873757_9553231.pdf', './amf-pdfs/DD_22_877789_9603925.pdf', './amf-pdfs/DD_22_876817_9591571.pdf', './amf-pdfs/DD_22_865073_9445323.pdf', './amf-pdfs/DD_22_854540_9315544.pdf', './amf-pdfs/DD_22_858763_9367927.pdf', './amf-pdfs/DD_22_850873_9269347.pdf', './amf-pdfs/DD_22_867041_9468981.pdf', './amf-pdfs/DD_22_855424_9326881.pdf', './amf-pdfs/DD_22_858417_9362399.pdf', './amf-pdfs/DD_22_862191_9410349.pdf', './amf-pdfs/DD_22_877384_9598691.pdf', './amf-pdfs/DD_22_864573_9439220.pdf', './amf-pdfs/DD_22_856274_9336808.pdf', './amf-pdfs/DD_22_860657_9391726.pdf', './amf-pdfs/DD_22_866275_9459891.pdf', './amf-pdfs/DD_22_852612_9290833.pdf', './amf-pdfs/DD_22_872763_9540376.pdf', './amf-pdfs/DD_22_867721_9478005.pdf', './amf-pdfs/DD_22_859322_9374858.pdf', './amf-pdfs/DD_22_851032_9271658.pdf', './amf-pdfs/DD_22_869222_9496603.pdf', './amf-pdfs/DD_22_866933_9467798.pdf', './amf-pdfs/DD_22_877831_9604342.pdf', './am

Unnamed: 0,name,position,company_name,date_transaction,date_notification,nature,price,volume
0,ARJIL COMMANDITEE-ARCO SOCIETE ANONYME personn...,PRESIDENT DIRECTEUR\nGENERAL,LAGARDERE SA,22 novembre 2022,23 novembre 2022,Acquisition,18.9857,1 860.0000
1,The Home Bar Bevtech LTD personne morale liée ...,Président Directeur Général,SMART GOOD THINGS HOLDING S.A.,28 novembre 2022,28 novembre 2022,Cession,158.0,265.0000
2,Paul DU SAILLANT,Directeur Général Délégué,ESSILORLUXOTTICA,23 décembre 2022,29 décembre 2022,Cession,169.853,8 831.7470
3,Nicolas HIERONIMUS,Directeur Général et Administrateur,L'OREAL,16 décembre 2022,19 décembre 2022,DONATION DE LA NUE PROPRIETE EFFECTUEE,0.0,350 994.0000
4,paul lorne,Directeur Général Délégué,SPARTOO,04 octobre 2022,05 octobre 2022,Acquisition,1.23,4 470.0000
5,UBFT association reconnue d'utilité publique,membre du conseil d'administration,LA FRANCAISE DES JEUX,27 juillet 2022,28 juillet 2022,Acquisition,33.7589,8 532.0000
6,Aymerick PENICAUT,PDG,ASHLER & MANSON,30 août 2022,31 août 2022,Cession,2.9,1.0000
7,Laurence ILHE,Administrateur et Directeur Général Délégué,CLASQUIN,05 juillet 2022,06 juillet 2022,Acquisition,58.0,40.0000
8,Guillaume Demulier,President du Directoire,ROCHE BOBOIS SA,01 juillet 2022,18 octobre 2022,Acquisition definitive d'actions gratuites,0.0,22 200.0000
9,HOLDING DES DHUITS Société privée à responsabi...,Membre du\nConseil de Surveillance,JCDECAUX SA,29 juillet 2022,01 août 2022,Acquisition,15.458,13 919.0000


In [16]:
from datetime import datetime
import locale

# Set the French locale for month names
locale.setlocale(locale.LC_TIME, 'fr_FR.UTF-8')

# Convert the 'date' column to datetime format
df['date_transaction'] = pd.to_datetime(df['date_transaction'], format='%d %B %Y', errors='coerce')
df['date_notification'] = pd.to_datetime(df['date_notification'], format='%d %B %Y', errors='coerce')

# Reset the locale to the default setting
locale.setlocale(locale.LC_TIME, '')

df.head(10)


Unnamed: 0,name,position,company_name,date_transaction,date_notification,nature,price,volume
0,ARJIL COMMANDITEE-ARCO SOCIETE ANONYME personn...,PRESIDENT DIRECTEUR\nGENERAL,LAGARDERE SA,2022-11-22,2022-11-23,Acquisition,18.9857,1 860.0000
1,The Home Bar Bevtech LTD personne morale liée ...,Président Directeur Général,SMART GOOD THINGS HOLDING S.A.,2022-11-28,2022-11-28,Cession,158.0,265.0000
2,Paul DU SAILLANT,Directeur Général Délégué,ESSILORLUXOTTICA,2022-12-23,2022-12-29,Cession,169.853,8 831.7470
3,Nicolas HIERONIMUS,Directeur Général et Administrateur,L'OREAL,2022-12-16,2022-12-19,DONATION DE LA NUE PROPRIETE EFFECTUEE,0.0,350 994.0000
4,paul lorne,Directeur Général Délégué,SPARTOO,2022-10-04,2022-10-05,Acquisition,1.23,4 470.0000
5,UBFT association reconnue d'utilité publique,membre du conseil d'administration,LA FRANCAISE DES JEUX,2022-07-27,2022-07-28,Acquisition,33.7589,8 532.0000
6,Aymerick PENICAUT,PDG,ASHLER & MANSON,2022-08-30,2022-08-31,Cession,2.9,1.0000
7,Laurence ILHE,Administrateur et Directeur Général Délégué,CLASQUIN,2022-07-05,2022-07-06,Acquisition,58.0,40.0000
8,Guillaume Demulier,President du Directoire,ROCHE BOBOIS SA,2022-07-01,2022-10-18,Acquisition definitive d'actions gratuites,0.0,22 200.0000
9,HOLDING DES DHUITS Société privée à responsabi...,Membre du\nConseil de Surveillance,JCDECAUX SA,2022-07-29,2022-08-01,Acquisition,15.458,13 919.0000


In [17]:
# Ensure Price and Volume are Float values
df['price'] = df['price'].astype(str).str.replace(' ', '').astype(float)
df['volume'] = df['volume'].astype(str).str.replace(' ', '').astype(float)
df.tail(10)

Unnamed: 0,name,position,company_name,date_transaction,date_notification,nature,price,volume
2278,MAKEMO CAPITAL SOCIETE PAR ACTIONS SIMPLIFIEE ...,"Président de AF&Co\nManagement, elle-même Géra...",TIKEHAU CAPITAL,2022-08-17,2022-08-18,Acquisition,25.2192,1152.0
2279,UBFT association reconnue d'utilité publique,membre du conseil d'administration,LA FRANCAISE DES JEUX,2022-07-11,2022-07-12,Acquisition,31.5202,9138.0
2280,UBFT association reconnue d'utilité publique,membre du conseil d'administration,LA FRANCAISE DES JEUX,2022-07-05,2022-07-06,Acquisition,30.9551,9306.0
2281,Gilles Auffret,Membre du conseil d'administration,ELIOR GROUP,2022-12-29,2022-12-29,Acquisition,3.2985,5000.0
2282,PARFININCO SA personne morale liée à Jacques M...,Président-Directeur Général,VICAT S.A.,2022-09-06,2022-09-07,Acquisition,24.5898,10000.0
2283,STEPHANE GERART,CIO - Directeur BU IA - Membre du comex,ONCODESIGN,2022-10-18,2022-10-20,Cession,14.42,17640.0
2284,ISABELLE SIMON,Secrétaire général,THALES,2022-09-28,2022-09-28,Acquisition gratuite d'actions,0.0,2493.0
2285,BOUYGUES SA,"ADMINISTRATEUR DE TF1, FILIALE DIRECTE DE BOUY...",TELEVISION FRANCAISE 1,2022-08-16,2022-08-17,Acquisition,6.7872,10000.0
2286,Aymerick PENICAUT,PDG,ASHLER & MANSON,2022-08-17,2022-08-19,Cession,2.84,242.0
2287,Eric BAISSUS,President du Directoire de Kalray,KALRAY,2022-08-10,2022-10-18,Cession,23.1546,2500.0


In [18]:
# company_names = df['company_name'].unique()

# ticker_mapping = {}

# for company_name in company_names:
#     ticker_mapping[company_name] = '.PA'

# ticker_mapping = dict(sorted(ticker_mapping.items()))

# print(ticker_mapping)

import ticker_mapping
ticker_mapping = ticker_mapping.ticker_mapping

In [19]:
company_names = df['company_name'].unique()
missing_keys = [company_name for company_name in company_names if company_name not in ticker_mapping]
print(f'missing keys: {missing_keys}')

df['ticker'] = df['company_name'].map(ticker_mapping)

df.head(15)

missing keys: []


Unnamed: 0,name,position,company_name,date_transaction,date_notification,nature,price,volume,ticker
0,ARJIL COMMANDITEE-ARCO SOCIETE ANONYME personn...,PRESIDENT DIRECTEUR\nGENERAL,LAGARDERE SA,2022-11-22,2022-11-23,Acquisition,18.9857,1860.0,MMB.PA
1,The Home Bar Bevtech LTD personne morale liée ...,Président Directeur Général,SMART GOOD THINGS HOLDING S.A.,2022-11-28,2022-11-28,Cession,158.0,265.0,MLSGT.PA
2,Paul DU SAILLANT,Directeur Général Délégué,ESSILORLUXOTTICA,2022-12-23,2022-12-29,Cession,169.853,8831.747,EL.PA
3,Nicolas HIERONIMUS,Directeur Général et Administrateur,L'OREAL,2022-12-16,2022-12-19,DONATION DE LA NUE PROPRIETE EFFECTUEE,0.0,350994.0,OR.PA
4,paul lorne,Directeur Général Délégué,SPARTOO,2022-10-04,2022-10-05,Acquisition,1.23,4470.0,ALSPT.PA
5,UBFT association reconnue d'utilité publique,membre du conseil d'administration,LA FRANCAISE DES JEUX,2022-07-27,2022-07-28,Acquisition,33.7589,8532.0,FDJ.PA
6,Aymerick PENICAUT,PDG,ASHLER & MANSON,2022-08-30,2022-08-31,Cession,2.9,1.0,MLAEM.PA
7,Laurence ILHE,Administrateur et Directeur Général Délégué,CLASQUIN,2022-07-05,2022-07-06,Acquisition,58.0,40.0,ALCLA.PA
8,Guillaume Demulier,President du Directoire,ROCHE BOBOIS SA,2022-07-01,2022-10-18,Acquisition definitive d'actions gratuites,0.0,22200.0,RBO.PA
9,HOLDING DES DHUITS Société privée à responsabi...,Membre du\nConseil de Surveillance,JCDECAUX SA,2022-07-29,2022-08-01,Acquisition,15.458,13919.0,DEC.PA


In [20]:
import yfinance as yf
from datetime import timedelta

# Get current and future stock prices and variations
def get_stock_prices(row):
    try:
        ticker = row['ticker']
        start_day = row['date_transaction']
        end_day = start_day + timedelta(days=90)
        start_day = start_day.strftime('%Y-%m-%d')
        end_day = end_day.strftime('%Y-%m-%d')

        stock = yf.Ticker(ticker)
        stock_data = stock.history(start=start_day, end=end_day)
        prices = stock_data['Open']
        prices_dict = {
            'stock_price_open_d0': prices.values[0],
            'var_d1': prices.values[1] / prices.values[0],
            'var_d3': prices.values[3] / prices.values[0],
            'var_d5': prices.values[5] / prices.values[0],
            'var_d10': prices.values[10] / prices.values[0],
            'var_d20': prices.values[20] / prices.values[0],
            'var_d30': prices.values[30] / prices.values[0],
            'var_d60': prices.values[60] / prices.values[0]
        }
        return pd.Series(prices_dict)
    except:
        return {
            'stock_price_open_d0': float('nan'),
            'var_d1': float('nan'),
            'var_d3': float('nan'),
            'var_d5': float('nan'),
            'var_d10': float('nan'),
            'var_d20': float('nan'),
            'var_d30': float('nan'),
            'var_d60': float('nan')
        }

# Applying the function to new columns
df = pd.concat([df, df.apply(get_stock_prices, axis=1)], axis=1)

df.head(15)

df.to_pickle('amf-data.pkl')

- ???.PA: No data found for this date range, symbol may be delisted
- SECH.PA: No data found for this date range, symbol may be delisted
- SECH.PA: No data found for this date range, symbol may be delisted
- LTAN.PA: No data found, symbol may be delisted
- NGP.PA: No data found, symbol may be delisted
- MLPAP.PA: No data found, symbol may be delisted
- MLPAP.PA: No data found, symbol may be delisted
- LTAN.PA: No data found, symbol may be delisted
- SECH.PA: No data found for this date range, symbol may be delisted
- ???.PA: No data found for this date range, symbol may be delisted
- ???.PA: No data found for this date range, symbol may be delisted
- SECH.PA: No data found for this date range, symbol may be delisted
- MLPAP.PA: No data found, symbol may be delisted
- ROTH.PA: No data found, symbol may be delisted
- ???.PA: No data found for this date range, symbol may be delisted
- SECH.PA: No data found for this date range, symbol may be delisted
- ???.PA: No data found for this date 

In [None]:
import pandas as pd

# Test if cession and acquisition future variation are going up or down
amf_data = pd.read_pickle('amf-data.pkl')
amf_df = pd.DataFrame(amf_data)

amf_df = amf_df[amf_df['price'] != 0]

amf_df = amf_df[amf_df['position'].isin(['Président Directeur Général', 'PDG', 'Président-Directeur Général'])]

print(f'data length: {len(amf_df)}')

df_cession = amf_df[amf_df['nature'] == 'Cession']
df_acquisition = amf_df[amf_df['nature'] == 'Acquisition']

var_means = {
    'cession': {
        'd1': df_cession['var_d1'].mean(),
        'd3': df_cession['var_d3'].mean(),
        'd5': df_cession['var_d5'].mean(),
        'd10': df_cession['var_d10'].mean(),
        'd20': df_cession['var_d20'].mean(),
        'd30': df_cession['var_d30'].mean(),
        'd60': df_cession['var_d60'].mean(),
    },
    'acquisition' : {
        'd1': df_acquisition['var_d1'].mean(),
        'd3': df_acquisition['var_d3'].mean(),
        'd5': df_acquisition['var_d5'].mean(),
        'd10': df_acquisition['var_d10'].mean(),
        'd20': df_acquisition['var_d20'].mean(),
        'd30': df_acquisition['var_d30'].mean(),
        'd60': df_acquisition['var_d60'].mean(),
    }
}

var_medians = {
    'cession': {
        'd1': df_cession['var_d1'].median(),
        'd3': df_cession['var_d3'].median(),
        'd5': df_cession['var_d5'].median(),
        'd10': df_cession['var_d10'].median(),
        'd20': df_cession['var_d20'].median(),
        'd30': df_cession['var_d30'].median(),
        'd60': df_cession['var_d60'].median(),
    },
    'acquisition' : {
        'd1': df_acquisition['var_d1'].median(),
        'd3': df_acquisition['var_d3'].median(),
        'd5': df_acquisition['var_d5'].median(),
        'd10': df_acquisition['var_d10'].median(),
        'd20': df_acquisition['var_d20'].median(),
        'd30': df_acquisition['var_d30'].median(),
        'd60': df_acquisition['var_d60'].median(),
    }
}

var_means = {key: {inner_key: round(inner_value, 2) for inner_key, inner_value in value.items()} for key, value in var_means.items()}
var_medians = {key: {inner_key: round(inner_value, 2) for inner_key, inner_value in value.items()} for key, value in var_medians.items()}

print(var_means)
print(var_medians)