#### This file preprocesses the text file and extracts the information from the filings item wise.

In [48]:
import os

# Specify the folder path
folder_path = "MC Filings/"

# Initialize an empty array to store the data
data_array = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a text file
        # Construct the full file path
        file_path = os.path.join(folder_path, filename)
        print(file_path)
        # Read the contents of the file
        with open(file_path, "r", encoding="utf-8") as file:
            # Read the contents and append to the data array
            data = file.read()
            data_array.append(data)

# Print the data array to verify
print(len(data_array))


MC Filings/ms1
MC Filings/ms10
MC Filings/ms11
MC Filings/ms12
MC Filings/ms13
MC Filings/ms14
MC Filings/ms2
MC Filings/ms3
MC Filings/ms4
MC Filings/ms5
MC Filings/ms6
MC Filings/ms7
MC Filings/ms8
MC Filings/ms9
14


#### Extract Years from the filings

In [49]:
year_array=[]
#Get Year 
import re

# Define the regular expression pattern
pattern = r"(?:FOR THE YEAR ENDED DECEMBER 31,|FOR THE YEAR ENDED DECEMBER\xa031,|THE\xa0YEAR ENDED DECEMBER\xa031,|the fiscal year ended December 31,)(.{5})"

for text in data_array:
    match = re.search(pattern, text)
    # If a match is found, extract the next 5 characters which form the year
    if match:
        next_characters = match.group(1)
        year_array.append(next_characters)
    else:
        year_array.append(None)
        print("Pattern not found in the text.")


Pattern not found in the text.


In [51]:
year_array

[' 2023',
 ' 2014',
 ' 2013',
 ' 2012',
 ' 2011',
 ' 2010',
 ' 2022',
 ' 2021',
 ' 2020',
 ' 2019 ',
 ' 2018',
 ' 2017',
 ' 2016',
 ' 2015']

In [52]:
data_array[0]

"2023-12-31UNITED STATESSECURITIES AND EXCHANGE COMMISSIONWashington, D.C. 20549Form10-K☒ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended December\xa031, 2023 Or☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 to \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0Commission file number: 001-32877 Mastercard Incorporated(Exact name of registrant as specified in its charter)Delaware13-4172551(State or other jurisdiction ofincorporation or organization)(IRS EmployerIdentification Number)2000 Purchase Street Purchase,NY10577(Address of principal executive offices)(Zip Code)(914) 249-2000 (Registrant’s telephone number, including area code)Securities registered pursuant to Section 12(b) of the Act:Title of each classTrading SymbolName of each exchange of which registeredClass A Common Stock, par value $0.0001 per sh

In [53]:
#extracting only the required info by removing the table of contents and other metadata
updated_data_array=[]
for data in data_array:
    updated_data_array.append("Business: "+" ".join(data.split("Business")[1:]))

In [54]:
updated_data_array[5]

'Business: '

#### Data Cleaning

In [55]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

def cleaning(text):
    # Remove HTML tags
    cleaned_text = re.sub(r'<[^>]+>', '', text)
    # Convert to lowercase
    cleaned_text = cleaned_text.lower()
    tokens = word_tokenize(cleaned_text)
    #Remove punctuations
    tokens_without_punct = [token for token in tokens if token not in string.punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens_without_punct if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    return lemmatized_tokens

In [56]:
cleaned_data=[]
for data in updated_data_array:
    cleaned_data.append(cleaning(data))

In [57]:
cleaned_data[0]

['business',
 '27item',
 '1a.risk',
 'factors41item',
 '1b.unresolved',
 'staff',
 'comments41item',
 '1c.cybersecurity43item',
 '2.properties43item',
 '3.legal',
 'proceedings43item',
 '4.mine',
 'safety',
 'disclosures44-information',
 'executive',
 'officerspart',
 'ii47item',
 '5.market',
 'registrant',
 '’',
 'common',
 'equity',
 'related',
 'stockholder',
 'matter',
 'issuer',
 'purchase',
 'equity',
 'securities47item',
 '6.reserved48item',
 '7.management',
 '’',
 'discussion',
 'analysis',
 'financial',
 'condition',
 'result',
 'operations62item',
 '7a.quantitative',
 'qualitative',
 'disclosure',
 'market',
 'risk64item',
 '8.financial',
 'statement',
 'supplementary',
 'data115item',
 '9.changes',
 'disagreement',
 'accountant',
 'accounting',
 'financial',
 'disclosure115item',
 '9a.controls',
 'procedures116item',
 '9b.other',
 'informationpart',
 'iii118item',
 '10.directors',
 'executive',
 'officer',
 'corporate',
 'governance118item',
 '11.executive',
 'compensation11

In [58]:
#split on the basis of items_number so that required items can be extracted
data_splitted_at_items=[]
for data in cleaned_data:
    text=" ".join(data)
    #remove unwanted unicode chars
    clean_text = text.replace("\u200b", "")
    items = re.split(r'(item \d+\.)', text)
    data_splitted_at_items.append(items)
    print(len(items))

295
69
69
67
67
1
293
291
283
291
57
61
61
59


In [59]:
txt=" ".join(cleaned_data[0])
items = re.split(r'(item \d+\.)', txt)
items

['business 27item 1a.risk factors41item 1b.unresolved staff comments41item 1c.cybersecurity43',
 'item 2.',
 'properties43',
 'item 3.',
 'legal proceedings43',
 'item 4.',
 'mine safety disclosures44-information executive officerspart ii47',
 'item 5.',
 'market registrant ’ common equity related stockholder matter issuer purchase equity securities47',
 'item 6.',
 'reserved48',
 'item 7.',
 'management ’ discussion analysis financial condition result operations62item 7a.quantitative qualitative disclosure market risk64',
 'item 8.',
 'financial statement supplementary data115',
 'item 9.',
 'changes disagreement accountant accounting financial disclosure115item 9a.controls procedures116item 9b.other informationpart iii118',
 'item 10.',
 'directors executive officer corporate governance118',
 'item 11.',
 'executive compensation118',
 'item 12.',
 'security ownership certain beneficial owner management related stockholder matters118',
 'item 13.',
 'certain relationship related trans

### These are the sections which will be focused.
### Apart from this entire 10-K filing is also stored after preprocessing. However above sections were explored in much more depth

In [60]:
# #Sections:
# 1. business - (includes risk factors, unresolved staff comments cybersecurity)
# 2. property-item 2.
# 3. legal proceedings-item 3.
# 4. market registrant-item 5.
# 5. exhibit financial statement schedule-item 15.

#Visualization:
# GPT API is able to extract numerical values from market registrant section

In [65]:
data_splitted_at_items[6]

['business 25item 1a.risk factors40item 1b.unresolved staff comments40',
 'item 2.',
 'properties40',
 'item 3.',
 'legal proceedings40',
 'item 4.',
 'mine safety disclosures41-information executive officerspart ii44',
 'item 5.',
 'market registrant ’ common equity related stockholder matter issuer purchase equity securities44',
 'item 6.',
 'reserved45',
 'item 7.',
 'management ’ discussion analysis financial condition result operations60item 7a.quantitative qualitative disclosure market risk62',
 'item 8.',
 'financial statement supplementary data113',
 'item 9.',
 'changes disagreement accountant accounting financial disclosure113item 9a.controls procedures113item 9b.other informationpart iii115',
 'item 10.',
 'directors executive officer corporate governance115',
 'item 11.',
 'executive compensation115',
 'item 12.',
 'security ownership certain beneficial owner management related stockholder matters115',
 'item 13.',
 'certain relationship related transaction director indepen

In [67]:
data_splitted_at_items.pop(5)

['business']

In [68]:
data_array=[]
i=0
for data in data_splitted_at_items:
    print(i)
    item1=data.index('item 1.')

    d={}
    d['Year']=year_array[i]
    if item1:
        data=data[item1:]
        business_text=''
        idx=0
        for j in range(len(data)):
            if data[j]=='item 1.':
                business_text+=data[j+1]
                idx=j
        if i==0:
            business_text=business_text.split('business')[1:]
            business_text=" ".join(business_text)
        business_text=business_text.split('item 1a')
        d['Business']=business_text[0]
        d['Risk Factors and Cyber Security']=business_text[1:]
#         data=data[idx:]
#         print(idx)
    else:
        d['Business']=data[1]   
        
    item2=data.index('item 2.')
    if item2:
        data=data[item2:]
        business_text=''
        idx=0
        for j in range(len(data)):
            if data[j]=='item 2.':
                business_text+=data[j+1]
                idx=j
        d['property']=business_text
    item3=data.index('item 3.')
    if item3:
        data=data[item3:]
        business_text=''
        idx=0
        for j in range(len(data)):
            if data[j]=='item 3.':
                business_text+=data[j+1]
                idx=j
        d['legal proceedings']=business_text
    item5=data.index('item 5.')
    if item5:
        data=data[item5:]
        business_text=''
        idx=0
        for j in range(len(data)):
            if data[j]=='item 5.':
                business_text+=data[j+1]
                idx=j
        d['market registrant']=business_text
    item2=data.index('item 15.')
    if item2:
        data=data[item2:]
        business_text=''
        idx=0
        for j in range(len(data)):
            if data[j]=='item 15.':
                business_text+=data[j+1]
                idx=j
        d['exhibit financial statement schedule']=business_text
#     item2=data.index('item 2.')
#     item3=data.index('item 3.')
#     item5=data.index('item 5.')
#     item15=data.index('item 15.')
#     print(item2,item3,item5,item15)
#     d['property']=data[item2+1]
#     d['legal proceedings']=data[item3+1]
#     d['market registrant']=data[item5+1]
#     d['exhibit financial statement schedule']=data[item15+1]
    data_array.append(d)
    i+=1

0
1
2
3
4
5
6
7
8
9
10
11
12


In [69]:
data_array[0]['Business']

' overviewmastercard technology company global payment industry connect consumer financial institution merchant government digital partner   organization worldwide enabling electronic payment making payment transaction safe simple smart accessible make payment easier efficient providing wide range payment solution service using family well-known trusted brand including mastercard® maestro® cirrus® operate multi-rail payment network provides choice flexibility consumer merchant customer unique proprietary core global payment network switch authorize clear settle payment transaction additional payment capability include automated clearing house “ ach ” transaction batch real-time account-based payment using capability offer payment product service capture new payment flow value-added service include among others cyber intelligence solution designed allow party transact securely easily confidence well service provide proprietary insight drawing principled responsible use secure consumer m

In [70]:
len(data_array)

13

In [71]:
import json
file_path = "MC_filings_preprocessed.json"

# Writing dictionary to JSON file
with open(file_path, "w") as json_file:
    json.dump(data_array, json_file, indent=4)

In [72]:
file_path = "MC_filings_preprocessed_Entire_data.json"

# Writing dictionary to JSON file
with open(file_path, "w") as json_file:
    json.dump([year_array,data_splitted_at_items], json_file, indent=4)