**Form 4 filings, Part 1**

In the following code cell, provide the code for your function.

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

def CreateDataFrame(basefolder):
    """Build a DataFrame of non-derivative transactions from Form 4 filings.

    Walks ``basefolder`` recursively, reads every ``.txt`` file, cuts out the
    ``<ownershipDocument>...</ownershipDocument>`` section and emits one row
    per ``nonDerivativeTransaction`` element found in it. Filings that do not
    list exactly one ``rptOwnerName`` contribute no rows.

    Parameters
    ----------
    basefolder : str or path-like
        Root directory to search for ``.txt`` filings.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Folder, OwnerName, IsDir, IsOff, IsTen, SecTitle,
        TransDate, Shares, PPS, ADCode, SharesAfter, DIOwner. ``Folder`` is
        the name of the directory that directly contains the filing. All other
        values are the raw XML text. A transaction field whose element is
        absent is reported as ``''``; an absent relationship flag as ``'0'``.
        When nothing is found, an empty frame with these columns is returned.

    Notes
    -----
    A filing whose ownership section is not well-formed XML is skipped with a
    ``UserWarning`` instead of aborting the whole walk.
    """
    import warnings  # local import: only needed to report skipped filings

    # Explicit schema: fixes the column order and keeps it stable even when
    # no transaction is found at all.
    columns = [
        'Folder', 'OwnerName', 'IsDir', 'IsOff', 'IsTen', 'SecTitle',
        'TransDate', 'Shares', 'PPS', 'ADCode', 'SharesAfter', 'DIOwner',
    ]

    def text_or(node, path, default=''):
        """Return the text of the first element matching ``path``, or ``default`` if absent."""
        found = node.find(path)
        return default if found is None else found.text

    def extract_own(file_content):
        """Return the ``<ownershipDocument>`` section of a filing, or None if incomplete."""
        start = '<ownershipDocument>'
        end = '</ownershipDocument>'
        begin = file_content.find(start)
        if begin == -1:
            return None
        # Look for the closing tag only after the opening one.
        finish = file_content.find(end, begin)
        if finish == -1:
            return None
        return file_content[begin:finish + len(end)]

    def make_features(own_content):
        """Parse one ownership document into a list of per-transaction dicts."""
        root = ET.fromstring(own_content)

        # Only filings with exactly one reporting owner are kept.
        owner_tags = root.findall('.//rptOwnerName')
        if len(owner_tags) != 1:
            return []
        owner_name = owner_tags[0].text

        is_dir = text_or(root, './/reportingOwnerRelationship/isDirector', '0')
        is_off = text_or(root, './/reportingOwnerRelationship/isOfficer', '0')
        is_ten = text_or(root, './/reportingOwnerRelationship/isTenPercentOwner', '0')

        # Optional elements (e.g. a price given only as a footnote) must not
        # crash the run, so every field falls back to ''.
        return [
            {
                'OwnerName': owner_name,
                'IsDir': is_dir,
                'IsOff': is_off,
                'IsTen': is_ten,
                'SecTitle': text_or(transaction, './/securityTitle/value'),
                'TransDate': text_or(transaction, './/transactionDate/value'),
                'Shares': text_or(transaction, './/transactionShares/value'),
                'PPS': text_or(transaction, './/transactionPricePerShare/value'),
                'ADCode': text_or(transaction, './/transactionAcquiredDisposedCode/value'),
                'SharesAfter': text_or(
                    transaction,
                    './/postTransactionAmounts/sharesOwnedFollowingTransaction/value',
                ),
                'DIOwner': text_or(
                    transaction,
                    './/ownershipNature/directOrIndirectOwnership/value',
                ),
            }
            for transaction in root.findall('.//nonDerivativeTransaction')
        ]

    all_transactions = []
    for root_dir, _, files in os.walk(basefolder):
        folder_name = os.path.basename(root_dir)
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root_dir, file)
            # Explicit encoding plus errors='replace' so a stray byte in one
            # filing cannot raise UnicodeDecodeError and stop the walk.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as txt_file:
                content = txt_file.read()
            own_content = extract_own(content)
            if not own_content:
                continue
            try:
                transactions = make_features(own_content)
            except ET.ParseError as exc:
                warnings.warn(
                    f"Skipping {file_path}: malformed ownershipDocument XML ({exc})"
                )
                continue
            for transaction in transactions:
                transaction['Folder'] = folder_name
                all_transactions.append(transaction)

    return pd.DataFrame(all_transactions, columns=columns)

**Read in the cleaned up dataset from last time**

In [5]:
# Folder holding the downloaded PFE Form 4 filings. Set the FORM4_DIR
# environment variable to run this notebook on another machine; the default
# is the original author-local location.
FORM4_DIR = os.environ.get(
    "FORM4_DIR",
    "/Users/lisonghe/Library/CloudStorage/OneDrive-JohnsHopkins/Semester 1/688 Computing for AM/Final project/PFE/4",
)
df = CreateDataFrame(FORM4_DIR)
df.head(15)

Unnamed: 0,Folder,OwnerName,IsDir,IsOff,IsTen,SecTitle,TransDate,Shares,PPS,ADCode,SharesAfter,DIOwner
0,0001225208-17-016837,YOUNG JOHN D,0,1,0,Common Stock,2017-11-01,4219.0,0.0,D,124257.0,D
1,0001225208-22-003668,Carapezzi William R JR,0,1,0,Common Stock,2022-02-26,21193.0,25.6,A,87363.0,D
2,0001225208-22-003668,Carapezzi William R JR,0,1,0,Common Stock,2022-02-26,4619.0,47.72,D,82744.0,D
3,0001225208-22-003668,Carapezzi William R JR,0,1,0,Common Stock,2022-02-26,9895.0,50.64,D,72849.0,D
4,0001225208-22-003668,Carapezzi William R JR,0,1,0,Common Stock,2022-02-28,1795.0,46.94,D,71054.0,D
5,0001225208-21-004300,BOURLA ALBERT,1,1,0,Common Stock,2021-02-27,53925.0,22.35,A,165034.0,D
6,0001225208-21-004300,BOURLA ALBERT,1,1,0,Common Stock,2021-02-27,9664.0,33.49,D,155370.0,D
7,0001225208-21-004300,BOURLA ALBERT,1,1,0,Common Stock,2021-02-27,34191.0,34.64,D,121179.0,D
8,0000078003-06-000035,LEVIN ALAN G,0,1,0,Common Stock,2006-02-23,23070.0,0.0,A,335099.0,D
9,0000078003-06-000035,LEVIN ALAN G,0,1,0,Common Stock,2006-02-23,48720.0,0.0,A,383819.0,D


In [3]:
# Write the transactions table to disk; the row index is not written.
df.to_csv('Part 1B.csv', index=False)