In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [14]:
df = pd.read_csv('corporate_finance_data.csv')
df.head()

Unnamed: 0,title,summary,content,links,url
0,Corporate finance,Corporate financeis an area offinancethat deal...,Asset and liability management\nBusiness plan\...,"['/wiki/Non-voting_stock', '/wiki/State_prices...",https://en.wikipedia.org/wiki/Corporate_finance
1,Principles of Corporate Finance,Principles of Corporate Financeis a reference ...,Principles of Corporate Financeis a reference ...,"['/wiki/Private_equity', '/wiki/Warrant_(finan...",https://en.wikipedia.org/wiki/Principles_of_Co...
2,Corporate Finance Institute,,,['/wiki/Geographic_coordinate_system'],https://en.wikipedia.org/wiki/Corporate_Financ...
3,Finance,Financerefers to monetary resources and to the...,Asset (economics)\nBond\nAsset growth\nCapital...,"['/wiki/Deposit_insurance', '/wiki/Professiona...",https://en.wikipedia.org/wiki/Finance
4,Outline of corporate finance,The followingoutlineis provided as an overview...,Asset and liability management\nBusiness plan\...,"['/wiki/Non-voting_stock', '/wiki/Financial_ri...",https://en.wikipedia.org/wiki/Outline_of_corpo...


## Data Wrangling

In [15]:
# Check the shape
print("Our dataset has {} samples and {} features.".format(*df.shape))

# Check for missing values
missing = df.isnull().sum()
missing = missing[missing > 0]
missing_percentage = missing / df.shape[0] * 100
missing_info = pd.DataFrame({'Missing Values': missing, 'Percentage': missing_percentage})
print(missing_info)

Our dataset has 9989 samples and 5 features.
         Missing Values  Percentage
summary             848    8.489338
content             842    8.429272


In [16]:
# Imputing summary and content features
# Since it is categorical, we will replace missing values with the mode
df['summary'] = df['summary'].fillna(df['summary'].mode()[0])
df['content'] = df['summary'].fillna(df['content'].mode()[0])

# Drop the links and url features as they are not useful
#df = df.drop(['url', 'links'], axis=1)

# Get categorical summary statistics
cat_summary = df.describe(include=['object']).T

# Add additional information
cat_summary['missing'] = df.isnull().sum()
cat_summary['unique'] = df.nunique()
cat_summary['dtype'] = df.dtypes

# Display the results
print("\nCategorical Variables Summary:")
print(cat_summary)


Categorical Variables Summary:
        count  unique                                              top freq  \
title    9989    8799                                Bank of East Asia    3   
summary  9989    7984                                               \n  929   
content  9989    7984                                               \n  929   
links    9989    8070           ['/wiki/Geographic_coordinate_system']  695   
url      9989    8799  https://en.wikipedia.org/wiki/Bank_of_East_Asia    3   

         missing   dtype  
title          0  object  
summary        0  object  
content        0  object  
links          0  object  
url            0  object  


In [17]:
df.head()

Unnamed: 0,title,summary,content,links,url
0,Corporate finance,Corporate financeis an area offinancethat deal...,Corporate financeis an area offinancethat deal...,"['/wiki/Non-voting_stock', '/wiki/State_prices...",https://en.wikipedia.org/wiki/Corporate_finance
1,Principles of Corporate Finance,Principles of Corporate Financeis a reference ...,Principles of Corporate Financeis a reference ...,"['/wiki/Private_equity', '/wiki/Warrant_(finan...",https://en.wikipedia.org/wiki/Principles_of_Co...
2,Corporate Finance Institute,\n,\n,['/wiki/Geographic_coordinate_system'],https://en.wikipedia.org/wiki/Corporate_Financ...
3,Finance,Financerefers to monetary resources and to the...,Financerefers to monetary resources and to the...,"['/wiki/Deposit_insurance', '/wiki/Professiona...",https://en.wikipedia.org/wiki/Finance
4,Outline of corporate finance,The followingoutlineis provided as an overview...,The followingoutlineis provided as an overview...,"['/wiki/Non-voting_stock', '/wiki/Financial_ri...",https://en.wikipedia.org/wiki/Outline_of_corpo...


In [18]:
# We want to do away completely with \n, \r, \t, and other special characters across the dataset
df = df.replace(r'\n', ' ', regex=True)
df = df.replace(r'\r', ' ', regex=True)
df = df.replace(r'\t', ' ', regex=True)
df = df.replace(r'\\n', ' ', regex=True)
df = df.replace(r'\\r', ' ', regex=True)
df = df.replace(r'\\t', ' ', regex=True)
df = df.replace(r'\s+', ' ', regex=True)

# Check the first 5 rows
df.head()

Unnamed: 0,title,summary,content,links,url
0,Corporate finance,Corporate financeis an area offinancethat deal...,Corporate financeis an area offinancethat deal...,"['/wiki/Non-voting_stock', '/wiki/State_prices...",https://en.wikipedia.org/wiki/Corporate_finance
1,Principles of Corporate Finance,Principles of Corporate Financeis a reference ...,Principles of Corporate Financeis a reference ...,"['/wiki/Private_equity', '/wiki/Warrant_(finan...",https://en.wikipedia.org/wiki/Principles_of_Co...
2,Corporate Finance Institute,,,['/wiki/Geographic_coordinate_system'],https://en.wikipedia.org/wiki/Corporate_Financ...
3,Finance,Financerefers to monetary resources and to the...,Financerefers to monetary resources and to the...,"['/wiki/Deposit_insurance', '/wiki/Professiona...",https://en.wikipedia.org/wiki/Finance
4,Outline of corporate finance,The followingoutlineis provided as an overview...,The followingoutlineis provided as an overview...,"['/wiki/Non-voting_stock', '/wiki/Financial_ri...",https://en.wikipedia.org/wiki/Outline_of_corpo...


In [21]:
# Drop url and links columns
df = df.drop(['url', 'links'], axis=1)
df.head()

Unnamed: 0,title,summary,content
0,Corporate finance,Corporate financeis an area offinancethat deal...,Corporate financeis an area offinancethat deal...
1,Principles of Corporate Finance,Principles of Corporate Financeis a reference ...,Principles of Corporate Financeis a reference ...
2,Corporate Finance Institute,,
3,Finance,Financerefers to monetary resources and to the...,Financerefers to monetary resources and to the...
4,Outline of corporate finance,The followingoutlineis provided as an overview...,The followingoutlineis provided as an overview...


In [22]:
# Define appropriate content for Corporate Finance Institute
cfi_summary = "Corporate Finance Institute (CFI) is a leading provider of online financial analyst certification programs and courses focused on financial modeling, valuation, and analytics."

cfi_content = "Corporate Finance Institute (CFI) is a global provider of financial analyst certification programs and professional development courses. It offers comprehensive training in financial modeling, valuation, corporate finance, business intelligence, and related financial topics. The institution provides industry-recognized certifications including the FMVA (Financial Modeling & Valuation Analyst), CBCA (Commercial Banking & Credit Analyst), and CMSA (Capital Markets & Securities Analyst) designations."

# Update the specific row
df.loc[2, 'summary'] = cfi_summary
df.loc[2, 'content'] = cfi_content

# Verify update
print(df.iloc[2])

# Check for missing values
print("Missing values in the dataset: ", df.isnull().sum().sum())

title                            Corporate Finance Institute
summary    Corporate Finance Institute (CFI) is a leading...
content    Corporate Finance Institute (CFI) is a global ...
Name: 2, dtype: object
Missing values in the dataset:  0


In [23]:
df.head()

Unnamed: 0,title,summary,content
0,Corporate finance,Corporate financeis an area offinancethat deal...,Corporate financeis an area offinancethat deal...
1,Principles of Corporate Finance,Principles of Corporate Financeis a reference ...,Principles of Corporate Financeis a reference ...
2,Corporate Finance Institute,Corporate Finance Institute (CFI) is a leading...,Corporate Finance Institute (CFI) is a global ...
3,Finance,Financerefers to monetary resources and to the...,Financerefers to monetary resources and to the...
4,Outline of corporate finance,The followingoutlineis provided as an overview...,The followingoutlineis provided as an overview...


In [24]:
# Save the cleaned dataset
df.to_csv('cleaned_corporate_finance_data.csv', index=False)