Step 1: Install the Dependencies

In [None]:
!pip install pandas



Step 2: Loading the SME Dataset

In [None]:
import pandas as pd

# Read the SME/company records from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="company_data.csv")

# Plain-text preview of the first five rows
print(df.head(n=5))

                 Company Name    Industry  \
0             Johnson-Sanchez    Engineer   
1             Richardson-Pugh  Strategize   
2  Conrad, Mejia and Ferguson  Facilitate   
3                   Jones Ltd      Target   
4                  Roy-Miller  Whiteboard   

                                       Location  Size (Number of employees)  \
0  East Collintown, South Carolina, Puerto Rico                         727   
1             Lake Jennifer, Wisconsin, Armenia                        7454   
2        Hallport, Pennsylvania, Western Sahara                        3695   
3                   West Amyview, Ohio, Moldova                         922   
4                 South Margaret, Indiana, Iraq                        3641   

  Size (Revenue)                 Wallet Address (Public key)   GSTN (GST no.)  \
0     $4 million  0xb9a6b5b4a886e0c92393713974820aeea914a2dd  7IVT3XH1N007QLN   
1    $14 million  0xaacfd03813adf50aa46b6d5d0740aeb09e0bd8ff  7VPIZPBK069Q6QG   
2   $191 mi

Step 3: Data Cleaning

In [None]:
# Count the null entries per column and print the totals
missing_values = df.isna().sum()
print(missing_values)


Company Name                   0
Industry                       0
Location                       0
Size (Number of employees)     0
Size (Revenue)                 0
Wallet Address (Public key)    0
GSTN (GST no.)                 0
IT returns                     0
Aadhar no.                     0
PAN no.                        0
Amount of Funding Needed       0
Purpose of Funding             0
Preferred Funding Type         0
Mission Statement              0
Products/Services              0
Objectives                     0
Key Achievements               0
Industry Overview              0
Target Market                  0
Market Needs                   0
Market Trends                  0
Competitive Analysis           0
Revenue Streams                0
Cost Structure                 0
Key Partnerships               0
Distribution Channels          0
Customer Segments              0
Revenue Projections            0
Profit Projections             0
Customer Growth                0
Market Sha

In [None]:
# Keep only the rows in which every column has a value.
# NOTE(review): the later preprocessing cells shown here read `df`, not
# `df_cleaned` - confirm whether the cleaned frame was meant to be used.
df_cleaned = df.dropna(axis=0, how='any')

# Alternative: keep every row and substitute a placeholder for the gaps
# df_cleaned = df.fillna('N/A')


In [None]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept)
df_cleaned = df_cleaned.drop_duplicates(keep='first')


Step 4: Data Preprocessing

In [None]:
def extract_numeric_value(revenue_str):
    """Pull the first dollar amount out of a free-text revenue string.

    The number is returned as written after the ``$`` sign; a scale word such
    as "million" is not applied (``"$491 million"`` -> ``491.0``).

    Args:
        revenue_str: Text such as ``"$491 million (next fiscal year)"``.
            Non-string input (for example the float NaN that stands in for an
            empty cell) is treated as "no value".

    Returns:
        The amount as a float, or None when no dollar amount is present.
    """
    import re

    # re.search raises TypeError on non-strings, so bail out early for them.
    if not isinstance(revenue_str, str):
        return None

    # Digits with optional thousands separators plus an optional decimal part.
    # Requiring the integer part to end in a digit keeps trailing punctuation
    # ("$1,234,") out of the capture and rejects a bare "$,".
    match = re.search(r'\$([\d,]*\d(?:\.\d+)?)', revenue_str)
    if match is None:
        return None

    # Strip the thousands separators before converting.
    return float(match.group(1).replace(',', ''))

# Parse the 'Revenue Projections' text into a numeric helper column
df['revenue_projections_numeric'] = df['Revenue Projections'].apply(extract_numeric_value)

# Show the source text next to the parsed number as a sanity check
print(df.loc[:, ['Revenue Projections', 'revenue_projections_numeric']].head())

               Revenue Projections  revenue_projections_numeric
0  $491 million (next fiscal year)                        491.0
1  $438 million (next fiscal year)                        438.0
2  $315 million (next fiscal year)                        315.0
3  $447 million (next fiscal year)                        447.0
4  $433 million (next fiscal year)                        433.0


In [None]:
# Inspect the column labels currently present on the frame (shown as the cell output)
df.columns

Index(['Company Name', 'Industry', 'Location', 'Size (Number of employees)',
       'Size (Revenue)', 'Wallet Address (Public key)', 'GSTN (GST no.)',
       'IT returns', 'Aadhar no.', 'PAN no.', 'Amount of Funding Needed',
       'Purpose of Funding', 'Preferred Funding Type', 'Mission Statement',
       'Products/Services', 'Objectives', 'Key Achievements',
       'Industry Overview', 'Target Market', 'Market Needs', 'Market Trends',
       'Competitive Analysis', 'Revenue Streams', 'Cost Structure',
       'Key Partnerships', 'Distribution Channels', 'Customer Segments',
       'Revenue Projections', 'Profit Projections', 'Customer Growth',
       'Market Share Projections', 'Expansion Plans',
       'Company Registration Number', 'Country of Incorporation',
       'Tax Identification Number', 'Patent/Trademark Number', 'Type'],
      dtype='object')

In [None]:
import pandas as pd
import re

# Re-read the raw company file; this rebinds `df`, so columns added to the
# previous `df` are not carried over.
df = pd.read_csv(filepath_or_buffer="company_data.csv")

# Function to extract numeric value from the revenue_projections column
def extract_numeric_value(revenue_str):
    """Pull the first dollar amount out of a free-text revenue string.

    The number is returned as written after the ``$`` sign; a scale word such
    as "million" is not applied (``"$491 million"`` -> ``491.0``).

    Args:
        revenue_str: Text such as ``"$491 million (next fiscal year)"``.
            Non-string input (for example the float NaN that stands in for an
            empty cell) is treated as "no value".

    Returns:
        The amount as a float, or None when no dollar amount is present.
    """
    # re.search raises TypeError on non-strings, so bail out early for them.
    if not isinstance(revenue_str, str):
        return None

    # Digits with optional thousands separators plus an optional decimal part.
    # Requiring the integer part to end in a digit keeps trailing punctuation
    # ("$1,234,") out of the capture and rejects a bare "$,".
    match = re.search(r'\$([\d,]*\d(?:\.\d+)?)', revenue_str)
    if match is None:
        return None

    # Strip the thousands separators before converting.
    return float(match.group(1).replace(',', ''))

# Parse the 'Revenue Projections' text into a numeric helper column
df['revenue_projections_numeric'] = df['Revenue Projections'].apply(extract_numeric_value)

# Show the source text next to the parsed number as a sanity check
print(df.loc[:, ['Revenue Projections', 'revenue_projections_numeric']].head())

# One-hot encode the categorical columns: each listed column is replaced by
# one indicator column per distinct value
df_preprocessed = pd.get_dummies(
    data=df,
    columns=['Industry', 'Purpose of Funding', 'Preferred Funding Type'],
)

# Preview the encoded frame
print(df_preprocessed.head())


               Revenue Projections  revenue_projections_numeric
0  $491 million (next fiscal year)                        491.0
1  $438 million (next fiscal year)                        438.0
2  $315 million (next fiscal year)                        315.0
3  $447 million (next fiscal year)                        447.0
4  $433 million (next fiscal year)                        433.0
                 Company Name                                      Location  \
0             Johnson-Sanchez  East Collintown, South Carolina, Puerto Rico   
1             Richardson-Pugh             Lake Jennifer, Wisconsin, Armenia   
2  Conrad, Mejia and Ferguson        Hallport, Pennsylvania, Western Sahara   
3                   Jones Ltd                   West Amyview, Ohio, Moldova   
4                  Roy-Miller                 South Margaret, Indiana, Iraq   

   Size (Number of employees) Size (Revenue)  \
0                         727     $4 million   
1                        7454    $14 million 

In [None]:
import pandas as pd
import re

# Re-read the raw company file; this rebinds `df`, so columns added to the
# previous `df` (e.g. 'revenue_projections_numeric') are not carried over.
df = pd.read_csv(filepath_or_buffer="company_data.csv")

# Function to extract numeric value from 'it_returns' column
def extract_year(it_returns_str):
    """Extract the first run of four digits from an 'IT returns' entry.

    Args:
        it_returns_str: Text such as ``"Filed for FY 2019"``. Non-string input
            (for example the float NaN that stands in for an empty cell) is
            treated as "no value".

    Returns:
        The four digits as a float (``2019.0``), or None when the text holds
        no four-digit run.
    """
    # re.search raises TypeError on non-strings, so bail out early for them.
    if not isinstance(it_returns_str, str):
        return None

    match = re.search(r'(\d{4})', it_returns_str)
    if match is None:
        return None
    return float(match.group(1))

# Parse the year out of the 'IT returns' text into a new 'it_returns_year' column
df['it_returns_year'] = df['IT returns'].apply(extract_year)

# Show the source text next to the parsed year as a sanity check
print(df.loc[:, ['IT returns', 'it_returns_year']].head())


          IT returns  it_returns_year
0  Filed for FY 2019           2019.0
1  Filed for FY 2020           2020.0
2  Filed for FY 2021           2021.0
3  Filed for FY 2021           2021.0
4  Filed for FY 2022           2022.0


Step 5: Saving the Preprocessed Data

In [None]:
# `df_preprocessed` was built before 'it_returns_year' was derived (the frame
# was re-read from disk in between), so carry that column over first;
# otherwise the extracted year never reaches the output file.
if 'it_returns_year' in df.columns and 'it_returns_year' not in df_preprocessed.columns:
    # Both frames were read from the same CSV with the default index, so the
    # assignment aligns row for row.
    df_preprocessed = df_preprocessed.assign(it_returns_year=df['it_returns_year'])

# Write the preprocessed data to a new CSV file (the index is not written).
# NOTE(review): `df_cleaned` from the cleaning step does not feed into this
# frame - confirm that is intended.
df_preprocessed.to_csv('company_details_cleaned.csv', index=False)


Step 6: Loading the Lenders Dataset


In [None]:
import pandas as pd

# Load the lender records; this rebinds `df`, replacing the company frame
df = pd.read_csv(filepath_or_buffer='lender_data.csv')

# Leave the preview as the last expression so the notebook renders it
df.head(n=5)


Unnamed: 0,Lender Name,Type,Location,Preferred Industries,Preferred Loan Size Min,Preferred Loan Size Max,Loan Term Min,Loan Term Max,Risk Appetite,Lending History,Portfolio Details,Proof of Funds,Regulatory Compliance Documents,Wallet Address
0,Carol Bush,Institutional,"Chanfurt, Rhode Island, Sudan","Healthcare, Healthcare, Finance",235983,530234,4,8,Medium,"Lent to Small-Randolph, Wright Inc Ltd.","Currently lending to Woods Group Corp., Rodrig...",Available,Compliant,0xf57a95a0e390f1327831ef8739a5521b479908a8
1,Rebecca Long,Institutional,"Ronaldchester, Pennsylvania, Micronesia","Renewable Energy, Finance, Real Estate",287856,853680,2,6,Low,"Lent to Duncan PLC, Gentry-Davis Ltd.","Currently lending to Henry and Sons Corp., Han...",Available,Compliant,0x0994945718ca2dd708ef84d910273741b910cddb
2,Lance Atkinson,Individual,"Port Krista, Arkansas, Saint Lucia","Finance, Renewable Energy, Healthcare",144949,760298,5,4,High,"Lent to Costa Ltd, Simpson-Ferrell Ltd.","Currently lending to Owens, Spencer and Ball C...",Available,Compliant,0x1f8c61fa4c37dc72d1971949b05e9bf6f45beffc
3,Marissa Brown,Individual,"Kennethside, Arkansas, Paraguay","Technology, Renewable Energy, Finance",115982,531924,10,9,High,"Lent to Parker LLC, Morgan-Tyler Ltd.","Currently lending to Collins Ltd Corp., Mitche...",Available,Compliant,0xd86f9484ad56091f7ddc6389deae3972664953db
4,William Gonzalez,Institutional,"West Jason, South Carolina, Solomon Islands","Finance, Real Estate, Renewable Energy",431226,782967,10,6,High,"Lent to Patterson, Carter and Lopez, Sutton-Ta...","Currently lending to Fletcher-Brown Corp., Bar...",Available,Compliant,0x65264b7fc2a04e651756c1234670e6e7668429dd


Step 7: Data Cleaning

In [None]:
# Count the null entries per lender column and print the totals
missing_values = df.isna().sum()
print(missing_values)


Lender Name                        0
Type                               0
Location                           0
Preferred Industries               0
Preferred Loan Size Min            0
Preferred Loan Size Max            0
Loan Term Min                      0
Loan Term Max                      0
Risk Appetite                      0
Lending History                    0
Portfolio Details                  0
Proof of Funds                     0
Regulatory Compliance Documents    0
Wallet Address                     0
dtype: int64


In [None]:
# Keep only the lender rows in which every column has a value.
# NOTE(review): the next preprocessing cell re-reads the CSV instead of using
# `df_cleaned` - confirm whether the cleaned frame was meant to be used.
df_cleaned = df.dropna(axis=0, how='any')

# Alternative: keep every row and substitute a placeholder for the gaps
# df_cleaned = df.fillna('N/A')


In [None]:
# Discard lender rows that exactly repeat an earlier row (first occurrence kept)
df_cleaned = df_cleaned.drop_duplicates(keep='first')


Step 8: Data Preprocessing

In [None]:
import pandas as pd

# Re-read the raw lender file (rebinds `df`)
df = pd.read_csv('lender_data.csv')

# Coerce the loan-size and loan-term bounds to numeric dtypes; entries that
# cannot be parsed become NaN instead of raising.
# NOTE(review): some previewed rows have 'Loan Term Min' greater than
# 'Loan Term Max' (e.g. 5 vs 4, 10 vs 9) - confirm whether that is valid data.
for column in (
    'Preferred Loan Size Min',
    'Preferred Loan Size Max',
    'Loan Term Min',
    'Loan Term Max',
):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Preview the result
print(df.head())

# Persist the coerced frame for the next step (index not written)
df.to_csv('lender_data_cleaned.csv', index=False)


        Lender Name           Type  \
0        Carol Bush  Institutional   
1      Rebecca Long  Institutional   
2    Lance Atkinson     Individual   
3     Marissa Brown     Individual   
4  William Gonzalez  Institutional   

                                      Location  \
0                Chanfurt, Rhode Island, Sudan   
1      Ronaldchester, Pennsylvania, Micronesia   
2           Port Krista, Arkansas, Saint Lucia   
3              Kennethside, Arkansas, Paraguay   
4  West Jason, South Carolina, Solomon Islands   

                     Preferred Industries  Preferred Loan Size Min  \
0         Healthcare, Healthcare, Finance                   235983   
1  Renewable Energy, Finance, Real Estate                   287856   
2   Finance, Renewable Energy, Healthcare                   144949   
3   Technology, Renewable Energy, Finance                   115982   
4  Finance, Real Estate, Renewable Energy                   431226   

   Preferred Loan Size Max  Loan Term Min  Loan T

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read back the file written by the numeric-coercion step
df = pd.read_csv('lender_data_cleaned.csv')

# One-hot encode the two categorical lender attributes
df_preprocessed = pd.get_dummies(data=df, columns=['Type', 'Risk Appetite'])

# Columns to standardise
numerical_cols = [
    'Preferred Loan Size Min',
    'Preferred Loan Size Max',
    'Loan Term Min',
    'Loan Term Max',
]

# Fit the scaler on those columns and overwrite them with the scaled values
scaler = StandardScaler()
df_preprocessed[numerical_cols] = scaler.fit_transform(df_preprocessed[numerical_cols])

# Split off a 'target' column when one exists; otherwise every column is a
# feature and `y` is left undefined
if 'target' not in df_preprocessed.columns:
    X = df_preprocessed
else:
    y = df_preprocessed['target']
    X = df_preprocessed.drop(columns='target')

# Preview the feature frame
print(X.head())

# Persist the feature frame (index not written)
X.to_csv('lender_data_preprocessed.csv', index=False)

# Report how many feature columns there are, then their names
feature_names = X.columns.tolist()
print(len(feature_names))
print(feature_names)


        Lender Name                                     Location  \
0        Carol Bush                Chanfurt, Rhode Island, Sudan   
1      Rebecca Long      Ronaldchester, Pennsylvania, Micronesia   
2    Lance Atkinson           Port Krista, Arkansas, Saint Lucia   
3     Marissa Brown              Kennethside, Arkansas, Paraguay   
4  William Gonzalez  West Jason, South Carolina, Solomon Islands   

                     Preferred Industries  Preferred Loan Size Min  \
0         Healthcare, Healthcare, Finance                -0.552547   
1  Renewable Energy, Finance, Real Estate                -0.100575   
2   Finance, Renewable Energy, Healthcare                -1.345730   
3   Technology, Renewable Energy, Finance                -1.598121   
4  Finance, Real Estate, Renewable Energy                 1.148614   

   Preferred Loan Size Max  Loan Term Min  Loan Term Max  \
0                -1.516751      -0.510745       0.868891   
1                 0.736265      -1.208055       0.

Step 9: Extracting the Required Features from the SME and Lender Datasets

In [None]:
pip install pandas scikit-learn



In [None]:
!pip install pandas scikit-learn surprise


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357233 sha256=dbbd216678e1dfb6a4917babc22882d8467894a2fce61a954e9f8eb189d89194
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [None]:
import pandas as pd

# Read both raw datasets, each under its own name
company_data = pd.read_csv(filepath_or_buffer='company_data.csv')  # Uploaded file
lender_data = pd.read_csv(filepath_or_buffer='lender_data.csv')  # Uploaded file

# List the column labels of each dataset
print("Company Data Columns:", company_data.columns.to_list())
print("Lender Data Columns:", lender_data.columns.to_list())


Company Data Columns: ['Company Name', 'Industry', 'Location', 'Size (Number of employees)', 'Size (Revenue)', 'Wallet Address (Public key)', 'GSTN (GST no.)', 'IT returns', 'Aadhar no.', 'PAN no.', 'Amount of Funding Needed', 'Purpose of Funding', 'Preferred Funding Type', 'Mission Statement', 'Products/Services', 'Objectives', 'Key Achievements', 'Industry Overview', 'Target Market', 'Market Needs', 'Market Trends', 'Competitive Analysis', 'Revenue Streams', 'Cost Structure', 'Key Partnerships', 'Distribution Channels', 'Customer Segments', 'Revenue Projections', 'Profit Projections', 'Customer Growth', 'Market Share Projections', 'Expansion Plans', 'Company Registration Number', 'Country of Incorporation', 'Tax Identification Number', 'Patent/Trademark Number', 'Type']
Lender Data Columns: ['Lender Name', 'Type', 'Location', 'Preferred Industries', 'Preferred Loan Size Min', 'Preferred Loan Size Max', 'Loan Term Min', 'Loan Term Max', 'Risk Appetite', 'Lending History', 'Portfolio D

In [None]:
# Fields the matching step looks for on the SME side (order is preserved)
sme_required_fields = [
    # identity
    'Company Name',
    'Company Registration Number',
    'Country of Incorporation',
    # narrative / market
    'Executive Summary',
    'Industry Overview',
    'Target Market',
    'Market Needs',
    'Market Trends',
    'Competitive Analysis',
    'Business Model',
    'Growth Projections',
    # financials
    'Balance Sheet',
    'Income Statement',
    'Credit Score',
    'Funding History',
    'Current Debts',
    # legal / payment
    'Company Registration Details',
    'Tax Identification Number',
    'Patents/Trademarks',
    'Wallet Address',
]

# Fields the matching step looks for on the investor side (order is preserved)
investor_required_fields = [
    # identity
    'Investor Name',
    'Type',
    'Location',
    # preferences
    'Preferred Industries',
    'Preferred Investment Size',
    'Investment Horizon',
    'Risk Appetite',
    # track record / compliance / payment
    'Investment History',
    'Portfolio Details',
    'Proof of Funds',
    'Regulatory Compliance Documents',
    'Wallet Address',
]


In [None]:
# Column labels actually present in each dataset
company_data_columns = company_data.columns.tolist()
lender_data_columns = lender_data.columns.tolist()

# Required fields that exist as columns (exact, case-sensitive label match)
sme_matched_fields = [field for field in sme_required_fields if field in company_data_columns]
investor_matched_fields = [field for field in investor_required_fields if field in lender_data_columns]

# Required fields with no matching column. These used to be dropped silently;
# report them so that near-miss labels are visible to the reader.
sme_missing_fields = [field for field in sme_required_fields if field not in company_data_columns]
investor_missing_fields = [field for field in investor_required_fields if field not in lender_data_columns]

print("Matched SME Fields:", sme_matched_fields)
print("Matched Investor Fields:", investor_matched_fields)
print("Missing SME Fields:", sme_missing_fields)
print("Missing Investor Fields:", investor_missing_fields)

# Keep only the matched columns from each dataset
sme_df = company_data[sme_matched_fields]
investor_df = lender_data[investor_matched_fields]

print("Extracted SME Data:", sme_df.head())
print("Extracted Investor Data:", investor_df.head())


Matched SME Fields: ['Company Name', 'Company Registration Number', 'Country of Incorporation', 'Industry Overview', 'Target Market', 'Market Needs', 'Market Trends', 'Competitive Analysis', 'Tax Identification Number']
Matched Investor Fields: ['Type', 'Location', 'Preferred Industries', 'Risk Appetite', 'Portfolio Details', 'Proof of Funds', 'Regulatory Compliance Documents', 'Wallet Address']
Extracted SME Data:                  Company Name  Company Registration Number  \
0             Johnson-Sanchez                   5084980374   
1             Richardson-Pugh                   2573193475   
2  Conrad, Mejia and Ferguson                   5818692525   
3                   Jones Ltd                   7757074578   
4                  Roy-Miller                   2918684175   

  Country of Incorporation                                  Industry Overview  \
0                  Ecuador  About drop think certainly rise. Five myself c...   
1                  Germany  Rich take need lot

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Three hand-written SME records, one list per column
smes_data = {
    'Company Name':                ['SME1', 'SME2', 'SME3'],
    'Company Registration Number': ['123', '456', '789'],
    'Country of Incorporation':    ['USA', 'Canada', 'UK'],
    'Industry Overview':           ['Technology', 'Healthcare', 'Finance'],
    'Target Market':               ['Global', 'North America', 'Europe'],
    'Market Needs':                ['Innovation', 'Funding', 'Growth'],
    'Market Trends':               ['AI', 'Telemedicine', 'Fintech'],
    'Competitive Analysis':        ['High', 'Medium', 'Low'],
    'Tax Identification Number':   ['TIN123', 'TIN456', 'TIN789'],
}

# Three hand-written investor records, one list per column
investors_data = {
    'Type':                            ['Venture Capital', 'Angel Investor', 'Private Equity'],
    'Location':                        ['USA', 'Canada', 'UK'],
    'Preferred Industries':            ['Technology', 'Healthcare', 'Finance'],
    'Risk Appetite':                   ['High', 'Medium', 'Low'],
    'Portfolio Details':               ['Tech Startups', 'Health Startups', 'Financial Startups'],
    'Proof of Funds':                  ['Yes', 'Yes', 'Yes'],
    'Regulatory Compliance Documents': ['Complete', 'Complete', 'Complete'],
    'Wallet Address':                  ['ADDR123', 'ADDR456', 'ADDR789'],
}

# Build one DataFrame per side from the column dictionaries.
# NOTE(review): these toy frames, not `sme_df` / `investor_df`, are what the
# following cells operate on - confirm that is intended.
smes = pd.DataFrame(data=smes_data)
investors = pd.DataFrame(data=investors_data)

# Print both frames, SMEs first
print(smes, investors, sep="\n")


  Company Name Company Registration Number Country of Incorporation  \
0         SME1                         123                      USA   
1         SME2                         456                   Canada   
2         SME3                         789                       UK   

  Industry Overview  Target Market Market Needs Market Trends  \
0        Technology         Global   Innovation            AI   
1        Healthcare  North America      Funding  Telemedicine   
2           Finance         Europe       Growth       Fintech   

  Competitive Analysis Tax Identification Number  
0                 High                    TIN123  
1               Medium                    TIN456  
2                  Low                    TIN789  
              Type Location Preferred Industries Risk Appetite  \
0  Venture Capital      USA           Technology          High   
1   Angel Investor   Canada           Healthcare        Medium   
2   Private Equity       UK              Finance    

In [None]:
import pickle

Step 11: Training the model using Content-Based Filtering

In [None]:
# Build one space-separated text per SME from its descriptive columns
smes['combined_info'] = smes.apply(
    lambda row: ' '.join(
        str(row[col])
        for col in ('Industry Overview', 'Target Market', 'Market Needs',
                    'Market Trends', 'Competitive Analysis')
    ),
    axis=1,
)

# Same for each investor, using its preference and compliance columns
investors['combined_info'] = investors.apply(
    lambda row: ' '.join(
        str(row[col])
        for col in ('Preferred Industries', 'Risk Appetite', 'Portfolio Details',
                    'Proof of Funds', 'Regulatory Compliance Documents')
    ),
    axis=1,
)

# TF-IDF: the vocabulary is fitted on the SME texts only; the investor texts
# are then transformed with that fitted vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
smes_tfidf = vectorizer.fit_transform(smes['combined_info'])
investors_tfidf = vectorizer.transform(investors['combined_info'])

# Rows index SMEs, columns index investors
similarity_matrix = cosine_similarity(smes_tfidf, investors_tfidf)

# Write the similarity matrix and both frames into a single pickle
with open('model.pkl', 'wb') as model_file:
    pickle.dump(
        {'similarity_matrix': similarity_matrix, 'smes': smes, 'investors': investors},
        model_file,
    )

# Write the fitted vectorizer into its own pickle
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Read both artifacts straight back.
# NOTE(review): pickle.load executes code embedded in the file; only load
# pickles this notebook (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Rebind the working names to the objects that came out of the pickle
similarity_matrix, smes, investors = (
    model[key] for key in ('similarity_matrix', 'smes', 'investors')
)

# Function to get top N investor recommendations for a given SME
def get_top_investor_recommendations(sme_index, num_recommendations=2):
    """Return the investors most similar to one SME, best match first.

    Reads the module-level ``similarity_matrix`` and ``investors`` objects.

    Args:
        sme_index: Row position of the SME in ``similarity_matrix``.
        num_recommendations: Maximum number of investors to return.

    Returns:
        A list of rows taken from ``investors`` with ``.iloc``, ordered by
        descending similarity score.
    """
    scores = similarity_matrix[sme_index]
    # Rank investor positions by score, highest first (the sort is stable, so
    # ties keep their original order).
    ranked_positions = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)
    return [investors.iloc[pos] for pos in ranked_positions[:num_recommendations]]

# Demo: show the two best-matching investors for the SME at position 0
recommendations = get_top_investor_recommendations(sme_index=0, num_recommendations=2)
for idx, investor in enumerate(recommendations):
    # Header, the investor record, then a blank-line separator
    print(f"Recommendation {idx + 1}:", investor, "\n", sep="\n")

Recommendation 1:
Type                                                          Venture Capital
Location                                                                  USA
Preferred Industries                                               Technology
Risk Appetite                                                            High
Portfolio Details                                               Tech Startups
Proof of Funds                                                            Yes
Regulatory Compliance Documents                                      Complete
Wallet Address                                                        ADDR123
combined_info                      Technology High Tech Startups Yes Complete
Name: 0, dtype: object


Recommendation 2:
Type                                                               Angel Investor
Location                                                                   Canada
Preferred Industries                                                   He