In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import nltk


nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\one_w\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\one_w\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load datasets
investors_df = pd.read_csv('cleaned_openvc.csv')
ideas_df = pd.read_csv('cleaned_ideas.csv')

# Fill NaN values with an empty string
ideas_df['idea_description'] = ideas_df['idea_description'].fillna("")
investors_df['Investment thesis'] = investors_df['Investment thesis'].fillna("")

# Preprocess the text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = re.sub(r'\W+', ' ', text)  # Remove punctuation
    words = text.lower().split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatize and remove stopwords
    return words

investment_sentences = investors_df['Investment thesis'].apply(preprocess_text).tolist()
idea_sentences = ideas_df['idea_description'].apply(preprocess_text).tolist()


In [3]:
all_sentences = investment_sentences + idea_sentences
model = Word2Vec(sentences=all_sentences, vector_size=200, window=10, min_count=5, workers=4)

In [4]:
def compute_text_embedding(text, model):
    words = preprocess_text(text)
    valid_words = [model.wv[word] for word in words if word in model.wv]
    if valid_words:
        return np.mean(valid_words, axis=0)
    else:
        return np.zeros(model.vector_size)

investors_df['embedding'] = investors_df['Investment thesis'].apply(lambda x: compute_text_embedding(x, model))
ideas_df['embedding'] = ideas_df['idea_description'].apply(lambda x: compute_text_embedding(x, model))


In [5]:
def encode_and_combine(df, columns):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded = encoder.fit_transform(df[columns])
    return pd.DataFrame(encoded, index=df.index)

investors_categoricals = encode_and_combine(investors_df, ['Countries of investment', 'Stage of investment', 'Investor type'])
ideas_categoricals = encode_and_combine(ideas_df, ['user_last_ip_country_name'])

In [6]:
investors_features = pd.concat([investors_categoricals, investors_df['embedding'].apply(pd.Series)], axis=1)
ideas_features = pd.concat([ideas_categoricals, ideas_df['embedding'].apply(pd.Series)], axis=1)

# Align feature dimensions and scale
investors_features_aligned, ideas_features_aligned = investors_features.align(ideas_features, fill_value=0, axis=1)
scaler = StandardScaler()
investors_features_scaled = scaler.fit_transform(investors_features_aligned)
ideas_features_scaled = scaler.transform(ideas_features_aligned)

In [7]:
cosine_similarities = cosine_similarity(ideas_features_scaled, investors_features_scaled)
cosine_similarity_df = pd.DataFrame(cosine_similarities, columns=investors_df['Investor name'], index=ideas_df['idea_description'])
print(cosine_similarity_df)

Investor name                                       3TS Capital Partners  \
idea_description                                                           
An organizer system that you install in handbag...              0.010556   
A business that does balloon animals for corpor...              0.014371   
                                                                0.100021   
I'm building a global innovation platform that ...             -0.001779   
sharikat tasnae eilajat lilkilab khaliatan min ...              0.021133   
A company that makes an app to stop food waste ...             -0.002464   
a business for tourists that uses a QR code to ...              0.002383   
A lemonade stand outside event venues to sell t...              0.018985   
A mems small satellite propulsion system for ma...             -0.000328   
BlockLenses are UV blocking daily-wear contact ...              0.021778   
Mainstream air transportation with low carbon e...              0.025294   
Digital mark

In [8]:
interaction_data_melted = cosine_similarity_df.reset_index().melt(id_vars='idea_description', 
                                                                  var_name='Investor name', 
                                                                  value_name='cosine_similarity')

reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(interaction_data_melted[['idea_description', 'Investor name', 'cosine_similarity']], reader)

trainset, testset = train_test_split(data, test_size=0.2)
model = SVD()
model.fit(trainset)
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse}")

RMSE: 0.0852
RMSE: 0.08524080602383861


In [9]:
def get_top_n_recommendations(predictions, n=5):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

top_n_recommendations = get_top_n_recommendations(predictions, n=5)
for idea, recommendations in top_n_recommendations.items():
    print(f"Recommended investors for the idea: '{idea}'")
    for investor, rating in recommendations:
        print(f"  - {investor} (predicted similarity: {rating:.2f})")
    print()



Recommended investors for the idea: 'I'm building a global innovation platform that allows entrepreneurs to build and launch their startups quickly. Using an AI-powered backend, founders can use a single sentence to create an enhanced lean canvas, validation roadmap, branding wheel, and more in about 90 seconds. We will then score the startup ideas and teams and connect the best ones to curated mentors and investors.'
  - Nova Growth Capital (predicted similarity: 0.10)
  - Coyote Ventures (predicted similarity: 0.08)
  - Treble Capital (predicted similarity: 0.08)
  - Lightship Capital (predicted similarity: 0.08)
  - 44 Capital Management (predicted similarity: 0.08)

Recommended investors for the idea: 'A cross-border payment processor for money transfers.'
  - Sheesha Finance (predicted similarity: 0.10)
  - Martlet Capital (predicted similarity: 0.09)
  - Frontier Ventures (predicted similarity: 0.09)
  - Zen Ventures (predicted similarity: 0.07)
  - Lair East Labs (predicted simi

In [10]:
from sklearn.cluster import MiniBatchKMeans

# Define number of clusters
n_clusters = 10

# Cluster investors and ideas
kmeans_investors = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
investors_clusters = kmeans_investors.fit_predict(investors_features_scaled)

kmeans_ideas = MiniBatchKMeans(n_clusters=n_clusters, random_state=42)
ideas_clusters = kmeans_ideas.fit_predict(ideas_features_scaled)

# Add cluster labels to DataFrames
investors_df['cluster'] = investors_clusters
ideas_df['cluster'] = ideas_clusters

# Recommend VCs within the same cluster as the idea
def recommend_vcs_within_cluster(idea_index, investors_df, ideas_df, n=5):
    idea_cluster = ideas_df.iloc[idea_index]['cluster']
    same_cluster_vcs = investors_df[investors_df['cluster'] == idea_cluster]
    
    # Rank VCs within the same cluster by similarity score
    similarities = cosine_similarities[idea_index, same_cluster_vcs.index]
    sorted_indices = np.argsort(similarities)[::-1][:n]
    return same_cluster_vcs.iloc[sorted_indices]['Investor name'].tolist()

# Example: Recommendations for the first idea
idea_index = 0
recommended_vcs = recommend_vcs_within_cluster(idea_index, investors_df, ideas_df, n=5)
print(f"Recommended VCs for idea '{ideas_df.iloc[idea_index]['idea_description']}':")
print(recommended_vcs)


Recommended VCs for idea 'An organizer system that you install in handbags to make it easier and faster to find things.':
['Precursor Ventures']
