In [1]:
import os
import sys
import re
import json
from ollama import chat
from ollama import ChatResponse

# Locate the directory this code runs from. A notebook kernel defines no
# __file__, so fall back to the current working directory in that case.
if '__file__' in globals():
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# The project root is the parent of that directory.
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))

# Spreadsheet read by the data-loading cell.
data_file = os.path.join(project_root, 'data', 'crunchbase_data.xlsx')

# Put the project root at the front of the import path, but only once, so
# re-running this cell does not add duplicate entries.
if not sys.path.count(project_root):
    sys.path.insert(0, project_root)

print("current working directory:", os.getcwd())
print("project root:", project_root)

from API_wrapper.OpenAI_API import OpenAI_API
from utils.utils import get_prompt, extract_json_from_llm_response

import pandas as pd


current working directory: d:\Study\Level4\grad project\VC-management-system\EDA
project root: d:\Study\Level4\grad project\VC-management-system


In [2]:
# Load the Crunchbase spreadsheet located at `data_file` into a DataFrame.
df = pd.read_excel(data_file)

In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1595 entries, 0 to 1594
Data columns (total 57 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   index                            1595 non-null   int64  
 1   co-founder1                      1595 non-null   object 
 2   co-founder2                      832 non-null    object 
 3   organization name                1595 non-null   object 
 4   country                          1595 non-null   object 
 5   google_search_query_founder1     1595 non-null   object 
 6   google_search_query_founder2     832 non-null    object 
 7   co-founder1 linkedin profile     1595 non-null   object 
 8   co-founder2 linkedin profile     3 non-null      object 
 9   co-founder1 cv                   1595 non-null   object 
 10  key                              1595 non-null   object 
 11  Unnamed: 0                       1595 non-null   int64  
 12  cb rank (company)   

In [4]:
def ask_deepseek(input_content, system_prompt, deep_think = True, print_log = True, model = 'deepseek-r1:8b'):
    """Send one system + user exchange to an Ollama chat model and split off its reasoning.

    Text wrapped in ``<think>...</think>`` tags is separated from the rest of
    the reply.

    Args:
        input_content: Text sent as the ``user`` message.
        system_prompt: Text sent as the ``system`` message.
        deep_think: When True (default) return ``(answer, reasoning)``;
            when False return only the answer string.
        print_log: When True (default) print the raw, unmodified reply.
        model: Ollama model tag to query. Defaults to ``'deepseek-r1:8b'``,
            the model this function used before the parameter existed.

    Returns:
        The reply with every ``<think>`` block removed and surrounding
        whitespace stripped. With ``deep_think=True`` this is the first item
        of a tuple whose second item is the content of all ``<think>`` blocks
        joined by blank lines.
    """
    response: ChatResponse = chat(model=model, messages=[
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': input_content},
    ])
    # Treat a missing/None content as empty text so the regex calls below
    # cannot fail with a TypeError.
    response_text = response['message']['content'] or ''
    if print_log:
        print(response_text)

    # DOTALL lets a reasoning block span several lines; the non-greedy body
    # keeps separate <think> blocks from being merged into one match.
    think_pattern = re.compile(r'<think>(.*?)</think>', flags=re.DOTALL)
    think_texts = "\n\n".join(think_pattern.findall(response_text)).strip()
    clean_response = think_pattern.sub('', response_text).strip()

    if deep_think:
        return clean_response, think_texts
    return clean_response

In [5]:
def _cell_text(value):
    """Return a cell value as text, mapping missing values (None/NaN) to ''."""
    return '' if pd.isnull(value) else str(value)


def _join_list_field(value):
    """Flatten a list, or a string holding a JSON list, into comma-separated text."""
    if isinstance(value, str):
        stripped = value.strip()
        if stripped.startswith('[') and stripped.endswith(']'):
            try:
                decoded = json.loads(stripped)
            except json.JSONDecodeError:
                decoded = None  # looked like a list but is not valid JSON; keep the raw string
            if isinstance(decoded, list):
                value = decoded
    if isinstance(value, list):
        return ', '.join(map(str, value))
    return str(value) if value is not None else None


def annotate_row(row):
    """Score one company row for founder-idea fit using the LLM.

    Builds the "founder_idea_fit_score" prompt from the row's organization
    description, its ``industry1``..``industry5`` values and the first
    co-founder's CV, sends it through ``ask_deepseek`` and flattens the JSON
    reply into plain column values.

    Args:
        row: A mapping-like row (e.g. a pandas Series) that contains
            'organization description' and 'co-founder1 cv', and optionally
            'organization full description' and 'industry1'..'industry5'.

    Returns:
        A dict with the keys 'founder_idea_fit_score',
        'founder_idea_fit_score_explanation', 'breakdown_domain_expertise',
        'breakdown_execution_capability', 'breakdown_resilience_traction',
        'strengths', 'critical_gaps' and 'recommendations', or None when no
        JSON object could be extracted from the reply.

    Raises:
        KeyError: If 'co-founder1 cv' is absent, or if the full description
            is missing/null and 'organization description' is absent.
    """
    # Prefer the full description; fall back to the short one.
    full_desc = row.get('organization full description')
    if pd.notnull(full_desc):
        org_desc = str(full_desc)
    else:
        # A missing description becomes '' instead of the literal text "nan".
        org_desc = _cell_text(row['organization description'])

    industries = []
    for i in range(1, 6):
        industry = row.get(f'industry{i}')
        if pd.notnull(industry) and str(industry).strip():
            industries.append(str(industry).strip())
    org_industries = ', '.join(industries)

    founder_cv = _cell_text(row['co-founder1 cv'])

    prompt = get_prompt(
        "founder_idea_fit_score",
        organization_description=org_desc,
        organization_industries=org_industries,
        founder_cv=founder_cv
    )

    response_text = ask_deepseek(
        input_content=prompt,
        system_prompt="",
        deep_think=False,
        print_log=False
    )
    print(response_text)

    parsed = extract_json_from_llm_response(response_text)
    # Anything other than a JSON object (None, a list, a bare string, ...)
    # cannot be mapped onto the output columns.
    if not isinstance(parsed, dict):
        print("No valid JSON response found.")
        return None

    # The model may emit `"breakdown": null` or a non-object value; treat
    # that as "no breakdown" instead of failing on `.get`.
    breakdown = parsed.get('breakdown')
    if not isinstance(breakdown, dict):
        breakdown = {}

    output = {
        'founder_idea_fit_score': parsed.get('score'),
        'founder_idea_fit_score_explanation': parsed.get('explanation'),
        'breakdown_domain_expertise': breakdown.get('domain_expertise'),
        'breakdown_execution_capability': breakdown.get('execution_capability'),
        'breakdown_resilience_traction': breakdown.get('resilience_traction'),
    }

    for field in ['strengths', 'critical_gaps', 'recommendations']:
        output[field] = _join_list_field(parsed.get(field))

    return output

In [None]:
# Columns that annotate_row() fills in for each row.
output_columns = [
    'founder_idea_fit_score',
    'founder_idea_fit_score_explanation',
    'breakdown_domain_expertise',
    'breakdown_execution_capability',
    'breakdown_resilience_traction',
    'strengths',
    'critical_gaps',
    'recommendations'
]

CHECKPOINT_PATH = 'checkpoint_fids.xlsx'
FINAL_PATH = 'final_result_fids.xlsx'
CHECKPOINT_EVERY = 50  # write a checkpoint whenever the row index is a multiple of this

# Create the output columns once; columns that already exist are left alone
# so that re-running this cell keeps earlier results.
for col in output_columns:
    if col not in df.columns:
        df[col] = None

try:
    for idx, row in df.iterrows():
        # A non-null score marks a row as done, so re-running the cell only
        # processes the rows that are still missing one.
        if pd.notnull(row['founder_idea_fit_score']):
            print(f"Row {idx} already processed, skipping.")
            continue

        print(f"Processing row {idx}...")
        result = annotate_row(row)

        # A None result (no usable JSON) leaves the row empty, so it is
        # attempted again on the next run.
        if result is not None:
            for key in result:
                df.at[idx, key] = result[key]

        if idx % CHECKPOINT_EVERY == 0 and idx != 0:
            df.to_excel(CHECKPOINT_PATH, index=False)
            print(f"Checkpoint saved at row {idx}")
except BaseException:
    # Also covers KeyboardInterrupt: persist the rows annotated since the
    # last checkpoint, then re-raise so the failure stays visible.
    df.to_excel(CHECKPOINT_PATH, index=False)
    print(f"Run interrupted; progress saved to '{CHECKPOINT_PATH}'")
    raise

# Save final
df.to_excel(FINAL_PATH, index=False)
print(f"Done! Final file saved as '{FINAL_PATH}'")


Processing row 0...


In [None]:
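# Unused alternative: annotate every row in a single df.apply pass instead of the checkpointed loop.
# annotated = df.apply(annotate_row, axis=1)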

# annotated_df = pd.json_normalize(annotated)

# df = pd.concat([df, annotated_df], axis=1)

Processing row: 0
```json
{
  "score": 7,
  "explanation": "The founder demonstrates significant domain expertise in semiconductors with over two decades of experience. Execution capability is moderate as they can manage teams but may need support for technical gaps. Resilience and traction are emerging, showing potential but not yet proven at scale.",
  "breakdown": {
    "domain_expertise": 4,
    "execution_capability": 2,
    "resilience_traction": 2
  },
  "strengths": [
    "Deep expertise in semiconductor ICs and mixed-signal design",
    "Proven leadership in managing engineering teams"
  ],
  "critical_gaps": [
    "Potential need for complementary technical skills",
    "Limited time as CEO may affect traction building"
  ],
  "recommendations": [
    "Recruit experts in software development and operations",
    "Expand network and partnerships to accelerate growth",
    "Focus on building a strong, reliable technical team"
  ]
}
```


In [None]:
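# Alternative variant of annotate_row that calls OpenAI_API instead of ask_deepseek; commented out and not executed.
# def annotate_row(row):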
#     print("Processing row:", row.name)

#     if pd.notnull(row.get('organization full description')):
#         org_desc = str(row['organization full description'])
#     else:
#         org_desc = str(row['organization description'])

#     industries = [
#         str(row[f'industry{i}']).strip()
#         for i in range(1, 6)
#         if f'industry{i}' in row and pd.notnull(row[f'industry{i}']) and str(row[f'industry{i}']).strip()
#     ]
#     org_industries = ', '.join(industries)
#     founder_cv = str(row['co-founder1 cv'])

#     prompt = get_prompt(
#         "founder_idea_fit_score",
#         organization_description=org_desc,
#         organization_industries=org_industries,
#         founder_cv=founder_cv
#     )

#     llm = OpenAI_API()
#     response = llm.get_response(system_content="", user_content=prompt)
#     result_json = response.json()["choices"][0]["message"]["content"]

#     try:
#         parsed = json.loads(result_json)
#     except json.JSONDecodeError:
#         parsed = {"raw_response": result_json}

#     output = {
#         'founder_idea_fit_score': parsed.get('score'),
#         'founder_idea_fit_score_explanation': parsed.get('explanation'),
#         'breakdown_domain_expertise': parsed.get('breakdown', {}).get('domain_expertise'),
#         'breakdown_execution_capability': parsed.get('breakdown', {}).get('execution_capability'),
#         'breakdown_resilience_traction': parsed.get('breakdown', {}).get('resilience_traction'),
#     }

#     for field in ['strengths', 'critical_gaps', 'recommendations']:
#         value = parsed.get(field)

#         if isinstance(value, str) and value.strip().startswith('[') and value.strip().endswith(']'):
#             try:
#                 parsed_list = json.loads(value)
#                 if isinstance(parsed_list, list):
#                     value = parsed_list
#             except json.JSONDecodeError:
#                 pass

#         if isinstance(value, list):
#             output[field] = ', '.join(map(str, value))
#         else:
#             output[field] = str(value) if value is not None else None

#     return output

In [42]:
# Preview the first rows of df, including the annotation columns, as a spot check.
df.head()

Unnamed: 0,index,co-founder1,co-founder2,organization name,country,google_search_query_founder1,google_search_query_founder2,co-founder1 linkedin profile,co-founder2 linkedin profile,co-founder1 cv,...,organization twitter,organization website,founder_idea_fit_score,founder_idea_fit_score_explanation,breakdown_domain_expertise,breakdown_execution_capability,breakdown_resilience_traction,strengths,critical_gaps,recommendations
0,0,Ahmed Aboulella,,InfiniLink,Egypt,"site:linkedin.com/in Ahmed Aboulella ""InfiniLi...",,https://eg.linkedin.com/in/ahmedfaboulella,,Headline:\nCo-Founder & CEO @ InfiniLink\n\n\n...,...,,https://infinilink.ai/,7,The founder demonstrates significant domain ex...,4,2,2,Extensive semiconductor expertise with relevan...,Limited experience in scaling a company or ach...,Leverage existing industry contacts to form an...
