In [None]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset path used by the
# loading cell below resolves.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Section A: Data Preprocessing and Cleaning

In [None]:
# Read the problem/solution dataset from the mounted Drive, decoding the file as latin-1.
problem_solution_df = pd.read_csv('/content/drive/MyDrive/AI EarthHack Dataset.csv', encoding='latin-1')
# Preview the first 10 rows.
problem_solution_df.head(10)

Unnamed: 0,id,problem,solution
0,1,The construction industry is indubitably one o...,"Herein, we propose an innovative approach to m..."
1,2,"I'm sure you, like me, are feeling the heat - ...","Imagine standing on a green hill, not a single..."
2,3,The massive shift in student learning towards ...,"Implement a """"Book Swap"""" program within educa..."
3,4,The fashion industry is one of the top contrib...,The proposed solution is a garment rental serv...
4,5,The majority of the materials used in producin...,An innovative concept would be a modular elect...
5,6,Businesses worldwide expend substantial financ...,The proposed solution involves developing a se...
6,7,more than 130 Billon plastic bottles waste ann...,Bariq factory to recyle plastic bottels
7,8,"In congested cities like Berlin, one of the si...",Let's revolutionize the carsharing experience...
8,9,One major global issue we face today is the su...,"My solution is an innovative Reloop - System, ..."
9,10,The usage of plastic bottles,"Creating a service that sells bottles, and re-..."


## A. Preprocessing

We first check for missing values. Only one row is affected (its solution is empty), so we simply drop it.

In [None]:
# Show every row that has at least one missing field before anything is removed.
rows_with_nan = problem_solution_df.isnull().any(axis=1)
nan_values = problem_solution_df[rows_with_nan]
print(f"Before change:\n{nan_values}\n")

# Keep only the complete rows.
problem_solution_df = problem_solution_df.dropna()

# Confirm that nothing is missing any more.
still_missing = problem_solution_df.isnull().any().any()
print("Missing values found after change." if still_missing else "No empty values after change.")

Before change:
        id                                            problem solution
1030  1031  The global fashion industry is a significant c...      NaN

No empty values after change.


After dropping missing values, the dataset is now ready to be processed.

# Section B: Clustering Problems

To identify common themes, we cluster the problems into the topics that underlie the provided dataset.

In [None]:
import nltk

In [None]:
import pandas as pd
import nltk
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources requested by this cell: the stopword list, the WordNet
# data and the punkt tokenizer models.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Work on a copy of the cleaned dataset and pull out both text columns
# (problems and solutions are processed in parallel below).
data = problem_solution_df.copy()
problem_descriptions = data['problem']
solution_descriptions = data['solution']

# Shared text-preprocessing state used by preprocess_text
lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))  # Update stopwords according to your domain
# Extra high-frequency words filtered out in addition to stopwords_set; the same list
# is reused further down to filter the top words that name each topic.
common_words = ['the', 'and', 'of', 'to', 'is', 'in', 'for', 'this', 'are', 'it', 'by']

def preprocess_text(text):
    """Normalise one description into a space-separated string of lemmas.

    The text is lowercased and tokenized with NLTK. Tokens that are not purely
    alphabetic, or that occur in ``stopwords_set`` or ``common_words``, are
    discarded; the remaining tokens are lemmatized and joined with single spaces.
    """
    lemmas = []
    for word in nltk.word_tokenize(text.lower()):
        if not word.isalpha():
            continue
        if word in stopwords_set or word in common_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

def _select_num_topics(corpus, dictionary, texts, candidate_counts):
    """Fit one LDA model per candidate topic count and score each with c_v coherence.

    Returns ``(best_count, scores)`` where ``scores[i]`` belongs to
    ``candidate_counts[i]`` and ``best_count`` is the first candidate with the
    highest score.
    """
    scores = []
    for num_topics in candidate_counts:
        lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(coherence_model.get_coherence())
    return candidate_counts[scores.index(max(scores))], scores


def _name_topics(lda_model, dictionary, num_top_words):
    """Label every topic with its highest-weighted words (minus ``common_words``)."""
    topic_names = {}
    for topic_idx, topic in enumerate(lda_model.get_topics()):
        top_words = [dictionary[i] for i in topic.argsort()[:-num_top_words - 1:-1] if dictionary[i] not in common_words]
        topic_names[topic_idx] = ' '.join(top_words)
    return topic_names


def _dominant_topic(document_topics):
    """Return the id of the topic with the largest probability for one document."""
    return max(document_topics, key=lambda x: x[1])[0]


# Preprocess the problem and solution descriptions
preprocessed_problem_descriptions = problem_descriptions.apply(preprocess_text)
preprocessed_solution_descriptions = solution_descriptions.apply(preprocess_text)

# Tokenize the preprocessed text data
tokenized_problem_descriptions = [nltk.word_tokenize(description) for description in preprocessed_problem_descriptions]
tokenized_solution_descriptions = [nltk.word_tokenize(description) for description in preprocessed_solution_descriptions]

# Create a dictionary from the tokenized descriptions
problem_dictionary = Dictionary(tokenized_problem_descriptions)
solution_dictionary = Dictionary(tokenized_solution_descriptions)

# Create a document-term matrix using the dictionary
problem_corpus = [problem_dictionary.doc2bow(tokens) for tokens in tokenized_problem_descriptions]
solution_corpus = [solution_dictionary.doc2bow(tokens) for tokens in tokenized_solution_descriptions]

# Find the optimal number of topics using the coherence score
start_topics = 2
end_topics = 10
topic_range = range(start_topics, end_topics + 1)

optimal_num_topics_problem, problem_coherence_scores = _select_num_topics(
    problem_corpus, problem_dictionary, tokenized_problem_descriptions, topic_range)
optimal_num_topics_solution, solution_coherence_scores = _select_num_topics(
    solution_corpus, solution_dictionary, tokenized_solution_descriptions, topic_range)

# Create the LDA models with the optimal number of topics
problem_lda_model = LdaModel(corpus=problem_corpus, id2word=problem_dictionary, num_topics=optimal_num_topics_problem, random_state=42)
solution_lda_model = LdaModel(corpus=solution_corpus, id2word=solution_dictionary, num_topics=optimal_num_topics_solution, random_state=42)

# Assign cohesive names to the topics. The solution naming used to be pasted three
# times in this cell; it now runs once through the shared helper.
num_top_words = 10  # Define the number of top words to display for each topic
problem_topic_names = _name_topics(problem_lda_model, problem_dictionary, num_top_words)
solution_topic_names = _name_topics(solution_lda_model, solution_dictionary, num_top_words)

# Assign topics to problem and solution descriptions
problem_topics = [problem_lda_model.get_document_topics(doc) for doc in problem_corpus]
solution_topics = [solution_lda_model.get_document_topics(doc) for doc in solution_corpus]

# Build all rows first and create the DataFrame once; writing one row at a time
# through .loc re-allocates the frame on every insert.
result_rows = [
    (
        problem_desc,
        problem_topic_names[_dominant_topic(problem_doc_topics)],
        solution_desc,
        solution_topic_names[_dominant_topic(solution_doc_topics)],
    )
    for problem_desc, problem_doc_topics, solution_desc, solution_doc_topics in zip(
        preprocessed_problem_descriptions, problem_topics,
        preprocessed_solution_descriptions, solution_topics)
]
results_df = pd.DataFrame(
    result_rows,
    columns=['Problem Description', 'Problem Topic Name', 'Solution Description', 'Solution Topic Name'],
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import pandas as pd

# Get unique keywords from both columns
unique_keywords = set(results_df["Problem Topic Name"].tolist() + results_df["Solution Topic Name"].tolist())

# Print unique keywords for user selection. The set is sorted first because the
# iteration order of a set of strings is not stable between kernel sessions, which
# made this cell's output change on every fresh run. `key=str` keeps the sort
# working even if a non-string value (e.g. NaN) is present.
print("Available keywords:")
for keyword in sorted(unique_keywords, key=str):
    print(keyword)

def keyword_matches(row, keyword):
    """Tell whether ``keyword`` is a substring of either topic name stored in ``row``.

    ``row`` is indexed by "Problem Topic Name" and "Solution Topic Name"; a missing
    (NaN) topic name is treated as an empty string.
    """
    topic_names = [row[column] for column in ("Problem Topic Name", "Solution Topic Name")]
    return any(keyword in ("" if pd.isna(name) else name) for name in topic_names)

# Get multiple keyword selections from the user
selected_keywords = []
while True:
    keyword = input("Enter a keyword (or type 'done' to finish): ").strip()
    if keyword.lower() == "done":
        break
    if not keyword:
        # An empty string is a substring of every topic name, so accepting a blank
        # entry would silently select the whole DataFrame.
        continue
    selected_keywords.append(keyword)

# Create a new DataFrame based on the selected keywords. `.copy()` makes the result
# independent of results_df, so later cells can add columns to it without writing
# into a slice of another frame.
row_is_selected = results_df.apply(
    lambda row: any(keyword_matches(row, keyword) for keyword in selected_keywords), axis=1)
filtered_df = results_df[row_is_selected].copy()

print("\nNew DataFrame with the selected keyword:")
print(filtered_df)

filtered_df.to_csv('screen_1_output.csv')
filtered_df

Available keywords:
waste solution business also model product could recycling material plastic
waste business circular product would could material also new economy
waste plastic environmental resource industry material significant pollution problem environment
would waste also platform model consumer system circular new business
waste business system material would platform model food recycling solution
waste product packaging solution circular food economy environmental use plastic
model business product company could material new waste solution consumer
waste product solution material company would resource recycling device business
waste environmental industry fashion pollution significant contributes resource landfill due
business could environmental material waste would model also economy circular
waste environmental industry also issue amount significant due problem energy
model business waste product material new could service also company
recycling model business service also

Unnamed: 0,Problem Description,Problem Topic Name,Solution Description,Solution Topic Name
0,construction industry indubitably one signific...,waste plastic environmental resource industry ...,herein propose innovative approach mitigate pr...,business could environmental material waste wo...
1,sure like feeling heat literally world health ...,waste environmental industry fashion pollution...,imagine standing green hill single towering no...,waste solution business also model product cou...
2,massive shift student learning towards digital...,waste environmental industry also issue amount...,implement book swap program within educational...,would waste also platform model consumer syste...
3,fashion industry one top contributor global po...,waste environmental industry fashion pollution...,proposed solution garment rental service servi...,model business waste product material new coul...
4,majority material used producing electronic go...,waste environmental resource electronic proble...,innovative concept would modular electronic de...,model business product company could material ...
...,...,...,...,...
1294,linear make dispose model production consumpti...,waste plastic environmental resource industry ...,addressing problem circular economy approach w...,would waste also platform model consumer syste...
1295,conundrum face improper disposal sanitary pad ...,waste environmental resource electronic proble...,proposed solution recycling machine transformi...,waste product solution material company would ...
1296,solution help vegetation,waste environmental industry fashion pollution...,use old tea bag compost soil,waste business system material would platform ...
1297,accumulation improper disposal plastic causing...,waste plastic environmental resource industry ...,proposed solution implement innovative recycli...,waste business circular product would could ma...


# Section C: Financial

In [None]:
!pip install --upgrade sec-api
!pip install spacy
!python -m spacy download en_core_web_sm

2024-01-07 19:58:11.546524: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-07 19:58:11.546582: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-07 19:58:11.547885: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load

In [None]:
import pandas as pd
import sec_api
import spacy
from spacy.matcher import PhraseMatcher

In [None]:
import os
from getpass import getpass

# Never commit a credential to the notebook: take the key from the environment if it
# is already set, otherwise prompt for it without echoing the input.
api_key = os.environ.get("SEC_API_KEY") or getpass("SEC API key: ")

# Export the key for code that reads SEC_API_KEY. Setting os.environ directly
# (instead of the %env magic) keeps the secret out of the cell output.
os.environ["SEC_API_KEY"] = api_key

env: SEC_API_KEY=<redacted>


In [None]:
# Load the small English spaCy pipeline once; find_similar_companies uses it both to
# parse text and as the vocabulary source for its PhraseMatcher.
nlp = spacy.load("en_core_web_sm")

In [None]:
import pandas as pd
import json
import re

cik_ticker_df = pd.read_csv('/content/cik.csv')
# The first cell of the file is taken as the text to search.
# NOTE(review): presumably this cell holds the raw JSON of the SEC ticker mapping —
# confirm against the actual cik.csv.
cik_ticker_json = cik_ticker_df.iloc[0, 0]

# Capture the digits of every "cik_str" value that is directly followed by a "ticker" key.
pattern = r'"cik_str":(\d+),"ticker":'
# Search the string that was just read. The previous code searched `json_string`,
# a name that is never defined, so the cell failed with a NameError.
ciks_to_process = re.findall(pattern, cik_ticker_json)

In [None]:
import sec_api.filings

ModuleNotFoundError: No module named 'sec_api.filings'

In [None]:
import requests
import pandas as pd
import sec_api

def get_all_filings(cik_list):
    """Request the 10-K filings of every CIK in ``cik_list``.

    Forwards ``cik_list`` to ``sec_api.filings`` with the filing type fixed to
    "10-K" and the date window fixed to 2007-01-01 .. 2019-12-31, and returns the
    result unchanged.

    NOTE(review): the recorded output of this cell is
    ``AttributeError: module 'sec_api' has no attribute 'filings'``, so this call has
    not run successfully as written. The entry point and keyword names must be
    checked against the installed sec-api package before relying on this function.
    """
    filings = sec_api.filings(
        company_ciks=cik_list,  # Use company_ciks for multiple CIKs
        filing_types=["10-K"],
        start_date="2007-01-01",
        end_date="2019-12-31",
    )
    return filings

def find_similar_companies_for_all_rows(filtered_df, ciks_to_process):
    """Annotate every row of ``filtered_df`` with similar companies and margin statistics.

    The 10-K filings for ``ciks_to_process`` are fetched once. Each row's
    "Solution Description" is matched against them with ``find_similar_companies``,
    and the gross-margin statistics of the same filings are attached to every row.

    ``filtered_df`` is modified in place (columns "Similar Companies",
    "Min Gross Margin", "Max Gross Margin", "Median Gross Margin"); nothing is
    returned. ``find_similar_companies`` and ``calculate_gross_margin_statistics``
    must already be defined when this function is called.
    """
    if filtered_df.empty:
        return

    all_filings = get_all_filings(ciks_to_process)  # Directly retrieve filings for specified CIKs

    # The statistics depend only on all_filings, not on the row, so they are computed
    # once instead of once per row.
    min_gross_margin, max_gross_margin, median_gross_margin = calculate_gross_margin_statistics(
        all_filings
    )

    similar_companies_per_row = [
        find_similar_companies(solution_description, all_filings)
        for solution_description in filtered_df["Solution Description"]
    ]

    # Each cell of "Similar Companies" holds a list. Assigning a list through
    # .loc[index, column] is treated as a sequence to spread over the selection rather
    # than as one cell value, so the whole object column is assigned at once instead.
    filtered_df["Similar Companies"] = pd.Series(
        similar_companies_per_row, index=filtered_df.index, dtype=object
    )
    filtered_df["Min Gross Margin"] = min_gross_margin
    filtered_df["Max Gross Margin"] = max_gross_margin
    filtered_df["Median Gross Margin"] = median_gross_margin

# Call the function with the specified CIKs
find_similar_companies_for_all_rows(filtered_df, ciks_to_process)

AttributeError: module 'sec_api' has no attribute 'filings'

In [None]:
import requests
import pandas as pd

# Load CIK-ticker mapping from the SEC website
cik_ticker_url = "https://www.sec.gov/files/company_tickers.json"
response = requests.get(cik_ticker_url)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
# NOTE(review): sec.gov may reject requests that lack a descriptive User-Agent
# header — confirm and pass `headers=` if needed.
response.raise_for_status()
cik_ticker_data = response.json()

# The payload is treated as a dict whose *values* are company records carrying
# "cik_str" and "ticker" (the same layout the other CIK cells of this notebook read).
# The previous comprehension mapped each top-level key to its whole record, which is
# not a CIK -> ticker mapping.
cik_to_ticker = {company["cik_str"]: company["ticker"] for company in cik_ticker_data.values()}
# Reverse lookup used below: the DataFrame provides tickers and the API needs CIKs.
ticker_to_cik = {ticker: cik for cik, ticker in cik_to_ticker.items()}

def get_all_filings(cik_list):
    """Request the 10-K filings (2007-01-01 .. 2019-12-31) of every CIK in ``cik_list``.

    NOTE(review): an earlier cell's recorded output shows that ``sec_api.filings``
    raises AttributeError; verify the entry point against the installed sec-api package.
    """
    filings = sec_api.filings(
        company_ciks=cik_list,  # Use company_ciks for multiple CIKs
        filing_types=["10-K"],
        start_date="2007-01-01",
        end_date="2019-12-31",
    )
    return filings

def find_similar_companies_for_all_rows(filtered_df):
    """Annotate every row of ``filtered_df`` with similar companies and margin statistics.

    CIKs are resolved from the "Ticker" column, their 10-K filings are fetched once,
    and the results are written back to ``filtered_df`` in place (columns
    "Similar Companies", "Min Gross Margin", "Max Gross Margin",
    "Median Gross Margin"). Nothing is returned.

    Raises:
        KeyError: if ``filtered_df`` has no "Ticker" or "Solution Description" column.
    """
    if filtered_df.empty:
        return

    # Retrieve CIKs using tickers from the DataFrame
    tickers = filtered_df["Ticker"]  # Assuming a "Ticker" column exists
    # Look tickers up in the ticker -> CIK direction and drop unknown tickers rather
    # than sending None to the API.
    cik_list = [ticker_to_cik[ticker] for ticker in tickers if ticker in ticker_to_cik]

    all_filings = get_all_filings(cik_list)

    # The statistics depend only on all_filings, so compute them once, not per row.
    min_gross_margin, max_gross_margin, median_gross_margin = calculate_gross_margin_statistics(
        all_filings
    )

    similar_companies_per_row = [
        find_similar_companies(solution_description, all_filings)
        for solution_description in filtered_df["Solution Description"]
    ]

    # Update the DataFrame with the results. The list-valued column is assigned as a
    # whole object Series, because a list written through .loc[index, column] is
    # treated as a sequence to spread over the selection, not as one cell value.
    filtered_df["Similar Companies"] = pd.Series(
        similar_companies_per_row, index=filtered_df.index, dtype=object
    )
    filtered_df["Min Gross Margin"] = min_gross_margin
    filtered_df["Max Gross Margin"] = max_gross_margin
    filtered_df["Median Gross Margin"] = median_gross_margin

# Call the function to process the DataFrame
find_similar_companies_for_all_rows(filtered_df)

In [None]:
def get_10k_reports(cik):
    """Request the 10-K filings of a single company.

    Passes ``cik`` to ``sec_api.filings`` with the filing type fixed to "10-K" and
    the date window fixed to 2007-01-01 .. 2019-12-31, and returns the result
    unchanged.

    NOTE(review): an earlier cell's recorded output shows
    ``AttributeError: module 'sec_api' has no attribute 'filings'``; this function
    uses the same attribute and needs to be checked against the installed package.
    """
    filings = sec_api.filings(
        company_cik=cik,
        filing_types=["10-K"],
        start_date="2007-01-01",
        end_date="2019-12-31",
    )
    return filings

In [None]:
import statistics

def calculate_gross_margin_statistics(filings):
    """Return ``(min, max, median)`` of the gross margins found in ``filings``.

    Each filing is expected to be a mapping with a "report_url" entry whose text is
    fetched through ``sec_api.document_text``.

    Returns:
        A 3-tuple ``(min, max, median)``, or ``(None, None, None)`` when no gross
        margin value was collected. The extraction step is still a TODO, so at the
        moment the list is always empty; previously that made ``min()`` raise
        ``ValueError`` on every call.
    """
    gross_margin_values = []
    for filing in filings:
        report_url = filing["report_url"]
        # NOTE(review): `sec_api.document_text` has not been verified against the
        # installed sec-api package — confirm.
        report_text = sec_api.document_text(report_url)
        # TODO: extract the gross margin values from report_text and append them to
        # gross_margin_values
        # ...

    if not gross_margin_values:
        return None, None, None

    min_gross_margin = min(gross_margin_values)
    max_gross_margin = max(gross_margin_values)
    median_gross_margin = statistics.median(gross_margin_values)
    return min_gross_margin, max_gross_margin, median_gross_margin

In [None]:
def find_similar_companies(solution_description, filings):
    """Return the names of companies whose report text contains the solution phrase.

    ``solution_description`` is turned into a single PhraseMatcher pattern, so a
    filing only matches when the full token sequence of the description occurs in
    its report text.

    Args:
        solution_description: text to search for.
        filings: iterable of mappings with "report_url" and "company_name" entries.

    Returns:
        List of ``filing["company_name"]`` for every filing with at least one match.
    """
    matcher = PhraseMatcher(nlp.vocab)
    solution_description_doc = nlp(solution_description)
    if len(solution_description_doc) == 0:
        # Nothing to search for; skip downloading the reports.
        return []

    # spaCy v3 signature is add(key, docs): the patterns go in a list. The v2-style
    # call add(key, None, doc) passes one positional argument too many for the
    # spaCy 3.x release this notebook installs.
    matcher.add("SolutionDescription", [solution_description_doc])

    similar_companies = []
    for filing in filings:
        report_url = filing["report_url"]
        # NOTE(review): `sec_api.document_text` has not been verified against the
        # installed sec-api package — confirm.
        report_text = sec_api.document_text(report_url)
        report_text_doc = nlp(report_text)
        matches = matcher(report_text_doc)
        if matches:
            similar_companies.append(filing["company_name"])
    return similar_companies

In [None]:
import requests
import time

import os

# Reuse the key exported as SEC_API_KEY earlier in the notebook; only fall back to the
# placeholder when it is not set. Unconditionally assigning the placeholder overwrote
# the real key for every cell that runs after this one.
api_key = os.environ.get("SEC_API_KEY", "YOUR_SEC_API_KEY")  # Replace the fallback with your actual API key
# Start with a diverse set of seed CIKs (add more strings here). The list previously
# ended in a literal `...`, which is the Ellipsis object: it was popped first by the
# crawl loop and requested as if it were a CIK.
ciks_to_process = ["0000000000"]
processed_ciks = set()

def get_ciks_from_company_facts(cik):
    """Fetch the company-facts document for ``cik`` and return its "OperatingCompanies" entry.

    Sends a GET request to ``https://data.sec.gov/api/xbrl/companyfacts/{cik}`` with
    the module-level ``api_key`` as a bearer token.

    Returns:
        ``data.get("OperatingCompanies", [])`` when the response status is 200;
        otherwise the response text is printed and an empty list is returned.

    NOTE(review): whether this endpoint accepts bearer authentication, expects this
    exact URL form, or returns an "OperatingCompanies" field is not shown anywhere
    in this notebook — confirm against the SEC API documentation.
    """
    url = f"https://data.sec.gov/api/xbrl/companyfacts/{cik}"
    headers = {"Authorization": f"Bearer {api_key}"}
    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()
        # A missing key yields an empty list, i.e. no new CIKs to follow.
        return data.get("OperatingCompanies", [])
    else:
        print(f"Error fetching data for CIK {cik}: {response.text}")
        return []

# Crawl: take CIKs off the work list until it is empty, visiting each CIK at most once.
while ciks_to_process:
    cik = ciks_to_process.pop()
    if cik in processed_ciks:
        continue
    processed_ciks.add(cik)

    try:
        new_ciks = get_ciks_from_company_facts(cik)
        ciks_to_process.extend(new_ciks)
    except Exception as e:
        print(f"Error processing CIK {cik}: {e}")
        # Back off before moving on; the failed CIK is already marked as processed
        # and is not queued again.
        time.sleep(5)
    else:
        # Throttle successful requests to one per second.
        time.sleep(1)

# Report what was collected and the known limits of this approach
print("Collected a substantial set of CIKs:", processed_ciks)
print("Note: Retrieving CIKs for all companies directly from the API isn't feasible.")
print("Consider alternative data providers or strategies for comprehensive CIK lists.")


In [None]:
import requests
import pandas as pd
import json
from json import JSONDecodeError  # Import JSONDecodeError exception

import re

cik_ticker_url = "https://www.sec.gov/files/company_tickers.json"
response = requests.get(cik_ticker_url)

ciks = None  # Initialize ciks to avoid NameError


def extract_cik_str_from_text(text):
    """Regex fallback: return every "cik_str" number found in raw ``text`` as a list of digit strings.

    This helper was called below but never defined, so the fallback branch always
    ended in a NameError that the broad ``except`` reported as an extraction failure.
    """
    return re.findall(r'"cik_str":(\d+),"ticker":', text)


try:
    # Attempt to parse as JSON
    cik_ticker_data = response.json()

    # Extract CIKs based on data structure
    if "cik_str" in cik_ticker_data:  # Top-level key
        ciks = cik_ticker_data["cik_str"]
    else:  # Within each company's data
        ciks = [company.get("cik_str") for company in cik_ticker_data.values()]

except JSONDecodeError:
    print("The response is not valid JSON. Attempting alternative strategies.")

    try:
        # Attempt to extract CIKs from raw text; an empty result counts as failure
        ciks = extract_cik_str_from_text(response.text) or None
    except Exception as e:
        print(f"Failed to extract CIKs from text: {e}")

        # Explore other data sources or approaches:
        print("Considering alternative data sources or approaches.")
        # ... Implement logic for alternative strategies

except Exception as e:  # Broader exception handling
    print(f"An error occurred: {e}")

# Print CIKs or provide a message if extraction was unsuccessful
if ciks is not None:
    print(ciks)
else:
    print("CIK extraction was unsuccessful.")

# Ethical reminder
print("Important: Respect data usage guidelines and privacy regulations.")


In [None]:
def get_10k_reports(cik):
    """Request the 10-K filings (2007-01-01 .. 2019-12-31) of the company ``cik``.

    This redefines the function of the same name from an earlier cell with an
    identical body; the later definition is the one in effect after this cell runs.

    NOTE(review): an earlier cell's recorded output shows
    ``AttributeError: module 'sec_api' has no attribute 'filings'``; verify this
    entry point against the installed sec-api package.
    """
    filings = sec_api.filings(
        company_cik=cik,
        filing_types=["10-K"],
        start_date="2007-01-01",
        end_date="2019-12-31",
    )
    return filings

def calculate_gross_margin_statistics(filings):
    """Return ``(min, max, median)`` of the gross margins found in ``filings``.

    Each filing is expected to be a mapping with a "report_url" entry whose text is
    fetched through ``sec_api.document_text``.

    Returns:
        A 3-tuple ``(min, max, median)``, or ``(None, None, None)`` when no gross
        margin value was collected. The extraction step is still a TODO, so the list
        is currently always empty; previously that made ``min()`` raise ``ValueError``.
    """
    # Imported here so this cell does not depend on an import made in another cell.
    import statistics

    gross_margin_values = []
    for filing in filings:
        report_url = filing["report_url"]
        # NOTE(review): `sec_api.document_text` has not been verified against the
        # installed sec-api package — confirm.
        report_text = sec_api.document_text(report_url)
        # TODO: extract the gross margin values from report_text and append them to
        # gross_margin_values
        # ...

    if not gross_margin_values:
        return None, None, None

    min_gross_margin = min(gross_margin_values)
    max_gross_margin = max(gross_margin_values)
    median_gross_margin = statistics.median(gross_margin_values)
    return min_gross_margin, max_gross_margin, median_gross_margin

def find_similar_companies(solution_description, filings):
    """Return the names of companies whose report text contains the solution phrase.

    ``solution_description`` becomes a single PhraseMatcher pattern, so a filing only
    matches when the full token sequence of the description occurs in its report text.

    Args:
        solution_description: text to search for.
        filings: iterable of mappings with "report_url" and "company_name" entries.

    Returns:
        List of ``filing["company_name"]`` for every filing with at least one match.
    """
    matcher = PhraseMatcher(nlp.vocab)
    solution_description_doc = nlp(solution_description)
    if len(solution_description_doc) == 0:
        # Nothing to search for; skip downloading the reports.
        return []

    # spaCy v3 signature is add(key, docs): the patterns go in a list. The v2-style
    # call add(key, None, doc) passes one positional argument too many for the
    # spaCy 3.x release this notebook installs.
    matcher.add("SolutionDescription", [solution_description_doc])

    similar_companies = []
    for filing in filings:
        report_url = filing["report_url"]
        # NOTE(review): `sec_api.document_text` has not been verified against the
        # installed sec-api package — confirm.
        report_text = sec_api.document_text(report_url)
        report_text_doc = nlp(report_text)
        matches = matcher(report_text_doc)
        if matches:
            similar_companies.append(filing["company_name"])
    return similar_companies

# Collect the per-row results first, then write the new columns in one go.
similar_companies_per_row = []
gross_margin_stats_per_row = []
for index, row in filtered_df.iterrows():
    # Fetch the company's filings once and reuse them for both analyses (they were
    # requested twice per row before). list() lets the result be iterated twice even
    # if the API hands back a one-shot iterator.
    company_filings = list(get_10k_reports(row["CIK"]))
    similar_companies_per_row.append(
        find_similar_companies(row["Solution Description"], company_filings)
    )
    gross_margin_stats_per_row.append(calculate_gross_margin_statistics(company_filings))

# Update the DataFrame with the results. The list-valued column is assigned as a whole
# object Series, because a list written through .loc[index, column] is treated as a
# sequence to spread over the selection rather than as a single cell value.
filtered_df["Similar Companies"] = pd.Series(
    similar_companies_per_row, index=filtered_df.index, dtype=object
)
for stat_position, stat_column in enumerate(
    ("Min Gross Margin", "Max Gross Margin", "Median Gross Margin")
):
    filtered_df[stat_column] = [stats[stat_position] for stats in gross_margin_stats_per_row]

In [None]:
import pandas as pd
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Load the dataset
df = problem_solution_df.head(100)

# Preprocess the data: one TF-IDF vector per idea (problem text + solution text)
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["problem"] + " " + df["solution"])

# Construct the graph: one node per row position
graph = nx.Graph()
graph.add_nodes_from(range(len(df)))

# Create one edge per unordered pair, weighted by cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
edges = [
    (i, j, similarity_matrix[i][j])
    for i in range(len(df))
    for j in range(i + 1, len(df))
]

# Sort edges by similarity score in descending order
edges.sort(key=lambda x: x[2], reverse=True)

# Add edges to the graph
graph.add_weighted_edges_from(edges)

# Perform graph analysis (e.g., community detection).
# weight="weight" is required: the default (weight=None) treats every edge as weight 1,
# and since this graph connects every pair of ideas the similarity scores would be
# ignored entirely.
communities = nx.algorithms.community.greedy_modularity_communities(graph, weight="weight")

# Print the identified communities
# for i, community in enumerate(communities):
#     print(f"Community {i+1}:")
#     for node in community:
#         print(f"- Idea {node}: {df.iloc[node]['problem']}")

# Pick the idea to recommend for: the second endpoint of the most similar pair
unique_idea = max(graph.edges, key=lambda x: graph.edges[x]["weight"])[-1]

# Generate recommendations: the five neighbours with the highest similarity
similar_ideas = sorted(graph[unique_idea].items(), key=lambda x: x[1]["weight"], reverse=True)[:5]
recommendations = [idea[0] for idea in similar_ideas]

print(f"\nRecommendations for Idea {unique_idea}:")
for recommendation in recommendations:
    print(f"- Idea {recommendation}: {df.iloc[recommendation]['solution']}")

In [None]:
# Visualize the graph
# Node size grows with the number of edges touching the node
node_size = [100 * len(graph.edges(node)) for node in graph.nodes]

# Spring layout with k=0.2 to control the spacing between nodes
pos = nx.spring_layout(graph, k=0.2)

# Large canvas so that labels stay readable
plt.figure(figsize=(16, 12))

# Nodes in light blue, thin translucent edges, gray labels
nx.draw_networkx_nodes(graph, pos, node_color='lightblue', node_size=node_size)
nx.draw_networkx_edges(graph, pos, alpha=0.3, width=0.5)
nx.draw_networkx_labels(graph, pos, font_color='gray', font_size=12)

# Title the figure and hide the axes
plt.title("Circular Economy Business Ideas Graph")
plt.axis('off')

# Render the figure
plt.show()

In [None]:
from flask import Flask, request

app = Flask(__name__)

def generate_recommendations(target_idea):
    """Return the row positions of the five ideas most similar to ``target_idea``.

    Similarity is the cosine similarity between TF-IDF vectors of each idea's
    problem and solution text. Results are ordered from most to least similar; ties
    keep ascending row order. The target itself is never returned.

    Args:
        target_idea: row position (0-based) of the idea in ``problem_solution_df``.

    Returns:
        A list of at most five ``int`` row positions.

    Raises:
        KeyError: if ``target_idea`` is not a valid row position.
    """
    # Load the dataset
    df = problem_solution_df

    # Reject unknown ideas up front (an unknown node also raised KeyError before).
    if target_idea not in range(len(df)):
        raise KeyError(target_idea)
    position = int(target_idea)

    # Preprocess the data
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(df["problem"] + " " + df["solution"])

    # Only the target's similarities are needed, so compare that single row against
    # all rows instead of materialising a complete graph with one edge per pair of
    # ideas on every call.
    similarities = cosine_similarity(tfidf_matrix[position], tfidf_matrix)[0]

    # list.sort is stable (also with reverse=True), so equally similar ideas stay in
    # ascending row order.
    candidates = [i for i in range(len(df)) if i != position]
    candidates.sort(key=lambda i: similarities[i], reverse=True)
    recommendations = candidates[:5]

    return recommendations

@app.route('/recommendations', methods=['POST'])
def get_recommendations():
    """POST /recommendations: return the ideas most similar to the requested one.

    Expects a JSON object body with a ``target_idea`` field and answers with
    ``{"recommendations": [...]}``. A missing or non-JSON body, or a body without
    ``target_idea``, is answered with HTTP 400; a ``target_idea`` that does not
    identify an idea is answered with HTTP 404. Previously both cases surfaced as
    unhandled exceptions (HTTP 500).
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'target_idea' not in data:
        return {'error': "Request body must be a JSON object with a 'target_idea' field."}, 400
    target_idea = data['target_idea']

    try:
        recommendations = generate_recommendations(target_idea)
    except (KeyError, TypeError):
        # KeyError: no idea with that id; TypeError: the value cannot be used as an id.
        return {'error': f"Unknown target_idea: {target_idea!r}"}, 404

    response = {
        'recommendations': recommendations
    }

    return response

if __name__ == '__main__':
    app.run()

In [None]:
pip install pyvis

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pyvis.network import Network

df = problem_solution_df.head(30)

# Preprocess the data: one TF-IDF vector per idea (problem text + solution text)
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["problem"] + " " + df["solution"])

# Construct the graph: one node per row position
graph = nx.Graph()
graph.add_nodes_from(range(len(df)))

# Create one edge per unordered pair, weighted by cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
edges = [
    (i, j, similarity_matrix[i][j])
    for i in range(len(df))
    for j in range(i + 1, len(df))
]

# Sort edges by similarity score in descending order
edges.sort(key=lambda x: x[2], reverse=True)

# Add edges to the graph
graph.add_weighted_edges_from(edges)

# Perform graph analysis (e.g., community detection).
# weight="weight" is required: the default (weight=None) treats every edge as weight 1,
# and since this graph connects every pair of ideas the similarity scores would be
# ignored entirely.
communities = nx.algorithms.community.greedy_modularity_communities(graph, weight="weight")

# Print the identified communities
for i, community in enumerate(communities):
    print(f"Community {i+1}:")
    for node in community:
        print(f"- Idea {node}: {df.iloc[node]['problem']}")

# Generate recommendations based on the "best" idea
best_idea = len(df) - 1  # Example: Recommend solutions based on the last idea
similar_ideas = sorted(graph[best_idea].items(), key=lambda x: x[1]["weight"], reverse=True)[:5]
recommendations = [idea[0] for idea in similar_ideas]

print(f"\nRecommendations for Idea {best_idea}:")
for recommendation in recommendations:
    print(f"- Idea {recommendation}: {df.iloc[recommendation]['solution']}")

# Create an interactive graph using pyvis
nt = Network(notebook=True)
nt.from_nx(graph)

# Add node attributes (e.g., problem and solution)
for i, node in enumerate(nt.nodes):
    node["title"] = f"Idea {i}: {df.iloc[i]['problem']}"
    # Rank by community: members of the first community get the largest value;
    # a node found in no community gets len(communities) + 1.
    node["value"] = len(communities) - next((j for j, c in enumerate(communities) if i in c), -1)
    # NOTE(review): "color" receives the same integer as "value" — confirm that the
    # renderer accepts a number here rather than a colour string.
    node["color"] = node["value"]

# Configure edge colors based on weights
weights = []
for u, v, data in graph.edges(data=True):
    if "weight" in data:
        weights.append(data["weight"])
    else:
        weights.append(0)

# Map each weight to an opacity in [0, 1] relative to the observed range
min_weight, max_weight = min(weights), max(weights)
edge_colors = []
for weight in weights:
    if max_weight != min_weight:
        normalized_weight = (weight - min_weight) / (max_weight - min_weight)
    else:
        normalized_weight = 0
    edge_colors.append(f"rgba(0, 0, 0, {normalized_weight})")

# Set node and edge styles
nt.set_edge_smooth("continuous")

# Assign edge colors (presumably nt.edges follows the order of graph.edges — verify)
for idx, color in enumerate(edge_colors):
    nt.edges[idx]["color"] = color

# Visualize the graph
nt.show("graph.html")

In [None]:
df = problem_solution_df.head(30)

# Preprocess the data: one TF-IDF vector per idea (problem text + solution text)
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["problem"] + " " + df["solution"])

# Construct the graph: one node per row position
graph = nx.Graph()
graph.add_nodes_from(range(len(df)))

# Create one edge per unordered pair, weighted by cosine similarity
similarity_matrix = cosine_similarity(tfidf_matrix)
edges = [
    (i, j, similarity_matrix[i][j])
    for i in range(len(df))
    for j in range(i + 1, len(df))
]

# Sort edges by similarity score in descending order
edges.sort(key=lambda x: x[2], reverse=True)

# Add edges to the graph
graph.add_weighted_edges_from(edges)

# Perform graph analysis (e.g., community detection).
# weight="weight" is required: the default (weight=None) treats every edge as weight 1,
# and since this graph connects every pair of ideas the similarity scores would be
# ignored entirely.
communities = nx.algorithms.community.greedy_modularity_communities(graph, weight="weight")

# Print the identified communities
for i, community in enumerate(communities):
    print(f"Community {i+1}:")
    for node in community:
        print(f"- Idea {node}: {df.iloc[node]['problem']}")

# Generate recommendations based on similar ideas
target_idea = 0  # Example: Recommend solutions similar to the first idea
similar_ideas = sorted(graph[target_idea].items(), key=lambda x: x[1]["weight"], reverse=True)[:5]
recommendations = [idea[0] for idea in similar_ideas]

print(f"\nRecommendations for Idea {target_idea}:")
for recommendation in recommendations:
    print(f"- Idea {recommendation}: {df.iloc[recommendation]['solution']}")

# Visualize the graph
pos = nx.spring_layout(graph)
plt.figure(figsize=(12, 8))
nx.draw_networkx_nodes(graph, pos, node_size=200, node_color='lightblue')
nx.draw_networkx_edges(graph, pos, width=1.0, alpha=0.5)
nx.draw_networkx_labels(graph, pos, font_size=10, font_color='black')
plt.title("Circular Economy Business Ideas Graph")
plt.axis('off')
plt.show()