### Load API_KEYS, Merge and Preprocess Datasets

In [40]:
 
import os
from dotenv import load_dotenv
import openai

# Load environment variables from the .env file
load_dotenv()

# Retrieve the API key from the environment
openai_api_key = os.getenv('OPENAI_API_KEY')

# Set the OpenAI API key
openai.api_key = openai_api_key

In [41]:
import pandas as pd

azure_marketplace = pd.read_csv('Azure marketplace growth.csv')
aws_marketplace = pd.read_csv('Aws marketplace growth.csv')
chatgpt_store_growth = pd.read_csv('ChatGPT Store growth.csv')
github_marketplace = pd.read_csv('Github marketplace growth.csv')
github_repo_stars_growth = pd.read_csv('Github repo stars growth.csv')
google_workspace_marketplace = pd.read_csv('Google Workspace Marketplace.csv')
salesforce_marketplace = pd.read_csv('Salesforce Marketplace growth.csv')
shopify_marketplace = pd.read_csv('Shopify marketplace growth.csv')
visual_studio_marketplace = pd.read_csv('Visual Studio Marketplace Growth.csv')








In [42]:
datasets = [
    azure_marketplace, aws_marketplace, chatgpt_store_growth, github_marketplace, 
    github_repo_stars_growth, google_workspace_marketplace, salesforce_marketplace, 
    shopify_marketplace, visual_studio_marketplace
]



In [43]:
unique_datasets = []
for dataset in datasets:
    if not any(dataset.equals(existing) for existing in unique_datasets):
        unique_datasets.append(dataset)

In [44]:
combined_data = pd.concat(unique_datasets, ignore_index=True)


cleaned_data = combined_data.drop_duplicates()


cleaned_data.to_csv("cleaned_marketplace_data.csv", index=False)



In [45]:


cleaned_data.fillna({
    'last_month_installs': 0,
    'this_month_installs': 0,
    'installs_m_m_growth': 0,
    'description': 'Unknown',
    'domain': 'Unknown',
    'last_month_stars': 0,
    'this_month_stars': 0,
    'stars_m_m_growth': 0,
}, inplace=True)

# Handle any further inconsistencies like trailing spaces in text fields
cleaned_data['name'] = cleaned_data['name'].str.strip()
cleaned_data['description'] = cleaned_data['description'].str.strip()
cleaned_data['domain'] = cleaned_data['domain'].str.strip()

# Display the cleaned and processed data
cleaned_data.info()

# Save the cleaned data to a new CSV for later use
output_path = "cleaned_marketplace_data.csv"
cleaned_data.to_csv(output_path, index=False)





<class 'pandas.core.frame.DataFrame'>
Index: 17757 entries, 0 to 17866
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   name                 17757 non-null  object 
 1   url                  17355 non-null  object 
 2   last_month_installs  17757 non-null  float64
 3   this_month_installs  17757 non-null  float64
 4   installs_m_m_growth  17757 non-null  float64
 5   last_month_multiple  17740 non-null  float64
 6   description          17757 non-null  object 
 7   domain               17757 non-null  object 
 8   last_month_stars     17757 non-null  float64
 9   this_month_stars     17757 non-null  float64
 10  stars_m_m_growth     17757 non-null  float64
dtypes: float64(7), object(4)
memory usage: 1.6+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data.fillna({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['name'] = cleaned_data['name'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['description'] = cleaned_data['description'].str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value

### Perform Clustering 

In [47]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans







data = pd.read_csv('cleaned_marketplace_data.csv')

data['description'].fillna('Unknown', inplace=True)


print(data['description'].isna().sum()) 



vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['description'])

n_clusters = 10  
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
data['cluster'] = kmeans.fit_predict(X)


0


### Generate Trend Summary Report with LLM

In [48]:
import openai

# Define a function to generate trend summaries using GPT-3.5-turbo
def generate_trend_summary(cluster_data):
    descriptions = " ".join(cluster_data['description'].tolist()[:10]) 
    messages = [
        {"role": "system", "content": "You are a helpful assistant that generates titles and descriptions for market trends."},
        {"role": "user", "content": f"Based on the following app descriptions, generate a title and description for this trend: {descriptions}"}
    ]
    
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  
        messages=messages,
        max_tokens=500
    )
    
    return response['choices'][0]['message']['content']

# Apply the trend summary generation to each cluster
trend_summaries = []

for cluster_id in data['cluster'].unique():
    cluster_data = data[data['cluster'] == cluster_id]
    summary = generate_trend_summary(cluster_data)
    trend_summaries.append({
        'cluster_id': cluster_id,
        'summary': summary,
        'growth_pace': cluster_data['installs_m_m_growth'].mean(),
        'examples': cluster_data[['name', 'url']].head(7).to_dict(orient='records')
    })


In [49]:
import json


for trend in trend_summaries:
    trend['cluster_id'] = int(trend['cluster_id']) 
    trend['growth_pace'] = float(trend['growth_pace']) 
    

# Save the trend summaries to a JSON file
with open('trend_summaries.json', 'w') as f:
    json.dump(trend_summaries, f, indent=4)


for trend in trend_summaries:
    print(f"Cluster {trend['cluster_id']} - Summary:\n{trend['summary']}")
    print(f"Growth Pace: {trend['growth_pace']}")
    print(f"Examples: {trend['examples']}")
    print("\n")


Cluster 3 - Summary:
Title: "The Rise of Cryptic Messaging Apps"

Description: As technology advances, a new trend is emerging with messaging apps that prioritize privacy and security. These apps are gaining popularity among users seeking to communicate without leaving a digital footprint. With features like end-to-end encryption and limited data collection, these cryptic messaging apps allow users to converse in anonymity. Stay ahead of the curve and embrace the era of unknown communication with these cutting-edge platforms.
Growth Pace: 223.89087093389296
Examples: [{'name': 'VMware Tanzu Greenplum (BYOL)', 'url': 'https://azuremarketplace.microsoft.com/en-us/marketplace/apps/pivotal.pivotal-greenplum-azure-byol?exp=ubp8&tab=Overview'}, {'name': 'Prefect 1', 'url': 'https://azuremarketplace.microsoft.com/en-us/marketplace/apps/prefecttechnologiesinc1606247647944.prefect_cloud?exp=ubp8&tab=Overview'}, {'name': 'CData API Server', 'url': 'https://azuremarketplace.microsoft.com/en-us/ma