In [22]:
import pandas as pd

# Load the app dataset that the later cells in this notebook read from `df`.
df = pd.read_csv(
    filepath_or_buffer='csv/df_merged_cleaned.csv',
)


In [23]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans


# ---------------------------------------------------------------------------
# Cluster the apps with K-Means on their numeric attributes, then project the
# preprocessed features to 2-D with PCA for an interactive scatter plot.
# ---------------------------------------------------------------------------

# Define features to include in clustering
features = ['rating', 'reviews', 'size', 'installs', 'price', 'average_sentiment_analysis', 'average_sentiment_subjectivity']

# Every clustering feature is numeric and goes through the same pipeline, so
# derive this list from `features` instead of repeating the literal (two
# hand-maintained copies can silently drift apart).
numerical_features = list(features)

# Create a feature matrix (a copy, so preprocessing works on its own frame)
X = df[features].copy()

# Define preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with the median
    ('scaler', StandardScaler())  # Normalize features
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Choose the number of clusters (e.g., 20 clusters)
n_clusters = 20
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_processed)
labels = kmeans.labels_

# Add cluster labels to the DataFrame
df['cluster'] = labels

# Perform PCA for 2D visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_processed)

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
pca_df['cluster'] = labels
# Cluster ids are categories, not magnitudes. Plotly Express colours numeric
# columns on a continuous scale, so plot from a string copy of the ids to get
# one discrete colour and one legend entry per cluster.
pca_df['cluster_label'] = labels.astype(str)
# Assign by position: `pca_df` has a fresh 0..n-1 index, and a Series
# assignment would align on index labels instead of row order.
pca_df['app'] = df['app'].to_numpy()

# Keep the legend in numeric cluster order rather than first-appearance order.
cluster_order = [str(cluster_id) for cluster_id in range(n_clusters)]

# Create an interactive scatter plot
fig = px.scatter(pca_df, x='PC1', y='PC2', color='cluster_label',
                 hover_name='app',
                 title='Cluster Visualization with App Names',
                 labels={'PC1': 'Principal Component 1',
                         'PC2': 'Principal Component 2',
                         'cluster_label': 'Cluster'},
                 category_orders={'cluster_label': cluster_order},
                 # Alphabet provides 26 distinct colours, enough for 20 clusters
                 # without cycling.
                 color_discrete_sequence=px.colors.qualitative.Alphabet)

fig.update_layout(legend_title='Cluster')
fig.show()

# Optionally save the plot to an HTML file
fig.write_html('interactive_clusters.html')


In [25]:
# Persist the fitted KMeans model together with its preprocessor so both can
# be reloaded from a single file.
#
# `joblib` is imported here because no earlier cell in this notebook imports
# it; without this the cell raises NameError on a fresh kernel.
import os

import joblib

# Combine the KMeans model and the preprocessor into a dictionary
model_dict = {
    'kmeans': kmeans,
    'preprocessor': preprocessor
}

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs('models', exist_ok=True)

# Save the combined dictionary as a single file
joblib.dump(model_dict, 'models/kmeans_with_preprocessor.joblib')



['models/kmeans_with_preprocessor.joblib']

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Render floats with two decimal places in every DataFrame/Series shown below.
pd.options.display.float_format = '{:.2f}'.format

def find_similar_apps(app_name, df, X_processed, n_similar=5):
    """Find the apps whose feature vectors are most similar to ``app_name``.

    Similarity is the cosine similarity between the query app's row of
    ``X_processed`` and every row of ``X_processed``.

    Parameters
    ----------
    app_name : str
        Value looked up in ``df['app']``. If it occurs more than once, the
        first matching row is used as the query.
    df : pandas.DataFrame
        Must contain the columns ``'app'``, ``'rating'``, ``'reviews'``,
        ``'size'``, ``'installs'`` and ``'price'``. Row *i* of ``df`` must
        describe the same app as row *i* of ``X_processed``; rows are matched
        by position, so the index labels of ``df`` do not matter.
    X_processed : 2-D array
        Feature matrix with one row per row of ``df``.
    n_similar : int, default 5
        Maximum number of similar apps to return. Fewer are returned when the
        dataset has fewer other rows.

    Returns
    -------
    str
        ``"App '<app_name>' not found in the dataset."`` when ``app_name`` is
        not present in ``df['app']``.
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Otherwise ``(similar_apps, aggregation)``: the selected rows of ``df``
        (columns app, rating, reviews, size, installs, price), ordered by
        ascending similarity so the most similar app is the last row, and the
        ``mean``/``std`` of their numeric columns.

    Raises
    ------
    ValueError
        If ``n_similar`` is negative.
    """
    if n_similar < 0:
        raise ValueError(f"n_similar must be non-negative, got {n_similar}")

    # Check if the app_name exists
    is_query = (df['app'] == app_name).to_numpy()
    if not is_query.any():
        return f"App '{app_name}' not found in the dataset."

    # Row *position* of the first match. The index *label* must not be used
    # here: X_processed is addressed by position and the result rows are read
    # back with .iloc, so labels only work when the index happens to be 0..n-1.
    app_pos = int(is_query.argmax())

    # Get the app's feature vector
    app_vector = X_processed[app_pos].reshape(1, -1)

    # Calculate cosine similarity between the app and all other apps
    similarities = cosine_similarity(app_vector, X_processed).flatten()

    # Rank all rows by ascending similarity, then drop the query row explicitly.
    # Relying on it sorting last is unsafe: a duplicate or tied row can take
    # that slot, which would return the query itself as a "similar" app.
    ranked = similarities.argsort()
    ranked = ranked[ranked != app_pos]

    # Keep the `n_similar` highest-ranked rows. Slicing from an explicit start
    # avoids the `[-0:]` pitfall, which would select everything.
    keep = min(n_similar, len(ranked))
    similar_indices = ranked[len(ranked) - keep:]

    similar_apps = df.iloc[similar_indices]

    # Aggregate statistics
    aggregation = similar_apps[['rating', 'reviews', 'size', 'installs', 'price']].agg(['mean', 'std'])

    return similar_apps[['app', 'rating', 'reviews', 'size', 'installs', 'price']], aggregation

# Example usage
app_name = 'facebook'
result = find_similar_apps(app_name, df, X_processed)

if isinstance(result, str):
    # find_similar_apps returns a plain message (not a tuple) when the app is
    # missing; unpacking that string would fail with a confusing ValueError.
    print(result)
else:
    similar_apps, aggregation = result

    print("Similar Apps:")
    print(similar_apps)

    print("\nAggregated Statistics:")
    print(aggregation)


Similar Apps:
                                           app  rating     reviews  size  \
2276                                   retrica    4.30  6120977.00   NaN   
299   messenger_–_text_and_video_chat_for_free    4.00 56642847.00   NaN   
2003                                 instagram    4.50 66577313.00   NaN   
3179   clean_master-_space_cleaner_&_antivirus    4.70 42916526.00   NaN   
300                         whatsapp_messenger    4.40 69119316.00   NaN   

          installs  price  
2276  100000000.00   0.00  
299  1000000000.00   0.00  
2003 1000000000.00   0.00  
3179  500000000.00   0.00  
300  1000000000.00   0.00  

Aggregated Statistics:
      rating     reviews  size     installs  price
mean    4.38 48275395.80   NaN 720000000.00   0.00
std     0.26 25715539.72   NaN 408656334.83   0.00


In [20]:
# Show the similar-apps table from the example above as this cell's output.
similar_apps

Unnamed: 0,app,rating,reviews,size,installs,price
2276,retrica,4.3,6120977.0,,100000000.0,0.0
299,messenger_–_text_and_video_chat_for_free,4.0,56642847.0,,1000000000.0,0.0
2003,instagram,4.5,66577313.0,,1000000000.0,0.0
3179,clean_master-_space_cleaner_&_antivirus,4.7,42916526.0,,500000000.0,0.0
300,whatsapp_messenger,4.4,69119316.0,,1000000000.0,0.0


In [21]:
# Show the mean/std summary of the similar apps as this cell's output.
aggregation

Unnamed: 0,rating,reviews,size,installs,price
mean,4.38,48275395.8,,720000000.0,0.0
std,0.26,25715539.72,,408656334.83,0.0
