In [51]:
import joblib
import numpy as np
import pandas as pd
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans

# Load the persisted bundle: a mapping with 'kmeans' and 'preprocessor' entries.
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts
# that come from a trusted source.
loaded_model = joblib.load('models/kmeans_with_preprocessor.joblib')

# Access the KMeans model and preprocessor
kmeans_loaded = loaded_model['kmeans']
preprocessor = loaded_model['preprocessor']

df = pd.read_csv('csv/df_merged_cleaned.csv')


# Define features to include in clustering
features = ['rating', 'reviews', 'size', 'installs', 'price', 'average_sentiment_analysis', 'average_sentiment_subjectivity']

# Create a feature matrix (copy so later edits to X never touch df)
X = df[features].copy()

# Every clustering feature is numerical, so this list mirrors `features`
numerical_features = list(features)

# Preprocessing recipe for numerical features.
# NOTE(review): this pipeline is not used anywhere in this cell - the loaded
# `preprocessor` does the actual work. Kept in case other cells reference it.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with the median
    ('scaler', StandardScaler())  # Normalize features
])


# Apply the persisted preprocessing with transform(), not fit_transform():
# re-fitting would recompute the imputation/scaling statistics and mutate the
# loaded object, so the features could drift away from the space the saved
# KMeans model was trained in.
# Assumes the preprocessor was saved after fitting - TODO confirm.
X_processed = preprocessor.transform(X)



In [52]:
# Show the raw feature matrix selected from df (before preprocessing).
X

Unnamed: 0,rating,reviews,size,installs,price,average_sentiment_analysis,average_sentiment_subjectivity
0,4.10,159.00,19.00,10000.00,0.00,,
1,3.90,967.00,14.00,500000.00,0.00,0.15,0.64
2,4.70,87510.00,8.70,5000000.00,0.00,,
3,4.50,215644.00,25.00,50000000.00,0.00,,
4,4.30,967.00,2.80,100000.00,0.00,,
...,...,...,...,...,...,...,...
9634,4.50,38.00,53.00,5000.00,0.00,,
9635,5.00,4.00,3.60,100.00,0.00,,
9636,,3.00,9.50,1000.00,0.00,,
9637,4.50,114.00,,1000.00,0.00,,


In [53]:
# Show the array produced by the preprocessor for X.
X_processed

array([[-1.82063048e-01, -1.18067702e-01, -4.36020071e-02, ...,
        -6.36713577e-02, -4.14378590e-03,  1.74407762e-02],
       [-5.67587152e-01, -1.17626767e-01, -2.89993454e-01, ...,
        -6.36713577e-02, -9.74828532e-01,  5.83909959e+00],
       [ 9.74509263e-01, -7.03992754e-02, -5.51168387e-01, ...,
        -6.36713577e-02, -4.14378590e-03,  1.74407762e-02],
       ...,
       [ 2.03461056e-01, -1.18152833e-01, -5.11745756e-01, ...,
        -6.36713577e-02, -4.14378590e-03,  1.74407762e-02],
       [ 5.88985159e-01, -1.18092259e-01, -3.39271743e-01, ...,
        -6.36713577e-02, -4.14378590e-03,  1.74407762e-02],
       [ 5.88985159e-01,  9.92061558e-02, -4.36020071e-02, ...,
        -6.36713577e-02, -4.14378590e-03,  1.74407762e-02]])

In [54]:
from sklearn.metrics.pairwise import cosine_similarity


# Similar-app lookup: rank every app by cosine similarity to one app's preprocessed feature vector.
def find_similar_apps(app_name, df, X_processed, n_similar=5):
    """Find the apps whose feature vectors are most cosine-similar to ``app_name``.

    Parameters
    ----------
    app_name : str
        Value looked up in ``df['app']``. If several rows match, the first
        one is used as the query.
    df : pandas.DataFrame
        Must contain the columns 'app', 'rating', 'reviews', 'size',
        'installs' and 'price'. Row ``i`` of ``df`` (by position) must
        describe the same app as row ``i`` of ``X_processed``.
    X_processed : numpy.ndarray of shape (len(df), n_features)
        Feature matrix used for the similarity computation.
    n_similar : int, default 5
        Maximum number of similar apps to return.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame) or str
        ``(similar_apps, aggregation)`` where ``similar_apps`` holds the
        selected rows ordered from least to most similar, and
        ``aggregation`` holds the mean and std of their numeric columns.
        If ``app_name`` is not present, a message string is returned
        instead of a tuple, so callers should check before unpacking.

    Raises
    ------
    ValueError
        If ``n_similar`` is negative, or if ``df`` and ``X_processed`` do not
        have the same number of rows.
    """
    # Locate the app by *position*, not by index label: X_processed and
    # df.iloc are positional, and df may not carry a default RangeIndex.
    matches = np.flatnonzero((df['app'] == app_name).to_numpy())
    if matches.size == 0:
        return f"App '{app_name}' not found in the dataset."
    app_pos = int(matches[0])

    if n_similar < 0:
        raise ValueError(f"n_similar must be non-negative, got {n_similar}.")
    if X_processed.shape[0] != len(df):
        raise ValueError(
            f"df has {len(df)} rows but X_processed has {X_processed.shape[0]}; "
            "they must be row-aligned."
        )

    # Get the app's feature vector as a single-row 2-D array
    app_vector = X_processed[app_pos].reshape(1, -1)

    # Calculate cosine similarity between the app and every app (itself included)
    similarities = cosine_similarity(app_vector, X_processed).flatten()

    # Rank ascending, then drop the query app explicitly. Relying on it being
    # the last element is wrong when other apps tie with it (e.g. identical
    # or parallel feature vectors).
    ranked = similarities.argsort()
    ranked = ranked[ranked != app_pos]

    # Keep the n_similar highest-ranked rows (fewer if the dataset is small);
    # slicing from an explicit start also handles n_similar == 0 correctly.
    top = ranked[max(len(ranked) - n_similar, 0):]
    similar_apps = df.iloc[top]

    # Aggregate statistics over the selected apps
    aggregation = similar_apps[['rating', 'reviews', 'size', 'installs', 'price']].agg(['mean', 'std'])

    return similar_apps[['app', 'rating', 'reviews', 'size', 'installs', 'price']], aggregation

In [55]:
# Look up the apps most similar to 'facebook'.
lookup_result = find_similar_apps('facebook', df, X_processed)

# find_similar_apps returns a plain message string when the app is missing;
# surface that message instead of failing with an opaque tuple-unpacking error.
if isinstance(lookup_result, str):
    raise ValueError(lookup_result)

similar_apps, aggregation = lookup_result


In [56]:
# Format floats with two decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.2f}'.format)


In [57]:
# Show the rows returned by find_similar_apps for the queried app.
similar_apps

Unnamed: 0,app,rating,reviews,size,installs,price
2276,retrica,4.3,6120977.0,,100000000.0,0.0
299,messenger_–_text_and_video_chat_for_free,4.0,56642847.0,,1000000000.0,0.0
2003,instagram,4.5,66577313.0,,1000000000.0,0.0
3179,clean_master-_space_cleaner_&_antivirus,4.7,42916526.0,,500000000.0,0.0
300,whatsapp_messenger,4.4,69119316.0,,1000000000.0,0.0


In [58]:
# Show the mean / std summary of the similar apps' numeric columns.
aggregation

Unnamed: 0,rating,reviews,size,installs,price
mean,4.38,48275395.8,,720000000.0,0.0
std,0.26,25715539.72,,408656334.83,0.0
