In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the data files
# read later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Import Libraries and Data Set
import pandas as pd                                            
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
import sklearn.model_selection as ms     
from sklearn import tree                                   
from sklearn.metrics import classification_report, confusion_matrix 
from IPython.display import Image                 
import pydotplus                                            
import os  
import statsmodels.api as ms
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Play Store workbook from the mounted Drive folder into a DataFrame
file_path = '/content/drive/MyDrive/INFO-614 Group Project/google_playstore.xlsx'
df = pd.read_excel(io=file_path)

# Preview the first five rows as the cell output
df.head(5)


In [None]:
# Remove Missing Values
# Show the per-column count of missing cells first, so the effect of the drop is visible.
missing_per_column = df.isna().sum()
display(missing_per_column)

# Keep only rows that have a value in every column.
df = df.dropna(axis=0, how='any')

In [None]:
# (rows, columns) remaining after the rows with missing values were dropped
df.shape

In [None]:
# Sampling 25% of the Data Set
import random

# Seed Python's stdlib RNG as well; the pandas draw below is pinned separately
# through `random_state`.
random.seed(1)

# Fixed row count used for the sample (described above as 25% of the data set).
SAMPLE_ROWS = 222174
df = df.sample(n=SAMPLE_ROWS, random_state=1)

# Confirm the size of the sampled frame
df.shape

In [None]:
# Selecting Attributes
# Predictor columns followed by the target. 'rating' must stay last because the
# train/test split cell takes the final column as the label.
selected_columns = ['category', 'rating_count', 'installs', 'price', 'size',
                    'content_rating', 'ad', 'rating']

# .copy() makes `df` an independent frame, so the column assignments in the
# preprocessing cell modify this frame directly instead of writing into a
# slice of the sampled data (which triggers SettingWithCopyWarning).
df = df[selected_columns].copy()

In [None]:
# Data PreProcessing
# Replace the non-numeric columns with integer category codes. `assign` returns a
# new frame, so nothing is written into a slice of an earlier DataFrame.
encoded_columns = ['category', 'size', 'content_rating']
df = df.assign(**{col: df[col].astype('category').cat.codes for col in encoded_columns})

# Binarise the target in a single expression: 1 when rating > 4.5, otherwise 0.
# This avoids the two-step in-place overwrite, whose correctness depended on the
# order of the two statements.
# NOTE: this cell is meant to run once per kernel session; re-running it on the
# already-binarised column would map every row to 0.
df = df.assign(rating=(df['rating'] > 4.5).astype(int))

# Class balance of the binarised target
display(df['rating'].value_counts())

In [None]:
# Data Train and Test Split
# The alias `ms` is also bound to statsmodels.api in the imports cell, so it is
# re-bound to scikit-learn's model_selection module here before use.
import sklearn.model_selection as ms

# Every column except the last is a predictor; the last column is the label,
# cast to integer class ids.
X = df.iloc[:, :-1]
y = df.iloc[:, -1].astype('int')

# Hold out 30% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = ms.train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [None]:
# Best Model (Decision Tree)
# Depth-limited tree with a minimum split size; random_state fixes tie-breaking.
dt_clf = DecisionTreeClassifier(max_depth=8, min_samples_split=10, splitter='best', random_state=1)
dt_clf = dt_clf.fit(X_train, y_train)

# Predictions on the held-out split, consumed by the validation cell.
y_pred = dt_clf.predict(X_test)

# Names for tree visualisation. Take the feature names from the training matrix
# so the target column is not included (slicing df.columns[0:8] also picked up
# 'rating'), and take the class labels from the fitted model so their order
# matches the model's own class order.
feature_names = X_train.columns.tolist()
target_name = dt_clf.classes_.astype(str)

In [None]:
# Validation
import sklearn.metrics as mt

# Accuracy on the training split, printed first for comparison with the
# held-out scores below.
train_accuracy = dt_clf.score(X_train, y_train)
print('Train_Accuracy: ', train_accuracy, '\n')

# Metrics on the held-out test split.
accuracy = mt.accuracy_score(y_true=y_test, y_pred=y_pred)
recall = mt.recall_score(y_true=y_test, y_pred=y_pred)
precision = mt.precision_score(y_true=y_test, y_pred=y_pred)
f1_score = mt.f1_score(y_true=y_test, y_pred=y_pred)
matrix = mt.confusion_matrix(y_true=y_test, y_pred=y_pred)

# Report each score rounded to two decimals, one per paragraph.
for label, score in (
    ('Accuracy: ', accuracy),
    ('Recall: ', recall),
    ('Precision: ', precision),
    ('F1_score: ', f1_score),
):
    print(label, f'{score:.2f}', '\n')
print('Confusion Matrix:', '\n', matrix)

In [None]:
# Cross-Validation
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every row of the full data set (10 folds).
y_pred_cross = cross_val_predict(estimator=dt_clf, X=X, y=y, cv=10)

# Same metrics as the hold-out validation, now computed on the out-of-fold predictions.
accuracy = mt.accuracy_score(y_true=y, y_pred=y_pred_cross)
recall = mt.recall_score(y_true=y, y_pred=y_pred_cross)
precision = mt.precision_score(y_true=y, y_pred=y_pred_cross)
f1_score = mt.f1_score(y_true=y, y_pred=y_pred_cross)
matrix = mt.confusion_matrix(y_true=y, y_pred=y_pred_cross)

# Report each score rounded to two decimals, one per paragraph.
for label, score in (
    ('Accuracy: ', accuracy),
    ('Recall: ', recall),
    ('Precision: ', precision),
    ('F1_score: ', f1_score),
):
    print(label, f'{score:.2f}', '\n')
print('Confusion Matrix:', '\n', matrix)

# Applying the Model to the New Data

In [None]:
# Load the second workbook: the new apps the fitted model will be applied to
new_file_path = '/content/drive/MyDrive/INFO-614 Group Project/google_playstore_new.xlsx'
df_new = pd.read_excel(io=new_file_path)

# Preview the first five rows as the cell output
df_new.head(5)

In [None]:
# Remove Missing Values
# Report missing cells per column for the NEW data set (the original displayed
# the training frame `df` here, which says nothing about `df_new`).
display(df_new.isna().sum())

# Keep only rows that have a value in every column.
df_new = df_new.dropna()

In [None]:
# (rows, columns) of the new data after the rows with missing values were dropped
df_new.shape

In [None]:
# Selecting Attributes
# Same predictor columns, in the same order, as the training feature matrix
# (the training selection minus the 'rating' target).
feature_columns = ['category', 'rating_count', 'installs', 'price', 'size',
                   'content_rating', 'ad']

# .copy() detaches X_new from df_new so the encoding step can assign columns
# without writing into a slice of df_new (SettingWithCopyWarning).
X_new = df_new[feature_columns].copy()

In [None]:
# Data PreProcessing
# Replace the non-numeric columns with integer category codes. 'ad' is included
# because it was integer in the first data set but is not in the new one.
# `assign` builds a new frame (column order unchanged), so nothing is written
# into a slice of df_new.
#
# NOTE(review): these codes are derived from the values present in the NEW data
# only. If the set of distinct values differs from the training data, the same
# category can receive a different code than the one the tree was trained on.
# Consider reusing the training categories for the encoding - TODO confirm.
encoded_columns = ['category', 'size', 'content_rating', 'ad']
X_new = X_new.assign(**{col: X_new[col].astype('category').cat.codes for col in encoded_columns})



In [None]:
# Predict a class label for every row of the new feature matrix with the fitted
# tree, then print the resulting array.
Y_New = dt_clf.predict(X_new)
print(Y_New)

In [None]:
# Distinct predicted labels together with how many rows received each one
np.unique(Y_New, return_counts=True)

# Merging Our Prediction Output with Input Dataframe



In [None]:
# Y_New holds one prediction per row of X_new, in row order. Give the Series the
# same index as X_new so each prediction is attached to the row it was computed
# from. A bare pd.Series(Y_New) carries a 0..n-1 index, which no longer matches
# df_new's labels after dropna(), so assignment by label put predictions on the
# wrong rows and left NaN where labels had no match.
predicted = pd.Series(Y_New, index=X_new.index, name='Y_New')
df_new['Y_New'] = predicted

# Encoded features plus the predicted label, indexed like X_new.
df2 = X_new.join(predicted, how='left')

In [None]:
# Preview the feature columns alongside the predicted label
df2.head()

In [None]:
# (rows, columns) of the merged features-plus-prediction frame
df2.shape

In [None]:
# Number of rows per predicted label in the merged frame
display(df2['Y_New'].value_counts())

# Clustering the Predicted Popular Apps

In [None]:
# Subsetting our dataframe into only popular apps (predicted label 1)
is_popular = df2['Y_New'] == 1
df2 = df2.loc[is_popular]
df2

In [None]:
# Keep only the two attributes used for clustering
clustering_columns = ['rating_count', 'installs']
df2 = df2.loc[:, clustering_columns]
df2

In [None]:
# (rows, columns) of the frame that will be scaled and clustered
df2.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# The two attributes that go into the clustering
data = df2[['rating_count', 'installs']]

# Standardise both columns, then wrap the resulting array back into a DataFrame
# with the original column names. No index is passed, so df_scale gets a fresh
# 0..n-1 index rather than df2's row labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
df_scale = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
from sklearn.cluster import DBSCAN

# Cluster on the two scaled attributes only. Selecting them explicitly keeps the
# cell re-runnable: once df_scale has a 'cluster' column, fitting on the whole
# frame would feed the previous labels back in as a third feature.
cluster_features = ['rating_count', 'installs']
model = DBSCAN(eps=0.2, min_samples=20)

# fit_predict fits the model and returns the labels in one pass, so a separate
# fit() call beforehand is unnecessary.
df_scale['cluster'] = model.fit_predict(df_scale[cluster_features])

plt.figure(figsize = (8, 8))

# One scatter series per label; the range starts at -1 to include DBSCAN's noise label.
for i in range(-1, df_scale['cluster'].max() + 1):
    members = df_scale.loc[df_scale['cluster'] == i]
    plt.scatter(members['rating_count'], members['installs'],
                label = 'cluster ' + str(i))

plt.legend()
plt.title('eps = 0.2, min_samples = 20', size = 15)
plt.xlabel('rating_count', size = 12)
plt.ylabel('installs', size = 12)
plt.show()

In [None]:

# Compare four eps values (0.2, 0.4, 0.6, 0.8) on a 2x2 grid of scatter plots.
f, ax = plt.subplots(2, 2)
f.set_size_inches((12, 12))

# Always cluster on the two scaled attributes only. df_scale already carries a
# 'cluster' column from the previous run, and each iteration overwrites it;
# fitting on the whole frame would use those stale labels as an extra feature.
cluster_features = ['rating_count', 'installs']

for i in range(4):
    # change eps values
    eps = 0.2 * (i + 1)
    min_samples = 20

    # fit_predict fits and labels in one pass, so no separate fit() is needed.
    model = DBSCAN(eps=eps, min_samples=min_samples)
    df_scale['cluster'] = model.fit_predict(df_scale[cluster_features])

    # Panel for this eps value (row-major position in the 2x2 grid).
    panel = ax[i // 2, i % 2]

    # One scatter series per label; -1 is DBSCAN's noise label.
    for j in range(-1, df_scale['cluster'].max() + 1):
        members = df_scale.loc[df_scale['cluster'] == j]
        panel.scatter(members['rating_count'], members['installs'],
                      label = 'cluster ' + str(j))

    panel.legend()
    panel.set_title('eps = %.1f, min_samples = %d'%(eps, min_samples), size = 15)
    panel.set_xlabel('rating_count', size = 12)
    panel.set_ylabel('installs', size = 12)
plt.show()

In [None]:
# Validation using Silhouette Coefficient
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn import metrics

# Score the clustering on the two scaled attributes only. df_scale also holds the
# 'cluster' label column written by the earlier cells; including it would make
# the labels themselves part of the distance computation and distort the score.
cluster_features = df_scale[['rating_count', 'installs']]

model = DBSCAN(eps=0.5, min_samples=2)
# fit_predict fits and labels in one pass, so no separate fit() is needed.
cluster_labels = model.fit_predict(cluster_features)

# Per-row silhouette values, kept for inspection.
# NOTE(review): rows labelled -1 (noise) are scored as if they formed one
# cluster; consider excluding them before scoring - TODO confirm intent.
sample_silhouette_values = silhouette_samples(cluster_features, cluster_labels)

# Mean silhouette over all rows, shown as the cell output
metrics.silhouette_score(cluster_features, cluster_labels)