In [3]:
import pandas as pd

In [4]:
# Load the two raw tables: store listings and per-review sentiment records.
# Both paths are relative to the notebook's working directory.
store_df, reviews_df = (
    pd.read_csv(csv_name)
    for csv_name in ('googleplaystore.csv', 'googleplaystore_user_reviews.csv')
)

In [5]:
# Preview the first five store listings.
store_df.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [6]:
# Count missing values per column of the store table.
store_df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [7]:
# Preview the first five user-review rows.
reviews_df.head(n=5)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [8]:
# Count missing values per column of the reviews table.
reviews_df.isna().sum()

App                           0
Translated_Review         26868
Sentiment                 26863
Sentiment_Polarity        26863
Sentiment_Subjectivity    26863
dtype: int64

In [9]:
# Keep a single row per app name (the first occurrence wins).
store_df_cleaned = store_df.drop_duplicates(subset=['App'], keep='first')

In [10]:
# Discard reviews that are missing any of the three sentiment fields.
sentiment_cols = ['Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity']
reviews_df_cleaned = reviews_df.dropna(subset=sentiment_cols)

In [11]:
# Collapse the reviews to one row per app holding the mean polarity and
# mean subjectivity; 'App' is restored as an ordinary column afterwards.
per_app_sentiment = reviews_df_cleaned.groupby('App')[
    ['Sentiment_Polarity', 'Sentiment_Subjectivity']
]
aggregated_reviews = per_app_sentiment.mean().reset_index()


In [12]:
# Left join: every de-duplicated store row is kept; apps without any
# aggregated reviews get NaN in the two sentiment columns.
df = store_df_cleaned.merge(aggregated_reviews, how='left', on='App')

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [14]:
# Remove rows lacking the target (Category) or any of the core numeric inputs.
required_cols = ['Category', 'Rating', 'Reviews', 'Size', 'Installs']
df = df.dropna(subset=required_cols)

In [15]:
# Clean the Price column properly
df['Price'] = df['Price'].replace('Free', '0')
df['Price'] = df['Price'].str.replace('$', '', regex=True)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

In [16]:
# Coerce Reviews to numeric; values that cannot be parsed become NaN.
df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')

# Installs: strip every '+' and ',' character, then parse what remains
# (unparseable values become NaN).
installs_digits = df['Installs'].str.replace('[+,]', '', regex=True)
df['Installs'] = pd.to_numeric(installs_digits, errors='coerce')

In [17]:
def convert_size(size):
    """Convert a size label such as '19M' or '201k' to a number of bytes.

    Parameters
    ----------
    size : object
        Size label. A trailing 'M' scales the number by 1e6 and a trailing
        'k' scales it by 1e3.

    Returns
    -------
    float or None
        The size as a float, or None when the value is not a string, has no
        recognised unit suffix, or its numeric part cannot be parsed.
    """
    # Missing values (None / NaN) are not strings; treat them as "no size"
    # instead of failing on the suffix check.
    if not isinstance(size, str):
        return None

    label = size.strip()
    for suffix, factor in (('M', 1e6), ('k', 1e3)):
        if label.endswith(suffix):
            try:
                return float(label[:-1]) * factor
            except ValueError:
                # A unit suffix without a valid number in front of it.
                return None
    return None

In [18]:
# 'Varies with device' carries no numeric size, so blank it out before parsing.
# Series.mask is used instead of replace(..., None): older pandas releases
# treat an explicit value=None as "value not given" and forward-fill from the
# previous row, which would silently assign a neighbouring app's size.
size_text = df['Size'].mask(df['Size'] == 'Varies with device')
# Only non-missing labels are parsed; on assignment the result is aligned on
# the index, so the masked rows end up as NaN.
df['Size'] = size_text.dropna().map(convert_size)

In [19]:
# Apps with no aggregated reviews have NaN sentiment after the left merge;
# substitute 0 for both sentiment columns.
for sentiment_col in ('Sentiment_Polarity', 'Sentiment_Subjectivity'):
    df[sentiment_col] = df[sentiment_col].fillna(0)

In [20]:
# Encode the Type labels as integer codes; the fitted encoder is kept in
# le_type so the codes can be mapped back to labels later.
le_type = LabelEncoder().fit(df['Type'])
df['Type'] = le_type.transform(df['Type'])

In [21]:
# Encode the target (Category) as integer class ids; le_category retains the
# label-to-id mapping for decoding predictions.
le_category = LabelEncoder()
category_codes = le_category.fit_transform(df['Category'])
df['Category'] = category_codes

In [22]:
# Model inputs: the numeric store attributes plus the two aggregated
# sentiment scores.
features = [
    'Rating',
    'Reviews',
    'Size',
    'Installs',
    'Price',
    'Type',
    'Sentiment_Polarity',
    'Sentiment_Subjectivity',
]
# Target: the integer-encoded app category.
y = df['Category']
X = df[features]

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Random forest of 100 trees with a fixed seed, fitted on the training split.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X_train, y_train)

In [25]:
# Predict the encoded category for every held-out row.
y_pred = clf.predict(X=X_test)

In [26]:
# Fraction of held-out rows whose predicted category equals the true one.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)


Accuracy: 0.1926829268292683
