In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train and evaluate a random-forest classifier that predicts whether an app
# is "successful" (more than 100,000 installs) from its store metadata.

# One seed shared by the train/test split and the classifier, so that
# re-running the cell reproduces the same model and the same report.
RANDOM_STATE = 42

# Load your data
data = pd.read_csv('csv/df_merged_cleaned.csv')

# Define success based on downloads: 1 when installs exceed 100,000, else 0.
# NOTE(review): a missing `installs` value would compare as False and be
# labelled 0 -- confirm the column has no NaNs in the cleaned file.
data['success'] = (data['installs'] > 100000).astype(int)

# Define features and target variable
features = ['category', 'size', 'type', 'price', 'content rating', 'genres', 'current ver', 'android ver', 'sentiment']
target = 'success'

# Prepare data
X = data[features]
y = data[target]

# Identify missing values (printed per feature column before any imputation)
print(X.isna().sum())

# Handle missing values
# Numeric features are imputed with the median; categorical features with the
# most frequent value. Both happen inside the pipeline, so the imputation
# statistics are learned from the training split only.
numeric_features = ['size', 'price']
categorical_features = ['category', 'type', 'content rating', 'genres', 'current ver', 'android ver', 'sentiment']

# Create a column transformer with imputers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),  # Impute missing values for numeric features
            ('scaler', StandardScaler())  # Standardize numeric features
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values for categorical features
            # handle_unknown='ignore' keeps predict() from failing on categories
            # that appear only in the test split.
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical features
        ]), categorical_features)
    ])

# Create a pipeline
# The classifier is seeded: without random_state the forest is rebuilt
# differently on every run and the reported metrics drift between runs.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Split data (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


category             1
size              1541
type                 2
price                1
content rating       1
genres               1
current ver          8
android ver          3
sentiment         8823
dtype: int64
              precision    recall  f1-score   support

           0       0.78      0.85      0.81      1682
           1       0.76      0.66      0.71      1210

    accuracy                           0.77      2892
   macro avg       0.77      0.76      0.76      2892
weighted avg       0.77      0.77      0.77      2892



## Interpreting Results
* Class 0 (Not Successful):
The model performs better in terms of recall (85%), meaning it correctly identifies a higher proportion of the actual non-successful apps. However, precision is slightly lower at 78%, indicating that some apps predicted as non-successful were in fact successful.
* Class 1 (Successful):
The model has lower recall (66%) for successful apps, meaning it misses about a third of the apps that are actually successful. Precision is also slightly lower at 76%, meaning some apps predicted as successful are in fact not (false positives).
* Accuracy:
An accuracy of 77% is decent, but it might not reflect the model’s performance well if there is a class imbalance (i.e., if one class is much more frequent than the other).
* Balanced Performance:
The F1-score for class 0 (0.81) is higher than for class 1 (0.71), indicating that the model performs better in identifying non-successful apps. Improving recall for class 1 could be beneficial.