In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer


Load the Dataset

In [16]:
# Read the combined apps table from disk, then preview its first five rows.
apps_data = pd.read_csv(
    filepath_or_buffer="../data/combined/combined_apps.csv",
)
apps_data.head(5)

Unnamed: 0,package_name,category,license,source_code,author,suggested_version,anti_features,anti_feature_score,added_date,last_updated_date,...,rating,reviews,size_mb,installs_clean,type,price_clean,content_rating,genres,current_ver,android_ver
0,info.puzz.a10000sentences,Science & Education,Apache-2.0,https://github.com/tkrajina/10000sentences,SCIO,0.3.4,NonFreeNet,1,2019-09-18T17:00:00,2019-09-18 17:00:00,...,,,,,,,,,,
1,com.github.ashutoshgngwr.tenbitclockwidget,"Theming, Time",GPL-3.0-only,https://github.com/ashutoshgngwr/10-bitClockWi...,,2.2-1,,0,2020-10-01T17:00:00,2023-10-08 11:25:01,...,,,,,,,,,,
2,dev.lonami.klooni,Games,GPL-3.0-or-later,https://github.com/LonamiWebs/Klooni1010,,0.8.6,,0,2020-05-08T17:00:00,2020-08-31 17:00:00,...,,,,,,,,,,
3,eu.quelltext.counting,"Games, Science & Education",AGPL-3.0-or-later,https://gitlab.com/niccokunzmann/12345,Nicco Kunzmann,1.3,,0,2022-03-01T16:00:00,2022-03-03 16:00:00,...,,,,,,,,,,
4,com.lucasdnd.bitclock16,Time,GPL-2.0-or-later,https://github.com/lucasdnd/16-bit-clock,,1.0,,0,2015-03-29T17:00:00,2015-03-29 17:00:00,...,,,,,,,,,,


Define Features + Target (Classification)

In [17]:
# Classification setup: the model inputs and the platform label to predict.
# (A price feature could optionally be added for better predictions.)
features = [
    "size_mb",
    "update_freq_days",
    "anti_feature_score",
    "category",
]
target = "platform"  # 0 = F-Droid, 1 = Google Play

X, y = apps_data[features], apps_data[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Optional diagnostic, left disabled. Uncomment to inspect missing data in two
# of the numeric features:
#   1st line - number of NaN values in each of the two columns;
#   2nd line - the rows in which both columns are populated.
# print(apps_data[["update_freq_days", "anti_feature_score"]].isna().sum())
# print(apps_data[["update_freq_days", "anti_feature_score"]].dropna())


Create Preprocessor

In [19]:
# Columns routed to each branch of the preprocessor.
numeric_features = ["size_mb", "update_freq_days", "anti_feature_score"]
categorical_features = ["category"]

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal "missing", then one-hot encode.
# handle_unknown="ignore" keeps transform from failing on categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column group and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)



Classification Pipeline (preprocessing only; no classifier is fitted in this cell)

In [20]:
# NOTE(review): this cell rebuilds a preprocessor with the same steps and
# settings as the one created in the previous cell, and does not attach a
# classifier to it. The names below are rebound to the new objects.

# Stand-alone imputers, kept as named objects so they can be referenced later.
numeric_imputer = SimpleImputer(strategy="mean")
categorical_imputer = SimpleImputer(strategy="constant", fill_value="missing")

# Numeric branch: mean imputation followed by standardization.
numeric_transformer = Pipeline(
    [
        ("imputer", numeric_imputer),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: constant-fill imputation followed by one-hot encoding.
categorical_transformer = Pipeline(
    [
        ("imputer", categorical_imputer),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


Regression Pipeline (for predicting ratings)

In [21]:
# Regression setup: predict an app's rating from the same feature columns
# used above.
target = "rating"

# Rows without a rating cannot serve as regression targets, so drop them.
# NOTE: this rebinds `apps_data`; every cell run after this one sees only the
# rated apps, and the X / y / train / test names from the classification
# split are overwritten below.
apps_data = apps_data.dropna(subset=["rating"])
X = apps_data[features]
y = apps_data[target]

# Hold out 20% of the rated rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and model live in one pipeline so the imputers, scaler and
# encoder are fitted on the training rows only.
reg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

reg_pipeline.fit(X_train, y_train)
y_pred = reg_pipeline.predict(X_test)

# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# where passing it raises a TypeError; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# R² at or below 0 means the model is no better than always predicting the
# mean rating of the test set.
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")


RMSE: 0.51
R²: -0.04


